regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102 /* Valid for non-utf8 strings: avoids the reginclass
 103  * call if there are no complications: i.e., if everything matchable is
 104  * straight forward in the bitmap */
 105 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 106                                               : ANYOF_BITMAP_TEST(p,*(c)))
 107
 108 /*
 109  * Forwards.
 110  */
 111
 112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 114
 115 #define HOPc(pos,off) \
 116         (char *)(reginfo->is_utf8_target \
 117             ? reghop3((U8*)pos, off, \
 118                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 119             : (U8*)(pos + off))
 120 #define HOPBACKc(pos, off) \
 121         (char*)(reginfo->is_utf8_target \
 122             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 123             : (pos - off >= reginfo->strbeg)    \
 124                 ? (U8*)pos - off                \
 125                 : NULL)
 126
 127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 129
 130
 131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 132 #define NEXTCHR_IS_EOS (nextchr < 0)
 133
 134 #define SET_nextchr \
 135     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 136
 137 #define SET_locinput(p) \
 138     locinput = (p);  \
 139     SET_nextchr
 140
 141
 142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 143         if (!swash_ptr) {                                                     \
 144             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 145             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 146                                          1, 0, NULL, &flags);                 \
 147             assert(swash_ptr);                                                \
 148         }                                                                     \
 149     } STMT_END
 150
 151 /* If in debug mode, we test that a known character properly matches */
 152 #ifdef DEBUGGING
 153 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 154                                           property_name,                      \
 155                                           utf8_char_in_property)              \
 156         LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 157         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 158 #else
 159 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 160                                           property_name,                      \
 161                                           utf8_char_in_property)              \
 162         LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 163 #endif
 164
 165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 166                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 167                                         swash_property_names[_CC_WORDCHAR],   \
 168                                         GREEK_SMALL_LETTER_IOTA_UTF8)
 169
 170 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 171     STMT_START {                                                              \
 172         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 173                                        "_X_regular_begin",                    \
 174                                        GREEK_SMALL_LETTER_IOTA_UTF8);         \
 175         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 176                                        "_X_extend",                           \
 177                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 178     } STMT_END
 179
 180 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 182
 183 /* for use after a quantifier and before an EXACT-like node -- japhy */
 184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 185  *
 186  * NOTE that *nothing* that affects backtracking should be in here, specifically
 187  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 188  * node that is in between two EXACT like nodes when ascertaining what the required
 189  * "follow" character is. This should probably be moved to regex compile time
 190  * although it may be done at run time beause of the REF possibility - more
 191  * investigation required. -- demerphq
 192 */
 193 #define JUMPABLE(rn) (      \
 194     OP(rn) == OPEN ||       \
 195     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 196     OP(rn) == EVAL ||   \
 197     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 198     OP(rn) == PLUS || OP(rn) == MINMOD || \
 199     OP(rn) == KEEPS || \
 200     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 201 )
 202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 203
 204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 205
 206 #if 0
 207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 208    we don't need this definition. */
 209 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 210 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 212
 213 #else
 214 /* ... so we use this as its faster. */
 215 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 216 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 217 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 219
 220 #endif
 221
 222 /*
 223   Search for mandatory following text node; for lookahead, the text must
 224   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 225 */
 226 #define FIND_NEXT_IMPT(rn) STMT_START { \
 227     while (JUMPABLE(rn)) { \
 228         const OPCODE type = OP(rn); \
 229         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 230             rn = NEXTOPER(NEXTOPER(rn)); \
 231         else if (type == PLUS) \
 232             rn = NEXTOPER(rn); \
 233         else if (type == IFMATCH) \
 234             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 235         else rn += NEXT_OFF(rn); \
 236     } \
 237 } STMT_END
 238
 239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 240  * These are for the pre-composed Hangul syllables, which are all in a
 241  * contiguous block and arranged there in such a way so as to facilitate
 242  * alorithmic determination of their characteristics.  As such, they don't need
 243  * a swash, but can be determined by simple arithmetic.  Almost all are
 244  * GCB=LVT, but every 28th one is a GCB=LV */
 245 #define SBASE 0xAC00    /* Start of block */
 246 #define SCount 11172    /* Length of block */
 247 #define TCount 28
 248
 249 #define SLAB_FIRST(s) (&(s)->states[0])
 250 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 251
 252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 254 static void S_clear_backtrack_stack(pTHX_ void *p);
 255 static regmatch_state * S_push_slab(pTHX);
 256
 257 #define REGCP_PAREN_ELEMS 3
 258 #define REGCP_OTHER_ELEMS 3
 259 #define REGCP_FRAME_ELEMS 1
 260 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 261  * are needed for the regexp context stack bookkeeping. */
 262
 263 STATIC CHECKPOINT
 264 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 265 {
 266     dVAR;
 267     const int retval = PL_savestack_ix;
 268     const int paren_elems_to_push =
 269                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 270     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 271     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 272     I32 p;
 273     GET_RE_DEBUG_FLAGS_DECL;
 274
 275     PERL_ARGS_ASSERT_REGCPPUSH;
 276
 277     if (paren_elems_to_push < 0)
 278         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 279                    paren_elems_to_push);
 280
 281     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 282         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 283                    " out of range (%lu-%ld)",
 284                    total_elems,
 285                    (unsigned long)maxopenparen,
 286                    (long)parenfloor);
 287
 288     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 289
 290     DEBUG_BUFFERS_r(
 291         if ((int)maxopenparen > (int)parenfloor)
 292             PerlIO_printf(Perl_debug_log,
 293                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 294                 PTR2UV(rex),
 295                 PTR2UV(rex->offs)
 296             );
 297     );
 298     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 299 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 300         SSPUSHINT(rex->offs[p].end);
 301         SSPUSHINT(rex->offs[p].start);
 302         SSPUSHINT(rex->offs[p].start_tmp);
 303         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 304             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 305             (UV)p,
 306             (IV)rex->offs[p].start,
 307             (IV)rex->offs[p].start_tmp,
 308             (IV)rex->offs[p].end
 309         ));
 310     }
 311 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 312     SSPUSHINT(maxopenparen);
 313     SSPUSHINT(rex->lastparen);
 314     SSPUSHINT(rex->lastcloseparen);
 315     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 316
 317     return retval;
 318 }
 319
 320 /* These are needed since we do not localize EVAL nodes: */
 321 #define REGCP_SET(cp)                                           \
 322     DEBUG_STATE_r(                                              \
 323             PerlIO_printf(Perl_debug_log,                       \
 324                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 325                 (IV)PL_savestack_ix));                          \
 326     cp = PL_savestack_ix
 327
 328 #define REGCP_UNWIND(cp)                                        \
 329     DEBUG_STATE_r(                                              \
 330         if (cp != PL_savestack_ix)                              \
 331             PerlIO_printf(Perl_debug_log,                       \
 332                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 333                 (IV)(cp), (IV)PL_savestack_ix));                \
 334     regcpblow(cp)
 335
 336 #define UNWIND_PAREN(lp, lcp)               \
 337     for (n = rex->lastparen; n > lp; n--)   \
 338         rex->offs[n].end = -1;              \
 339     rex->lastparen = n;                     \
 340     rex->lastcloseparen = lcp;
 341
 342
 343 STATIC void
 344 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 345 {
 346     dVAR;
 347     UV i;
 348     U32 paren;
 349     GET_RE_DEBUG_FLAGS_DECL;
 350
 351     PERL_ARGS_ASSERT_REGCPPOP;
 352
 353     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 354     i = SSPOPUV;
 355     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 356     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 357     rex->lastcloseparen = SSPOPINT;
 358     rex->lastparen = SSPOPINT;
 359     *maxopenparen_p = SSPOPINT;
 360
 361     i -= REGCP_OTHER_ELEMS;
 362     /* Now restore the parentheses context. */
 363     DEBUG_BUFFERS_r(
 364         if (i || rex->lastparen + 1 <= rex->nparens)
 365             PerlIO_printf(Perl_debug_log,
 366                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 367                 PTR2UV(rex),
 368                 PTR2UV(rex->offs)
 369             );
 370     );
 371     paren = *maxopenparen_p;
 372     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 373         I32 tmps;
 374         rex->offs[paren].start_tmp = SSPOPINT;
 375         rex->offs[paren].start = SSPOPINT;
 376         tmps = SSPOPINT;
 377         if (paren <= rex->lastparen)
 378             rex->offs[paren].end = tmps;
 379         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 380             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 381             (UV)paren,
 382             (IV)rex->offs[paren].start,
 383             (IV)rex->offs[paren].start_tmp,
 384             (IV)rex->offs[paren].end,
 385             (paren > rex->lastparen ? "(skipped)" : ""));
 386         );
 387         paren--;
 388     }
 389 #if 1
 390     /* It would seem that the similar code in regtry()
 391      * already takes care of this, and in fact it is in
 392      * a better location to since this code can #if 0-ed out
 393      * but the code in regtry() is needed or otherwise tests
 394      * requiring null fields (pat.t#187 and split.t#{13,14}
 395      * (as of patchlevel 7877)  will fail.  Then again,
 396      * this code seems to be necessary or otherwise
 397      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 398      * --jhi updated by dapm */
 399     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 400         if (i > *maxopenparen_p)
 401             rex->offs[i].start = -1;
 402         rex->offs[i].end = -1;
 403         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 404             "    \\%"UVuf": %s   ..-1 undeffing\n",
 405             (UV)i,
 406             (i > *maxopenparen_p) ? "-1" : "  "
 407         ));
 408     }
 409 #endif
 410 }
 411
 412 /* restore the parens and associated vars at savestack position ix,
 413  * but without popping the stack */
 414
 415 STATIC void
 416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 417 {
 418     I32 tmpix = PL_savestack_ix;
 419     PL_savestack_ix = ix;
 420     regcppop(rex, maxopenparen_p);
 421     PL_savestack_ix = tmpix;
 422 }
 423
 424 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 425
 426 STATIC bool
 427 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 428 {
 429     /* Returns a boolean as to whether or not 'character' is a member of the
 430      * Posix character class given by 'classnum' that should be equivalent to a
 431      * value in the typedef '_char_class_number'.
 432      *
 433      * Ideally this could be replaced by a just an array of function pointers
 434      * to the C library functions that implement the macros this calls.
 435      * However, to compile, the precise function signatures are required, and
 436      * these may vary from platform to to platform.  To avoid having to figure
 437      * out what those all are on each platform, I (khw) am using this method,
 438      * which adds an extra layer of function call overhead (unless the C
 439      * optimizer strips it away).  But we don't particularly care about
 440      * performance with locales anyway. */
 441
 442     switch ((_char_class_number) classnum) {
 443         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 444         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 445         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 446         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 447         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 448                                         || isUPPER_LC(character);
 449         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 450         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 451         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 452         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 453         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 454         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 455         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 456         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 457         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 458         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 459         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 460         default:    /* VERTSPACE should never occur in locales */
 461             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 462     }
 463
 464     assert(0); /* NOTREACHED */
 465     return FALSE;
 466 }
 467
 468 STATIC bool
 469 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 470 {
 471     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 472      * 'character' is a member of the Posix character class given by 'classnum'
 473      * that should be equivalent to a value in the typedef
 474      * '_char_class_number'.
 475      *
 476      * This just calls isFOO_lc on the code point for the character if it is in
 477      * the range 0-255.  Outside that range, all characters avoid Unicode
 478      * rules, ignoring any locale.  So use the Unicode function if this class
 479      * requires a swash, and use the Unicode macro otherwise. */
 480
 481     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 482
 483     if (UTF8_IS_INVARIANT(*character)) {
 484         return isFOO_lc(classnum, *character);
 485     }
 486     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 487         return isFOO_lc(classnum,
 488                         TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
 489     }
 490
 491     if (classnum < _FIRST_NON_SWASH_CC) {
 492
 493         /* Initialize the swash unless done already */
 494         if (! PL_utf8_swash_ptrs[classnum]) {
 495             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 496             PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 497                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 498         }
 499
 500         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 501                                  character,
 502                                  TRUE /* is UTF */ ));
 503     }
 504
 505     switch ((_char_class_number) classnum) {
 506         case _CC_ENUM_SPACE:
 507         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 508
 509         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 510         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 511         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 512         default:                 return 0;  /* Things like CNTRL are always
 513                                                below 256 */
 514     }
 515
 516     assert(0); /* NOTREACHED */
 517     return FALSE;
 518 }
 519
 520 /*
 521  * pregexec and friends
 522  */
 523
 524 #ifndef PERL_IN_XSUB_RE
 525 /*
 526  - pregexec - match a regexp against a string
 527  */
 528 I32
 529 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 530          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 531 /* stringarg: the point in the string at which to begin matching */
 532 /* strend:    pointer to null at end of string */
 533 /* strbeg:    real beginning of string */
 534 /* minend:    end of match must be >= minend bytes after stringarg. */
 535 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 536  *            itself is accessed via the pointers above */
 537 /* nosave:    For optimizations. */
 538 {
 539     PERL_ARGS_ASSERT_PREGEXEC;
 540
 541     return
 542         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 543                       nosave ? 0 : REXEC_COPY_STR);
 544 }
 545 #endif
 546
 547 /*
 548  * Need to implement the following flags for reg_anch:
 549  *
 550  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 551  * USE_INTUIT_ML
 552  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 553  * INTUIT_AUTORITATIVE_ML
 554  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 555  * INTUIT_ONCE_ML
 556  *
 557  * Another flag for this function: SECOND_TIME (so that float substrs
 558  * with giant delta may be not rechecked).
 559  */
 560
 561 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 562
 563 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 564    Otherwise, only SvCUR(sv) is used to get strbeg. */
 565
 566 /* XXXX We assume that strpos is strbeg unless sv. */
 567
 568 /* XXXX Some places assume that there is a fixed substring.
 569         An update may be needed if optimizer marks as "INTUITable"
 570         RExen without fixed substrings.  Similarly, it is assumed that
 571         lengths of all the strings are no more than minlen, thus they
 572         cannot come from lookahead.
 573         (Or minlen should take into account lookahead.)
 574   NOTE: Some of this comment is not correct. minlen does now take account
 575   of lookahead/behind. Further research is required. -- demerphq
 576
 577 */
 578
 579 /* A failure to find a constant substring means that there is no need to make
 580    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 581    finding a substring too deep into the string means that fewer calls to
 582    regtry() should be needed.
 583
 584    REx compiler's optimizer found 4 possible hints:
 585         a) Anchored substring;
 586         b) Fixed substring;
 587         c) Whether we are anchored (beginning-of-line or \G);
 588         d) First node (of those at offset 0) which may distinguish positions;
 589    We use a)b)d) and multiline-part of c), and try to find a position in the
 590    string which does not contradict any of them.
 591  */
 592
 593 /* Most of decisions we do here should have been done at compile time.
 594    The nodes of the REx which we used for the search should have been
 595    deleted from the finite automaton. */
 596
 597 /* args:
 598  * rx:     the regex to match against
 599  * sv:     the SV being matched: only used for utf8 flag; the string
 600  *         itself is accessed via the pointers below. Note that on
 601  *         something like an overloaded SV, SvPOK(sv) may be false
 602  *         and the string pointers may point to something unrelated to
 603  *         the SV itself.
 604  * strbeg: real beginning of string
 605  * strpos: the point in the string at which to begin matching
 606  * strend: pointer to the byte following the last char of the string
 607  * flags   currently unused; set to 0
 608  * data:   currently unused; set to NULL
 609  */
 610
 611 char *
 612 Perl_re_intuit_start(pTHX_
 613                     REGEXP * const rx,
 614                     SV *sv,
 615                     const char * const strbeg,
 616                     char *strpos,
 617                     char *strend,
 618                     const U32 flags,
 619                     re_scream_pos_data *data)
 620 {
 621     dVAR;
 622     struct regexp *const prog = ReANY(rx);
 623     I32 start_shift = 0;
 624     /* Should be nonnegative! */
 625     I32 end_shift   = 0;
 626     char *s;
 627     SV *check;
 628     char *t;
 629     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 630     I32 ml_anch;
 631     char *other_last = NULL;    /* other substr checked before this */
 632     char *check_at = NULL;              /* check substr found at this pos */
 633     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 634     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 635     RXi_GET_DECL(prog,progi);
 636     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 637     regmatch_info *const reginfo = &reginfo_buf;
 638 #ifdef DEBUGGING
 639     const char * const i_strpos = strpos;
 640 #endif
 641     GET_RE_DEBUG_FLAGS_DECL;
 642
 643     PERL_ARGS_ASSERT_RE_INTUIT_START;
 644     PERL_UNUSED_ARG(flags);
 645     PERL_UNUSED_ARG(data);
 646
 647     reginfo->is_utf8_target = cBOOL(utf8_target);
 648
 649     /* CHR_DIST() would be more correct here but it makes things slow. */
 650     if (prog->minlen > strend - strpos) {
 651         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 652                               "String too short... [re_intuit_start]\n"));
 653         goto fail;
 654     }
 655
 656     reginfo->info_aux = NULL;
 657     reginfo->strbeg = strbeg;
 658     reginfo->strend = strend;
 659     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 660     reginfo->intuit = 1;
 661     /* not actually used within intuit, but zero for safety anyway */
 662     reginfo->poscache_maxiter = 0;
 663
 664     if (utf8_target) {
 665         if (!prog->check_utf8 && prog->check_substr)
 666             to_utf8_substr(prog);
 667         check = prog->check_utf8;
 668     } else {
 669         if (!prog->check_substr && prog->check_utf8) {
 670             if (! to_byte_substr(prog)) {
 671                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 672             }
 673         }
 674         check = prog->check_substr;
 675     }
 676     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 677         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 678                      || ( (prog->extflags & RXf_ANCH_BOL)
 679                           && !multiline ) );    /* Check after \n? */
 680
 681         if (!ml_anch) {
 682           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 683                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 684                && (strpos != strbeg)) {
 685               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 686               goto fail;
 687           }
 688           if (prog->check_offset_min == prog->check_offset_max
 689               && !(prog->extflags & RXf_CANY_SEEN)
 690               && ! multiline)   /* /m can cause \n's to match that aren't
 691                                    accounted for in the string max length.
 692                                    See [perl #115242] */
 693           {
 694             /* Substring at constant offset from beg-of-str... */
 695             I32 slen;
 696
 697             s = HOP3c(strpos, prog->check_offset_min, strend);
 698
 699             if (SvTAIL(check)) {
 700                 slen = SvCUR(check);    /* >= 1 */
 701
 702                 if ( strend - s > slen || strend - s < slen - 1
 703                      || (strend - s == slen && strend[-1] != '\n')) {
 704                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 705                     goto fail_finish;
 706                 }
 707                 /* Now should match s[0..slen-2] */
 708                 slen--;
 709                 if (slen && (*SvPVX_const(check) != *s
 710                              || (slen > 1
 711                                  && memNE(SvPVX_const(check), s, slen)))) {
 712                   report_neq:
 713                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 714                     goto fail_finish;
 715                 }
 716             }
 717             else if (*SvPVX_const(check) != *s
 718                      || ((slen = SvCUR(check)) > 1
 719                          && memNE(SvPVX_const(check), s, slen)))
 720                 goto report_neq;
 721             check_at = s;
 722             goto success_at_start;
 723           }
 724         }
 725         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 726         s = strpos;
 727         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 728         end_shift = prog->check_end_shift;
 729
 730         if (!ml_anch) {
 731             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 732                                          - (SvTAIL(check) != 0);
 733             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 734
 735             if (end_shift < eshift)
 736                 end_shift = eshift;
 737         }
 738     }
 739     else {                              /* Can match at random position */
 740         ml_anch = 0;
 741         s = strpos;
 742         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 743         end_shift = prog->check_end_shift;
 744
 745         /* end shift should be non negative here */
 746     }
 747
 748 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 749     if (end_shift < 0)
 750         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 751                    (IV)end_shift, RX_PRECOMP(prog));
 752 #endif
 753
 754   restart:
 755     /* Find a possible match in the region s..strend by looking for
 756        the "check" substring in the region corrected by start/end_shift. */
 757
 758     {
 759         I32 srch_start_shift = start_shift;
 760         I32 srch_end_shift = end_shift;
 761         U8* start_point;
 762         U8* end_point;
 763         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 764             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 765             srch_start_shift = strbeg - s;
 766         }
 767     DEBUG_OPTIMISE_MORE_r({
 768         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 769             (IV)prog->check_offset_min,
 770             (IV)srch_start_shift,
 771             (IV)srch_end_shift,
 772             (IV)prog->check_end_shift);
 773     });
 774
 775         if (prog->extflags & RXf_CANY_SEEN) {
 776             start_point= (U8*)(s + srch_start_shift);
 777             end_point= (U8*)(strend - srch_end_shift);
 778         } else {
 779             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 780             end_point= HOP3(strend, -srch_end_shift, strbeg);
 781         }
 782         DEBUG_OPTIMISE_MORE_r({
 783             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 784                 (int)(end_point - start_point),
 785                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 786                 start_point);
 787         });
 788
 789         s = fbm_instr( start_point, end_point,
 790                       check, multiline ? FBMrf_MULTILINE : 0);
 791     }
 792     /* Update the count-of-usability, remove useless subpatterns,
 793         unshift s.  */
 794
 795     DEBUG_EXECUTE_r({
 796         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 797             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 798         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 799                           (s ? "Found" : "Did not find"),
 800             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 801                 ? "anchored" : "floating"),
 802             quoted,
 803             RE_SV_TAIL(check),
 804             (s ? " at offset " : "...\n") );
 805     });
 806
 807     if (!s)
 808         goto fail_finish;
 809     /* Finish the diagnostic message */
 810     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 811
 812     /* XXX dmq: first branch is for positive lookbehind...
 813        Our check string is offset from the beginning of the pattern.
 814        So we need to do any stclass tests offset forward from that
 815        point. I think. :-(
 816      */
 817
 818
 819
 820     check_at=s;
 821
 822
 823     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 824        Start with the other substr.
 825        XXXX no SCREAM optimization yet - and a very coarse implementation
 826        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 827                 *always* match.  Probably should be marked during compile...
 828        Probably it is right to do no SCREAM here...
 829      */
 830
 831     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 832                 : (prog->float_substr && prog->anchored_substr))
 833     {
 834         /* Take into account the "other" substring. */
 835         /* XXXX May be hopelessly wrong for UTF... */
 836         if (!other_last)
 837             other_last = strpos;
 838         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 839           do_other_anchored:
 840             {
 841                 char * const last = HOP3c(s, -start_shift, strbeg);
 842                 char *last1, *last2;
 843                 char * const saved_s = s;
 844                 SV* must;
 845
 846                 t = s - prog->check_offset_max;
 847                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 848                     && (!utf8_target
 849                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 850                             && t > strpos)))
 851                     NOOP;
 852                 else
 853                     t = strpos;
 854                 t = HOP3c(t, prog->anchored_offset, strend);
 855                 if (t < other_last)     /* These positions already checked */
 856                     t = other_last;
 857                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 858                 if (last < last1)
 859                     last1 = last;
 860                 /* XXXX It is not documented what units *_offsets are in.
 861                    We assume bytes, but this is clearly wrong.
 862                    Meaning this code needs to be carefully reviewed for errors.
 863                    dmq.
 864                   */
 865
 866                 /* On end-of-str: see comment below. */
 867                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 868                 if (must == &PL_sv_undef) {
 869                     s = (char*)NULL;
 870                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 871                 }
 872                 else
 873                     s = fbm_instr(
 874                         (unsigned char*)t,
 875                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 876                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 877                         must,
 878                         multiline ? FBMrf_MULTILINE : 0
 879                     );
 880                 DEBUG_EXECUTE_r({
 881                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 882                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 883                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 884                         (s ? "Found" : "Contradicts"),
 885                         quoted, RE_SV_TAIL(must));
 886                 });
 887
 888
 889                 if (!s) {
 890                     if (last1 >= last2) {
 891                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 892                                                 ", giving up...\n"));
 893                         goto fail_finish;
 894                     }
 895                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 896                         ", trying floating at offset %ld...\n",
 897                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 898                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 899                     s = HOP3c(last, 1, strend);
 900                     goto restart;
 901                 }
 902                 else {
 903                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 904                           (long)(s - i_strpos)));
 905                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 906                     other_last = HOP3c(s, 1, strend);
 907                     s = saved_s;
 908                     if (t == strpos)
 909                         goto try_at_start;
 910                     goto try_at_offset;
 911                 }
 912             }
 913         }
 914         else {          /* Take into account the floating substring. */
 915             char *last, *last1;
 916             char * const saved_s = s;
 917             SV* must;
 918
 919             t = HOP3c(s, -start_shift, strbeg);
 920             last1 = last =
 921                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 922             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 923                 last = HOP3c(t, prog->float_max_offset, strend);
 924             s = HOP3c(t, prog->float_min_offset, strend);
 925             if (s < other_last)
 926                 s = other_last;
 927  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 928             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 929             /* fbm_instr() takes into account exact value of end-of-str
 930                if the check is SvTAIL(ed).  Since false positives are OK,
 931                and end-of-str is not later than strend we are OK. */
 932             if (must == &PL_sv_undef) {
 933                 s = (char*)NULL;
 934                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 935             }
 936             else
 937                 s = fbm_instr((unsigned char*)s,
 938                               (unsigned char*)last + SvCUR(must)
 939                                   - (SvTAIL(must)!=0),
 940                               must, multiline ? FBMrf_MULTILINE : 0);
 941             DEBUG_EXECUTE_r({
 942                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 943                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 944                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 945                     (s ? "Found" : "Contradicts"),
 946                     quoted, RE_SV_TAIL(must));
 947             });
 948             if (!s) {
 949                 if (last1 == last) {
 950                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 951                                             ", giving up...\n"));
 952                     goto fail_finish;
 953                 }
 954                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 955                     ", trying anchored starting at offset %ld...\n",
 956                     (long)(saved_s + 1 - i_strpos)));
 957                 other_last = last;
 958                 s = HOP3c(t, 1, strend);
 959                 goto restart;
 960             }
 961             else {
 962                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 963                       (long)(s - i_strpos)));
 964                 other_last = s; /* Fix this later. --Hugo */
 965                 s = saved_s;
 966                 if (t == strpos)
 967                     goto try_at_start;
 968                 goto try_at_offset;
 969             }
 970         }
 971     }
 972
 973
 974     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 975
 976     DEBUG_OPTIMISE_MORE_r(
 977         PerlIO_printf(Perl_debug_log,
 978             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 979             (IV)prog->check_offset_min,
 980             (IV)prog->check_offset_max,
 981             (IV)(s-strpos),
 982             (IV)(t-strpos),
 983             (IV)(t-s),
 984             (IV)(strend-strpos)
 985         )
 986     );
 987
 988     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 989         && (!utf8_target
 990             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 991                  && t > strpos)))
 992     {
 993         /* Fixed substring is found far enough so that the match
 994            cannot start at strpos. */
 995       try_at_offset:
 996         if (ml_anch && t[-1] != '\n') {
 997             /* Eventually fbm_*() should handle this, but often
 998                anchored_offset is not 0, so this check will not be wasted. */
 999             /* XXXX In the code below we prefer to look for "^" even in
1000                presence of anchored substrings.  And we search even
1001                beyond the found float position.  These pessimizations
1002                are historical artefacts only.  */
1003           find_anchor:
1004             while (t < strend - prog->minlen) {
1005                 if (*t == '\n') {
1006                     if (t < check_at - prog->check_offset_min) {
1007                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1008                             /* Since we moved from the found position,
1009                                we definitely contradict the found anchored
1010                                substr.  Due to the above check we do not
1011                                contradict "check" substr.
1012                                Thus we can arrive here only if check substr
1013                                is float.  Redo checking for "other"=="fixed".
1014                              */
1015                             strpos = t + 1;
1016                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1017                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1018                             goto do_other_anchored;
1019                         }
1020                         /* We don't contradict the found floating substring. */
1021                         /* XXXX Why not check for STCLASS? */
1022                         s = t + 1;
1023                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1024                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1025                         goto set_useful;
1026                     }
1027                     /* Position contradicts check-string */
1028                     /* XXXX probably better to look for check-string
1029                        than for "\n", so one should lower the limit for t? */
1030                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1031                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1032                     other_last = strpos = s = t + 1;
1033                     goto restart;
1034                 }
1035                 t++;
1036             }
1037             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1038                         PL_colors[0], PL_colors[1]));
1039             goto fail_finish;
1040         }
1041         else {
1042             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1043                         PL_colors[0], PL_colors[1]));
1044         }
1045         s = t;
1046       set_useful:
1047         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1048     }
1049     else {
1050         /* The found string does not prohibit matching at strpos,
1051            - no optimization of calling REx engine can be performed,
1052            unless it was an MBOL and we are not after MBOL,
1053            or a future STCLASS check will fail this. */
1054       try_at_start:
1055         /* Even in this situation we may use MBOL flag if strpos is offset
1056            wrt the start of the string. */
1057         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1058             /* May be due to an implicit anchor of m{.*foo}  */
1059             && !(prog->intflags & PREGf_IMPLICIT))
1060         {
1061             t = strpos;
1062             goto find_anchor;
1063         }
1064         DEBUG_EXECUTE_r( if (ml_anch)
1065             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1066                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1067         );
1068       success_at_start:
1069         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1070             && (utf8_target ? (
1071                 prog->check_utf8                /* Could be deleted already */
1072                 && --BmUSEFUL(prog->check_utf8) < 0
1073                 && (prog->check_utf8 == prog->float_utf8)
1074             ) : (
1075                 prog->check_substr              /* Could be deleted already */
1076                 && --BmUSEFUL(prog->check_substr) < 0
1077                 && (prog->check_substr == prog->float_substr)
1078             )))
1079         {
1080             /* If flags & SOMETHING - do not do it many times on the same match */
1081             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1082             /* XXX Does the destruction order has to change with utf8_target? */
1083             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1084             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1085             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1086             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1087             check = NULL;                       /* abort */
1088             s = strpos;
1089             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1090                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1091             if (prog->intflags & PREGf_IMPLICIT)
1092                 prog->extflags &= ~RXf_ANCH_MBOL;
1093             /* XXXX This is a remnant of the old implementation.  It
1094                     looks wasteful, since now INTUIT can use many
1095                     other heuristics. */
1096             prog->extflags &= ~RXf_USE_INTUIT;
1097             /* XXXX What other flags might need to be cleared in this branch? */
1098         }
1099         else
1100             s = strpos;
1101     }
1102
1103     /* Last resort... */
1104     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1105     /* trie stclasses are too expensive to use here, we are better off to
1106        leave it to regmatch itself */
1107     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1108         /* minlen == 0 is possible if regstclass is \b or \B,
1109            and the fixed substr is ''$.
1110            Since minlen is already taken into account, s+1 is before strend;
1111            accidentally, minlen >= 1 guaranties no false positives at s + 1
1112            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1113            regstclass does not come from lookahead...  */
1114         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1115            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1116         const U8* const str = (U8*)STRING(progi->regstclass);
1117         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1118                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1119                     : 1);
1120         char * endpos;
1121         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1122             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1123         else if (prog->float_substr || prog->float_utf8)
1124             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1125         else
1126             endpos= strend;
1127
1128         if (checked_upto < s)
1129            checked_upto = s;
1130         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1132
1133         t = s;
1134         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1135                             reginfo);
1136         if (s) {
1137             checked_upto = s;
1138         } else {
1139 #ifdef DEBUGGING
1140             const char *what = NULL;
1141 #endif
1142             if (endpos == strend) {
1143                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1144                                 "Could not match STCLASS...\n") );
1145                 goto fail;
1146             }
1147             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148                                    "This position contradicts STCLASS...\n") );
1149             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1150                 goto fail;
1151             checked_upto = HOPBACKc(endpos, start_shift);
1152             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1153                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1154             /* Contradict one of substrings */
1155             if (prog->anchored_substr || prog->anchored_utf8) {
1156                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1157                     DEBUG_EXECUTE_r( what = "anchored" );
1158                   hop_and_restart:
1159                     s = HOP3c(t, 1, strend);
1160                     if (s + start_shift + end_shift > strend) {
1161                         /* XXXX Should be taken into account earlier? */
1162                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1163                                                "Could not match STCLASS...\n") );
1164                         goto fail;
1165                     }
1166                     if (!check)
1167                         goto giveup;
1168                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1169                                 "Looking for %s substr starting at offset %ld...\n",
1170                                  what, (long)(s + start_shift - i_strpos)) );
1171                     goto restart;
1172                 }
1173                 /* Have both, check_string is floating */
1174                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1175                     goto retry_floating_check;
1176                 /* Recheck anchored substring, but not floating... */
1177                 s = check_at;
1178                 if (!check)
1179                     goto giveup;
1180                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1181                           "Looking for anchored substr starting at offset %ld...\n",
1182                           (long)(other_last - i_strpos)) );
1183                 goto do_other_anchored;
1184             }
1185             /* Another way we could have checked stclass at the
1186                current position only: */
1187             if (ml_anch) {
1188                 s = t = t + 1;
1189                 if (!check)
1190                     goto giveup;
1191                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1192                           "Looking for /%s^%s/m starting at offset %ld...\n",
1193                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1194                 goto try_at_offset;
1195             }
1196             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1197                 goto fail;
1198             /* Check is floating substring. */
1199           retry_floating_check:
1200             t = check_at - start_shift;
1201             DEBUG_EXECUTE_r( what = "floating" );
1202             goto hop_and_restart;
1203         }
1204         if (t != s) {
1205             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1206                         "By STCLASS: moving %ld --> %ld\n",
1207                                   (long)(t - i_strpos), (long)(s - i_strpos))
1208                    );
1209         }
1210         else {
1211             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1212                                   "Does not contradict STCLASS...\n");
1213                    );
1214         }
1215     }
1216   giveup:
1217     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1218                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1219                           PL_colors[5], (long)(s - i_strpos)) );
1220     return s;
1221
1222   fail_finish:                          /* Substring not found */
1223     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1224         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1225   fail:
1226     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1227                           PL_colors[4], PL_colors[5]));
1228     return NULL;
1229 }
1230
1231 #define DECL_TRIE_TYPE(scan) \
1232     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1233                     trie_type = ((scan->flags == EXACT) \
1234                               ? (utf8_target ? trie_utf8 : trie_plain) \
1235                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1236
1237 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1238 STMT_START {                               \
1239     STRLEN skiplen;                                                                 \
1240     switch (trie_type) {                                                            \
1241     case trie_utf8_fold:                                                            \
1242         if ( foldlen>0 ) {                                                          \
1243             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1244             foldlen -= len;                                                         \
1245             uscan += len;                                                           \
1246             len=0;                                                                  \
1247         } else {                                                                    \
1248             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1249             len = UTF8SKIP(uc);                                                     \
1250             skiplen = UNISKIP( uvc );                                               \
1251             foldlen -= skiplen;                                                     \
1252             uscan = foldbuf + skiplen;                                              \
1253         }                                                                           \
1254         break;                                                                      \
1255     case trie_latin_utf8_fold:                                                      \
1256         if ( foldlen>0 ) {                                                          \
1257             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1258             foldlen -= len;                                                         \
1259             uscan += len;                                                           \
1260             len=0;                                                                  \
1261         } else {                                                                    \
1262             len = 1;                                                                \
1263             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);   \
1264             skiplen = UNISKIP( uvc );                                               \
1265             foldlen -= skiplen;                                                     \
1266             uscan = foldbuf + skiplen;                                              \
1267         }                                                                           \
1268         break;                                                                      \
1269     case trie_utf8:                                                                 \
1270         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1271         break;                                                                      \
1272     case trie_plain:                                                                \
1273         uvc = (UV)*uc;                                                              \
1274         len = 1;                                                                    \
1275     }                                                                               \
1276     if (uvc < 256) {                                                                \
1277         charid = trie->charmap[ uvc ];                                              \
1278     }                                                                               \
1279     else {                                                                          \
1280         charid = 0;                                                                 \
1281         if (widecharmap) {                                                          \
1282             SV** const svpp = hv_fetch(widecharmap,                                 \
1283                         (char*)&uvc, sizeof(UV), 0);                                \
1284             if (svpp)                                                               \
1285                 charid = (U16)SvIV(*svpp);                                          \
1286         }                                                                           \
1287     }                                                                               \
1288 } STMT_END
1289
1290 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1291 STMT_START {                                              \
1292     while (s <= e) {                                      \
1293         if ( (CoNd)                                       \
1294              && (ln == 1 || folder(s, pat_string, ln))    \
1295              && (reginfo->intuit || regtry(reginfo, &s)) )\
1296             goto got_it;                                  \
1297         s++;                                              \
1298     }                                                     \
1299 } STMT_END
1300
1301 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1302 STMT_START {                                          \
1303     while (s < strend) {                              \
1304         CoDe                                          \
1305         s += UTF8SKIP(s);                             \
1306     }                                                 \
1307 } STMT_END
1308
1309 #define REXEC_FBC_SCAN(CoDe)                          \
1310 STMT_START {                                          \
1311     while (s < strend) {                              \
1312         CoDe                                          \
1313         s++;                                          \
1314     }                                                 \
1315 } STMT_END
1316
1317 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1318 REXEC_FBC_UTF8_SCAN(                                  \
1319     if (CoNd) {                                       \
1320         if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1321             goto got_it;                              \
1322         else                                          \
1323             tmp = doevery;                            \
1324     }                                                 \
1325     else                                              \
1326         tmp = 1;                                      \
1327 )
1328
1329 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1330 REXEC_FBC_SCAN(                                       \
1331     if (CoNd) {                                       \
1332         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1333             goto got_it;                              \
1334         else                                          \
1335             tmp = doevery;                            \
1336     }                                                 \
1337     else                                              \
1338         tmp = 1;                                      \
1339 )
1340
1341 #define REXEC_FBC_TRYIT               \
1342 if ((reginfo->intuit || regtry(reginfo, &s))) \
1343     goto got_it
1344
1345 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1346     if (utf8_target) {                                             \
1347         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1348     }                                                          \
1349     else {                                                     \
1350         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1351     }
1352
1353 #define DUMP_EXEC_POS(li,s,doutf8) \
1354     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1355                 (PL_reg_starttry),doutf8)
1356
1357
1358 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1360         tmp = TEST_NON_UTF8(tmp);                                              \
1361         REXEC_FBC_UTF8_SCAN(                                                   \
1362             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1363                 tmp = !tmp;                                                    \
1364                 IF_SUCCESS;                                                    \
1365             }                                                                  \
1366             else {                                                             \
1367                 IF_FAIL;                                                       \
1368             }                                                                  \
1369         );                                                                     \
1370
1371 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1372         if (s == reginfo->strbeg) {                                            \
1373             tmp = '\n';                                                        \
1374         }                                                                      \
1375         else {                                                                 \
1376             U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1377             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1378         }                                                                      \
1379         tmp = TeSt1_UtF8;                                                      \
1380         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1381         REXEC_FBC_UTF8_SCAN(                                                   \
1382             if (tmp == ! (TeSt2_UtF8)) { \
1383                 tmp = !tmp;                                                    \
1384                 IF_SUCCESS;                                                    \
1385             }                                                                  \
1386             else {                                                             \
1387                 IF_FAIL;                                                       \
1388             }                                                                  \
1389         );                                                                     \
1390
1391 /* The only difference between the BOUND and NBOUND cases is that
1392  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1393  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1394  * with the other one being empty */
1395 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1396     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1397
1398 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1400
1401 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1402     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1403
1404 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1405     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1406
1407
1408 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1409  * be passed in completely with the variable name being tested, which isn't
1410  * such a clean interface, but this is easier to read than it was before.  We
1411  * are looking for the boundary (or non-boundary between a word and non-word
1412  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1413  * must be different.  Find the "wordness" of the character just prior to this
1414  * one, and compare it with the wordness of this one.  If they differ, we have
1415  * a boundary.  At the beginning of the string, pretend that the previous
1416  * character was a new-line */
1417 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1418     if (utf8_target) {                                                         \
1419                 UTF8_CODE \
1420     }                                                                          \
1421     else {  /* Not utf8 */                                                     \
1422         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1423         tmp = TEST_NON_UTF8(tmp);                                              \
1424         REXEC_FBC_SCAN(                                                        \
1425             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1426                 tmp = !tmp;                                                    \
1427                 IF_SUCCESS;                                                    \
1428             }                                                                  \
1429             else {                                                             \
1430                 IF_FAIL;                                                       \
1431             }                                                                  \
1432         );                                                                     \
1433     }                                                                          \
1434     if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))           \
1435         goto got_it;
1436
1437 /* We know what class REx starts with.  Try to find this position... */
1438 /* if reginfo->intuit, its a dryrun */
1439 /* annoyingly all the vars in this routine have different names from their counterparts
1440    in regmatch. /grrr */
1441
1442 STATIC char *
1443 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1444     const char *strend, regmatch_info *reginfo)
1445 {
1446     dVAR;
1447     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1448     char *pat_string;   /* The pattern's exactish string */
1449     char *pat_end;          /* ptr to end char of pat_string */
1450     re_fold_t folder;   /* Function for computing non-utf8 folds */
1451     const U8 *fold_array;   /* array for folding ords < 256 */
1452     STRLEN ln;
1453     STRLEN lnc;
1454     U8 c1;
1455     U8 c2;
1456     char *e;
1457     I32 tmp = 1;        /* Scratch variable? */
1458     const bool utf8_target = reginfo->is_utf8_target;
1459     UV utf8_fold_flags = 0;
1460     const bool is_utf8_pat = reginfo->is_utf8_pat;
1461     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1462                                    with a result inverts that result, as 0^1 =
1463                                    1 and 1^1 = 0 */
1464     _char_class_number classnum;
1465
1466     RXi_GET_DECL(prog,progi);
1467
1468     PERL_ARGS_ASSERT_FIND_BYCLASS;
1469
1470     /* We know what class it must start with. */
1471     switch (OP(c)) {
1472     case ANYOF:
1473     case ANYOF_SYNTHETIC:
1474     case ANYOF_WARN_SUPER:
1475         if (utf8_target) {
1476             REXEC_FBC_UTF8_CLASS_SCAN(
1477                       reginclass(prog, c, (U8*)s, utf8_target));
1478         }
1479         else {
1480             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1481         }
1482         break;
1483     case CANY:
1484         REXEC_FBC_SCAN(
1485             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1486                 goto got_it;
1487             else
1488                 tmp = doevery;
1489         );
1490         break;
1491
1492     case EXACTFA:
1493         if (is_utf8_pat || utf8_target) {
1494             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1495             goto do_exactf_utf8;
1496         }
1497         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1498         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1499         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1500
1501     case EXACTF:
1502         if (utf8_target) {
1503
1504             /* regcomp.c already folded this if pattern is in UTF-8 */
1505             utf8_fold_flags = 0;
1506             goto do_exactf_utf8;
1507         }
1508         fold_array = PL_fold;
1509         folder = foldEQ;
1510         goto do_exactf_non_utf8;
1511
1512     case EXACTFL:
1513         if (is_utf8_pat || utf8_target) {
1514             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1515             goto do_exactf_utf8;
1516         }
1517         fold_array = PL_fold_locale;
1518         folder = foldEQ_locale;
1519         goto do_exactf_non_utf8;
1520
1521     case EXACTFU_SS:
1522         if (is_utf8_pat) {
1523             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1524         }
1525         goto do_exactf_utf8;
1526
1527     case EXACTFU_TRICKYFOLD:
1528     case EXACTFU:
1529         if (is_utf8_pat || utf8_target) {
1530             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1531             goto do_exactf_utf8;
1532         }
1533
1534         /* Any 'ss' in the pattern should have been replaced by regcomp,
1535          * so we don't have to worry here about this single special case
1536          * in the Latin1 range */
1537         fold_array = PL_fold_latin1;
1538         folder = foldEQ_latin1;
1539
1540         /* FALL THROUGH */
1541
1542     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1543                            are no glitches with fold-length differences
1544                            between the target string and pattern */
1545
1546         /* The idea in the non-utf8 EXACTF* cases is to first find the
1547          * first character of the EXACTF* node and then, if necessary,
1548          * case-insensitively compare the full text of the node.  c1 is the
1549          * first character.  c2 is its fold.  This logic will not work for
1550          * Unicode semantics and the german sharp ss, which hence should
1551          * not be compiled into a node that gets here. */
1552         pat_string = STRING(c);
1553         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1554
1555         /* We know that we have to match at least 'ln' bytes (which is the
1556          * same as characters, since not utf8).  If we have to match 3
1557          * characters, and there are only 2 availabe, we know without
1558          * trying that it will fail; so don't start a match past the
1559          * required minimum number from the far end */
1560         e = HOP3c(strend, -((I32)ln), s);
1561
1562         if (reginfo->intuit && e < s) {
1563             e = s;                      /* Due to minlen logic of intuit() */
1564         }
1565
1566         c1 = *pat_string;
1567         c2 = fold_array[c1];
1568         if (c1 == c2) { /* If char and fold are the same */
1569             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1570         }
1571         else {
1572             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1573         }
1574         break;
1575
1576     do_exactf_utf8:
1577     {
1578         unsigned expansion;
1579
1580         /* If one of the operands is in utf8, we can't use the simpler folding
1581          * above, due to the fact that many different characters can have the
1582          * same fold, or portion of a fold, or different- length fold */
1583         pat_string = STRING(c);
1584         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1585         pat_end = pat_string + ln;
1586         lnc = is_utf8_pat       /* length to match in characters */
1587                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1588                 : ln;
1589
1590         /* We have 'lnc' characters to match in the pattern, but because of
1591          * multi-character folding, each character in the target can match
1592          * up to 3 characters (Unicode guarantees it will never exceed
1593          * this) if it is utf8-encoded; and up to 2 if not (based on the
1594          * fact that the Latin 1 folds are already determined, and the
1595          * only multi-char fold in that range is the sharp-s folding to
1596          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1597          * string character.  Adjust lnc accordingly, rounding up, so that
1598          * if we need to match at least 4+1/3 chars, that really is 5. */
1599         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1600         lnc = (lnc + expansion - 1) / expansion;
1601
1602         /* As in the non-UTF8 case, if we have to match 3 characters, and
1603          * only 2 are left, it's guaranteed to fail, so don't start a
1604          * match that would require us to go beyond the end of the string
1605          */
1606         e = HOP3c(strend, -((I32)lnc), s);
1607
1608         if (reginfo->intuit && e < s) {
1609             e = s;                      /* Due to minlen logic of intuit() */
1610         }
1611
1612         /* XXX Note that we could recalculate e to stop the loop earlier,
1613          * as the worst case expansion above will rarely be met, and as we
1614          * go along we would usually find that e moves further to the left.
1615          * This would happen only after we reached the point in the loop
1616          * where if there were no expansion we should fail.  Unclear if
1617          * worth the expense */
1618
1619         while (s <= e) {
1620             char *my_strend= (char *)strend;
1621             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1622                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1623                 && (reginfo->intuit || regtry(reginfo, &s)) )
1624             {
1625                 goto got_it;
1626             }
1627             s += (utf8_target) ? UTF8SKIP(s) : 1;
1628         }
1629         break;
1630     }
1631     case BOUNDL:
1632         RXp_MATCH_TAINTED_on(prog);
1633         FBC_BOUND(isWORDCHAR_LC,
1634                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1635                   isWORDCHAR_LC_utf8((U8*)s));
1636         break;
1637     case NBOUNDL:
1638         RXp_MATCH_TAINTED_on(prog);
1639         FBC_NBOUND(isWORDCHAR_LC,
1640                    isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1641                    isWORDCHAR_LC_utf8((U8*)s));
1642         break;
1643     case BOUND:
1644         FBC_BOUND(isWORDCHAR,
1645                   isWORDCHAR_uni(tmp),
1646                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1647         break;
1648     case BOUNDA:
1649         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1650                          isWORDCHAR_A(tmp),
1651                          isWORDCHAR_A((U8*)s));
1652         break;
1653     case NBOUND:
1654         FBC_NBOUND(isWORDCHAR,
1655                    isWORDCHAR_uni(tmp),
1656                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1657         break;
1658     case NBOUNDA:
1659         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1660                           isWORDCHAR_A(tmp),
1661                           isWORDCHAR_A((U8*)s));
1662         break;
1663     case BOUNDU:
1664         FBC_BOUND(isWORDCHAR_L1,
1665                   isWORDCHAR_uni(tmp),
1666                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1667         break;
1668     case NBOUNDU:
1669         FBC_NBOUND(isWORDCHAR_L1,
1670                    isWORDCHAR_uni(tmp),
1671                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1672         break;
1673     case LNBREAK:
1674         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1675                         is_LNBREAK_latin1_safe(s, strend)
1676         );
1677         break;
1678
1679     /* The argument to all the POSIX node types is the class number to pass to
1680      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1681
1682     case NPOSIXL:
1683         to_complement = 1;
1684         /* FALLTHROUGH */
1685
1686     case POSIXL:
1687         RXp_MATCH_TAINTED_on(prog);
1688         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1689                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1690         break;
1691
1692     case NPOSIXD:
1693         to_complement = 1;
1694         /* FALLTHROUGH */
1695
1696     case POSIXD:
1697         if (utf8_target) {
1698             goto posix_utf8;
1699         }
1700         goto posixa;
1701
1702     case NPOSIXA:
1703         if (utf8_target) {
1704             /* The complement of something that matches only ASCII matches all
1705              * UTF-8 variant code points, plus everything in ASCII that isn't
1706              * in the class */
1707             REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1708                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1709             break;
1710         }
1711
1712         to_complement = 1;
1713         /* FALLTHROUGH */
1714
1715     case POSIXA:
1716       posixa:
1717         /* Don't need to worry about utf8, as it can match only a single
1718          * byte invariant character. */
1719         REXEC_FBC_CLASS_SCAN(
1720                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1721         break;
1722
1723     case NPOSIXU:
1724         to_complement = 1;
1725         /* FALLTHROUGH */
1726
1727     case POSIXU:
1728         if (! utf8_target) {
1729             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1730                                                                     FLAGS(c))));
1731         }
1732         else {
1733
1734       posix_utf8:
1735             classnum = (_char_class_number) FLAGS(c);
1736             if (classnum < _FIRST_NON_SWASH_CC) {
1737                 while (s < strend) {
1738
1739                     /* We avoid loading in the swash as long as possible, but
1740                      * should we have to, we jump to a separate loop.  This
1741                      * extra 'if' statement is what keeps this code from being
1742                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1743                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1744                         goto found_above_latin1;
1745                     }
1746                     if ((UTF8_IS_INVARIANT(*s)
1747                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1748                                                                 classnum)))
1749                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1750                             && to_complement ^ cBOOL(
1751                                 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1752                                               classnum))))
1753                     {
1754                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1755                             goto got_it;
1756                         else {
1757                             tmp = doevery;
1758                         }
1759                     }
1760                     else {
1761                         tmp = 1;
1762                     }
1763                     s += UTF8SKIP(s);
1764                 }
1765             }
1766             else switch (classnum) {    /* These classes are implemented as
1767                                            macros */
1768                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1769                                         revert the change of \v matching this */
1770                     /* FALL THROUGH */
1771
1772                 case _CC_ENUM_PSXSPC:
1773                     REXEC_FBC_UTF8_CLASS_SCAN(
1774                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
1775                     break;
1776
1777                 case _CC_ENUM_BLANK:
1778                     REXEC_FBC_UTF8_CLASS_SCAN(
1779                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
1780                     break;
1781
1782                 case _CC_ENUM_XDIGIT:
1783                     REXEC_FBC_UTF8_CLASS_SCAN(
1784                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1785                     break;
1786
1787                 case _CC_ENUM_VERTSPACE:
1788                     REXEC_FBC_UTF8_CLASS_SCAN(
1789                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
1790                     break;
1791
1792                 case _CC_ENUM_CNTRL:
1793                     REXEC_FBC_UTF8_CLASS_SCAN(
1794                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
1795                     break;
1796
1797                 default:
1798                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1799                     assert(0); /* NOTREACHED */
1800             }
1801         }
1802         break;
1803
1804       found_above_latin1:   /* Here we have to load a swash to get the result
1805                                for the current code point */
1806         if (! PL_utf8_swash_ptrs[classnum]) {
1807             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1808             PL_utf8_swash_ptrs[classnum] =
1809                     _core_swash_init("utf8", swash_property_names[classnum],
1810                                      &PL_sv_undef, 1, 0, NULL, &flags);
1811         }
1812
1813         /* This is a copy of the loop above for swash classes, though using the
1814          * FBC macro instead of being expanded out.  Since we've loaded the
1815          * swash, we don't have to check for that each time through the loop */
1816         REXEC_FBC_UTF8_CLASS_SCAN(
1817                 to_complement ^ cBOOL(_generic_utf8(
1818                                       classnum,
1819                                       s,
1820                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
1821                                                   (U8 *) s, TRUE))));
1822         break;
1823
1824     case AHOCORASICKC:
1825     case AHOCORASICK:
1826         {
1827             DECL_TRIE_TYPE(c);
1828             /* what trie are we using right now */
1829             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1830             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1831             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1832
1833             const char *last_start = strend - trie->minlen;
1834 #ifdef DEBUGGING
1835             const char *real_start = s;
1836 #endif
1837             STRLEN maxlen = trie->maxlen;
1838             SV *sv_points;
1839             U8 **points; /* map of where we were in the input string
1840                             when reading a given char. For ASCII this
1841                             is unnecessary overhead as the relationship
1842                             is always 1:1, but for Unicode, especially
1843                             case folded Unicode this is not true. */
1844             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1845             U8 *bitmap=NULL;
1846
1847
1848             GET_RE_DEBUG_FLAGS_DECL;
1849
1850             /* We can't just allocate points here. We need to wrap it in
1851              * an SV so it gets freed properly if there is a croak while
1852              * running the match */
1853             ENTER;
1854             SAVETMPS;
1855             sv_points=newSV(maxlen * sizeof(U8 *));
1856             SvCUR_set(sv_points,
1857                 maxlen * sizeof(U8 *));
1858             SvPOK_on(sv_points);
1859             sv_2mortal(sv_points);
1860             points=(U8**)SvPV_nolen(sv_points );
1861             if ( trie_type != trie_utf8_fold
1862                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1863             {
1864                 if (trie->bitmap)
1865                     bitmap=(U8*)trie->bitmap;
1866                 else
1867                     bitmap=(U8*)ANYOF_BITMAP(c);
1868             }
1869             /* this is the Aho-Corasick algorithm modified a touch
1870                to include special handling for long "unknown char" sequences.
1871                The basic idea being that we use AC as long as we are dealing
1872                with a possible matching char, when we encounter an unknown char
1873                (and we have not encountered an accepting state) we scan forward
1874                until we find a legal starting char.
1875                AC matching is basically that of trie matching, except that when
1876                we encounter a failing transition, we fall back to the current
1877                states "fail state", and try the current char again, a process
1878                we repeat until we reach the root state, state 1, or a legal
1879                transition. If we fail on the root state then we can either
1880                terminate if we have reached an accepting state previously, or
1881                restart the entire process from the beginning if we have not.
1882
1883              */
1884             while (s <= last_start) {
1885                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1886                 U8 *uc = (U8*)s;
1887                 U16 charid = 0;
1888                 U32 base = 1;
1889                 U32 state = 1;
1890                 UV uvc = 0;
1891                 STRLEN len = 0;
1892                 STRLEN foldlen = 0;
1893                 U8 *uscan = (U8*)NULL;
1894                 U8 *leftmost = NULL;
1895 #ifdef DEBUGGING
1896                 U32 accepted_word= 0;
1897 #endif
1898                 U32 pointpos = 0;
1899
1900                 while ( state && uc <= (U8*)strend ) {
1901                     int failed=0;
1902                     U32 word = aho->states[ state ].wordnum;
1903
1904                     if( state==1 ) {
1905                         if ( bitmap ) {
1906                             DEBUG_TRIE_EXECUTE_r(
1907                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1908                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1909                                         (char *)uc, utf8_target );
1910                                     PerlIO_printf( Perl_debug_log,
1911                                         " Scanning for legal start char...\n");
1912                                 }
1913                             );
1914                             if (utf8_target) {
1915                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1916                                     uc += UTF8SKIP(uc);
1917                                 }
1918                             } else {
1919                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1920                                     uc++;
1921                                 }
1922                             }
1923                             s= (char *)uc;
1924                         }
1925                         if (uc >(U8*)last_start) break;
1926                     }
1927
1928                     if ( word ) {
1929                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1930                         if (!leftmost || lpos < leftmost) {
1931                             DEBUG_r(accepted_word=word);
1932                             leftmost= lpos;
1933                         }
1934                         if (base==0) break;
1935
1936                     }
1937                     points[pointpos++ % maxlen]= uc;
1938                     if (foldlen || uc < (U8*)strend) {
1939                         REXEC_TRIE_READ_CHAR(trie_type, trie,
1940                                          widecharmap, uc,
1941                                          uscan, len, uvc, charid, foldlen,
1942                                          foldbuf, uniflags);
1943                         DEBUG_TRIE_EXECUTE_r({
1944                             dump_exec_pos( (char *)uc, c, strend,
1945                                         real_start, s, utf8_target);
1946                             PerlIO_printf(Perl_debug_log,
1947                                 " Charid:%3u CP:%4"UVxf" ",
1948                                  charid, uvc);
1949                         });
1950                     }
1951                     else {
1952                         len = 0;
1953                         charid = 0;
1954                     }
1955
1956
1957                     do {
1958 #ifdef DEBUGGING
1959                         word = aho->states[ state ].wordnum;
1960 #endif
1961                         base = aho->states[ state ].trans.base;
1962
1963                         DEBUG_TRIE_EXECUTE_r({
1964                             if (failed)
1965                                 dump_exec_pos( (char *)uc, c, strend, real_start,
1966                                     s,   utf8_target );
1967                             PerlIO_printf( Perl_debug_log,
1968                                 "%sState: %4"UVxf", word=%"UVxf,
1969                                 failed ? " Fail transition to " : "",
1970                                 (UV)state, (UV)word);
1971                         });
1972                         if ( base ) {
1973                             U32 tmp;
1974                             I32 offset;
1975                             if (charid &&
1976                                  ( ((offset = base + charid
1977                                     - 1 - trie->uniquecharcount)) >= 0)
1978                                  && ((U32)offset < trie->lasttrans)
1979                                  && trie->trans[offset].check == state
1980                                  && (tmp=trie->trans[offset].next))
1981                             {
1982                                 DEBUG_TRIE_EXECUTE_r(
1983                                     PerlIO_printf( Perl_debug_log," - legal\n"));
1984                                 state = tmp;
1985                                 break;
1986                             }
1987                             else {
1988                                 DEBUG_TRIE_EXECUTE_r(
1989                                     PerlIO_printf( Perl_debug_log," - fail\n"));
1990                                 failed = 1;
1991                                 state = aho->fail[state];
1992                             }
1993                         }
1994                         else {
1995                             /* we must be accepting here */
1996                             DEBUG_TRIE_EXECUTE_r(
1997                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
1998                             failed = 1;
1999                             break;
2000                         }
2001                     } while(state);
2002                     uc += len;
2003                     if (failed) {
2004                         if (leftmost)
2005                             break;
2006                         if (!state) state = 1;
2007                     }
2008                 }
2009                 if ( aho->states[ state ].wordnum ) {
2010                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2011                     if (!leftmost || lpos < leftmost) {
2012                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2013                         leftmost = lpos;
2014                     }
2015                 }
2016                 if (leftmost) {
2017                     s = (char*)leftmost;
2018                     DEBUG_TRIE_EXECUTE_r({
2019                         PerlIO_printf(
2020                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2021                             (UV)accepted_word, (IV)(s - real_start)
2022                         );
2023                     });
2024                     if (reginfo->intuit || regtry(reginfo, &s)) {
2025                         FREETMPS;
2026                         LEAVE;
2027                         goto got_it;
2028                     }
2029                     s = HOPc(s,1);
2030                     DEBUG_TRIE_EXECUTE_r({
2031                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2032                     });
2033                 } else {
2034                     DEBUG_TRIE_EXECUTE_r(
2035                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2036                     break;
2037                 }
2038             }
2039             FREETMPS;
2040             LEAVE;
2041         }
2042         break;
2043     default:
2044         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2045         break;
2046     }
2047     return 0;
2048   got_it:
2049     return s;
2050 }
2051
2052
2053 /*
2054  - regexec_flags - match a regexp against a string
2055  */
2056 I32
2057 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2058               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2059 /* stringarg: the point in the string at which to begin matching */
2060 /* strend:    pointer to null at end of string */
2061 /* strbeg:    real beginning of string */
2062 /* minend:    end of match must be >= minend bytes after stringarg. */
2063 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2064  *            itself is accessed via the pointers above */
2065 /* data:      May be used for some additional optimizations.
2066               Currently its only used, with a U32 cast, for transmitting
2067               the ganch offset when doing a /g match. This will change */
2068 /* nosave:    For optimizations. */
2069
2070 {
2071     dVAR;
2072     struct regexp *const prog = ReANY(rx);
2073     char *s;
2074     regnode *c;
2075     char *startpos = stringarg;
2076     I32 minlen;         /* must match at least this many chars */
2077     I32 dontbother = 0; /* how many characters not to try at end */
2078     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2079     I32 scream_pos = -1;                /* Internal iterator of scream. */
2080     char *scream_olds = NULL;
2081     const bool utf8_target = cBOOL(DO_UTF8(sv));
2082     I32 multiline;
2083     RXi_GET_DECL(prog,progi);
2084     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2085     regmatch_info *const reginfo = &reginfo_buf;
2086     regexp_paren_pair *swap = NULL;
2087     I32 oldsave;
2088     GET_RE_DEBUG_FLAGS_DECL;
2089
2090     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2091     PERL_UNUSED_ARG(data);
2092
2093     /* Be paranoid... */
2094     if (prog == NULL || startpos == NULL) {
2095         Perl_croak(aTHX_ "NULL regexp parameter");
2096         return 0;
2097     }
2098
2099     multiline = prog->extflags & RXf_PMf_MULTILINE;
2100
2101     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2102     reginfo->intuit = 0;
2103     reginfo->is_utf8_target = cBOOL(utf8_target);
2104
2105     /* on first ever match, allocate first slab */
2106     if (!PL_regmatch_slab) {
2107         Newx(PL_regmatch_slab, 1, regmatch_slab);
2108         PL_regmatch_slab->prev = NULL;
2109         PL_regmatch_slab->next = NULL;
2110         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2111     }
2112
2113     /* note current PL_regmatch_state position; at end of match we'll
2114      * pop back to there and free any higher slabs */
2115     oldsave = PL_savestack_ix;
2116
2117     DEBUG_EXECUTE_r(
2118         debug_start_match(rx, utf8_target, startpos, strend,
2119         "Matching");
2120     );
2121
2122     minlen = prog->minlen;
2123
2124     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2125         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2126                               "String too short [regexec_flags]...\n"));
2127         goto phooey;
2128     }
2129
2130
2131     /* Check validity of program. */
2132     if (UCHARAT(progi->program) != REG_MAGIC) {
2133         Perl_croak(aTHX_ "corrupted regexp program");
2134     }
2135
2136     RX_MATCH_TAINTED_off(rx);
2137
2138     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2139     reginfo->warned = FALSE;
2140     reginfo->strbeg  = strbeg;
2141     reginfo->sv = sv;
2142     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2143
2144     /* Mark end of string for $ (and such) */
2145     reginfo->strend = strend;
2146
2147     /* see how far we have to get to not match where we matched before */
2148     reginfo->till = startpos+minend;
2149
2150     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2151     SAVEVPTR(PL_regmatch_slab);
2152     SAVEVPTR(PL_regmatch_state);
2153
2154     /* grab next slot in regmatch_state stack to store regmatch_info_aux
2155      * struct */
2156
2157     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2158     if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2159         PL_regmatch_state = S_push_slab(aTHX);
2160
2161     SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2162
2163
2164     if ((prog->extflags & RXf_EVAL_SEEN)) {
2165         /* grab next slot in regmatch_state stack to store
2166          * regmatch_info_aux_eval struct */
2167
2168         reginfo->info_aux_eval =
2169         reginfo->info_aux->info_aux_eval =
2170                             &(PL_regmatch_state->u.info_aux_eval);
2171
2172         if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2173             PL_regmatch_state = S_push_slab(aTHX);
2174
2175         S_setup_eval_state(aTHX_ reginfo);
2176     }
2177     else
2178         reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2179
2180
2181     /* If there is a "must appear" string, look for it. */
2182     s = startpos;
2183
2184     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2185         MAGIC *mg;
2186         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2187             reginfo->ganch = startpos + prog->gofs;
2188             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189               "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2190         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2191                   && SvMAGIC(sv)
2192                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2193                   && mg->mg_len >= 0) {
2194             reginfo->ganch = strbeg + mg->mg_len;       /* Defined pos() */
2195             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2196                 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2197
2198             if (prog->extflags & RXf_ANCH_GPOS) {
2199                 if (s > reginfo->ganch)
2200                     goto phooey;
2201                 s = reginfo->ganch - prog->gofs;
2202                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2203                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2204                 if (s < strbeg)
2205                     goto phooey;
2206             }
2207         }
2208         else if (data) {
2209             reginfo->ganch = strbeg + PTR2UV(data);
2210             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2211                  "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2212
2213         } else {                                /* pos() not defined */
2214             reginfo->ganch = strbeg;
2215             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2216                  "GPOS: reginfo->ganch = strbeg\n"));
2217         }
2218     }
2219     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2220         /* We have to be careful. If the previous successful match
2221            was from this regex we don't want a subsequent partially
2222            successful match to clobber the old results.
2223            So when we detect this possibility we add a swap buffer
2224            to the re, and switch the buffer each match. If we fail,
2225            we switch it back; otherwise we leave it swapped.
2226         */
2227         swap = prog->offs;
2228         /* do we need a save destructor here for eval dies? */
2229         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2230         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2231             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2232             PTR2UV(prog),
2233             PTR2UV(swap),
2234             PTR2UV(prog->offs)
2235         ));
2236     }
2237     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2238         re_scream_pos_data d;
2239
2240         d.scream_olds = &scream_olds;
2241         d.scream_pos = &scream_pos;
2242         s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2243         if (!s) {
2244             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2245             goto phooey;        /* not present */
2246         }
2247     }
2248
2249
2250
2251     /* Simplest case:  anchored match need be tried only once. */
2252     /*  [unless only anchor is BOL and multiline is set] */
2253     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2254         if (s == startpos && regtry(reginfo, &startpos))
2255             goto got_it;
2256         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2257                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2258         {
2259             char *end;
2260
2261             if (minlen)
2262                 dontbother = minlen - 1;
2263             end = HOP3c(strend, -dontbother, strbeg) - 1;
2264             /* for multiline we only have to try after newlines */
2265             if (prog->check_substr || prog->check_utf8) {
2266                 /* because of the goto we can not easily reuse the macros for bifurcating the
2267                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2268                 if (utf8_target) {
2269                     if (s == startpos)
2270                         goto after_try_utf8;
2271                     while (1) {
2272                         if (regtry(reginfo, &s)) {
2273                             goto got_it;
2274                         }
2275                       after_try_utf8:
2276                         if (s > end) {
2277                             goto phooey;
2278                         }
2279                         if (prog->extflags & RXf_USE_INTUIT) {
2280                             s = re_intuit_start(rx, sv, strbeg,
2281                                     s + UTF8SKIP(s), strend, flags, NULL);
2282                             if (!s) {
2283                                 goto phooey;
2284                             }
2285                         }
2286                         else {
2287                             s += UTF8SKIP(s);
2288                         }
2289                     }
2290                 } /* end search for check string in unicode */
2291                 else {
2292                     if (s == startpos) {
2293                         goto after_try_latin;
2294                     }
2295                     while (1) {
2296                         if (regtry(reginfo, &s)) {
2297                             goto got_it;
2298                         }
2299                       after_try_latin:
2300                         if (s > end) {
2301                             goto phooey;
2302                         }
2303                         if (prog->extflags & RXf_USE_INTUIT) {
2304                             s = re_intuit_start(rx, sv, strbeg,
2305                                         s + 1, strend, flags, NULL);
2306                             if (!s) {
2307                                 goto phooey;
2308                             }
2309                         }
2310                         else {
2311                             s++;
2312                         }
2313                     }
2314                 } /* end search for check string in latin*/
2315             } /* end search for check string */
2316             else { /* search for newline */
2317                 if (s > startpos) {
2318                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2319                     s--;
2320                 }
2321                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2322                 while (s <= end) { /* note it could be possible to match at the end of the string */
2323                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2324                         if (regtry(reginfo, &s))
2325                             goto got_it;
2326                     }
2327                 }
2328             } /* end search for newline */
2329         } /* end anchored/multiline check string search */
2330         goto phooey;
2331     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2332     {
2333         /* the warning about reginfo->ganch being used without initialization
2334            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2335            and we only enter this block when the same bit is set. */
2336         char *tmp_s = reginfo->ganch - prog->gofs;
2337
2338         if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2339             goto got_it;
2340         goto phooey;
2341     }
2342
2343     /* Messy cases:  unanchored match. */
2344     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2345         /* we have /x+whatever/ */
2346         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2347         char ch;
2348 #ifdef DEBUGGING
2349         int did_match = 0;
2350 #endif
2351         if (utf8_target) {
2352             if (! prog->anchored_utf8) {
2353                 to_utf8_substr(prog);
2354             }
2355             ch = SvPVX_const(prog->anchored_utf8)[0];
2356             REXEC_FBC_SCAN(
2357                 if (*s == ch) {
2358                     DEBUG_EXECUTE_r( did_match = 1 );
2359                     if (regtry(reginfo, &s)) goto got_it;
2360                     s += UTF8SKIP(s);
2361                     while (s < strend && *s == ch)
2362                         s += UTF8SKIP(s);
2363                 }
2364             );
2365
2366         }
2367         else {
2368             if (! prog->anchored_substr) {
2369                 if (! to_byte_substr(prog)) {
2370                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2371                 }
2372             }
2373             ch = SvPVX_const(prog->anchored_substr)[0];
2374             REXEC_FBC_SCAN(
2375                 if (*s == ch) {
2376                     DEBUG_EXECUTE_r( did_match = 1 );
2377                     if (regtry(reginfo, &s)) goto got_it;
2378                     s++;
2379                     while (s < strend && *s == ch)
2380                         s++;
2381                 }
2382             );
2383         }
2384         DEBUG_EXECUTE_r(if (!did_match)
2385                 PerlIO_printf(Perl_debug_log,
2386                                   "Did not find anchored character...\n")
2387                );
2388     }
2389     else if (prog->anchored_substr != NULL
2390               || prog->anchored_utf8 != NULL
2391               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2392                   && prog->float_max_offset < strend - s)) {
2393         SV *must;
2394         I32 back_max;
2395         I32 back_min;
2396         char *last;
2397         char *last1;            /* Last position checked before */
2398 #ifdef DEBUGGING
2399         int did_match = 0;
2400 #endif
2401         if (prog->anchored_substr || prog->anchored_utf8) {
2402             if (utf8_target) {
2403                 if (! prog->anchored_utf8) {
2404                     to_utf8_substr(prog);
2405                 }
2406                 must = prog->anchored_utf8;
2407             }
2408             else {
2409                 if (! prog->anchored_substr) {
2410                     if (! to_byte_substr(prog)) {
2411                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2412                     }
2413                 }
2414                 must = prog->anchored_substr;
2415             }
2416             back_max = back_min = prog->anchored_offset;
2417         } else {
2418             if (utf8_target) {
2419                 if (! prog->float_utf8) {
2420                     to_utf8_substr(prog);
2421                 }
2422                 must = prog->float_utf8;
2423             }
2424             else {
2425                 if (! prog->float_substr) {
2426                     if (! to_byte_substr(prog)) {
2427                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2428                     }
2429                 }
2430                 must = prog->float_substr;
2431             }
2432             back_max = prog->float_max_offset;
2433             back_min = prog->float_min_offset;
2434         }
2435
2436         if (back_min<0) {
2437             last = strend;
2438         } else {
2439             last = HOP3c(strend,        /* Cannot start after this */
2440                   -(I32)(CHR_SVLEN(must)
2441                          - (SvTAIL(must) != 0) + back_min), strbeg);
2442         }
2443         if (s > reginfo->strbeg)
2444             last1 = HOPc(s, -1);
2445         else
2446             last1 = s - 1;      /* bogus */
2447
2448         /* XXXX check_substr already used to find "s", can optimize if
2449            check_substr==must. */
2450         scream_pos = -1;
2451         dontbother = end_shift;
2452         strend = HOPc(strend, -dontbother);
2453         while ( (s <= last) &&
2454                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2455                                   (unsigned char*)strend, must,
2456                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2457             DEBUG_EXECUTE_r( did_match = 1 );
2458             if (HOPc(s, -back_max) > last1) {
2459                 last1 = HOPc(s, -back_min);
2460                 s = HOPc(s, -back_max);
2461             }
2462             else {
2463                 char * const t = (last1 >= reginfo->strbeg)
2464                                     ? HOPc(last1, 1) : last1 + 1;
2465
2466                 last1 = HOPc(s, -back_min);
2467                 s = t;
2468             }
2469             if (utf8_target) {
2470                 while (s <= last1) {
2471                     if (regtry(reginfo, &s))
2472                         goto got_it;
2473                     if (s >= last1) {
2474                         s++; /* to break out of outer loop */
2475                         break;
2476                     }
2477                     s += UTF8SKIP(s);
2478                 }
2479             }
2480             else {
2481                 while (s <= last1) {
2482                     if (regtry(reginfo, &s))
2483                         goto got_it;
2484                     s++;
2485                 }
2486             }
2487         }
2488         DEBUG_EXECUTE_r(if (!did_match) {
2489             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2490                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2491             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2492                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2493                                ? "anchored" : "floating"),
2494                 quoted, RE_SV_TAIL(must));
2495         });
2496         goto phooey;
2497     }
2498     else if ( (c = progi->regstclass) ) {
2499         if (minlen) {
2500             const OPCODE op = OP(progi->regstclass);
2501             /* don't bother with what can't match */
2502             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2503                 strend = HOPc(strend, -(minlen - 1));
2504         }
2505         DEBUG_EXECUTE_r({
2506             SV * const prop = sv_newmortal();
2507             regprop(prog, prop, c);
2508             {
2509                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2510                     s,strend-s,60);
2511                 PerlIO_printf(Perl_debug_log,
2512                     "Matching stclass %.*s against %s (%d bytes)\n",
2513                     (int)SvCUR(prop), SvPVX_const(prop),
2514                      quoted, (int)(strend - s));
2515             }
2516         });
2517         if (find_byclass(prog, c, s, strend, reginfo))
2518             goto got_it;
2519         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2520     }
2521     else {
2522         dontbother = 0;
2523         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2524             /* Trim the end. */
2525             char *last= NULL;
2526             SV* float_real;
2527             STRLEN len;
2528             const char *little;
2529
2530             if (utf8_target) {
2531                 if (! prog->float_utf8) {
2532                     to_utf8_substr(prog);
2533                 }
2534                 float_real = prog->float_utf8;
2535             }
2536             else {
2537                 if (! prog->float_substr) {
2538                     if (! to_byte_substr(prog)) {
2539                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2540                     }
2541                 }
2542                 float_real = prog->float_substr;
2543             }
2544
2545             little = SvPV_const(float_real, len);
2546             if (SvTAIL(float_real)) {
2547                     /* This means that float_real contains an artificial \n on
2548                      * the end due to the presence of something like this:
2549                      * /foo$/ where we can match both "foo" and "foo\n" at the
2550                      * end of the string.  So we have to compare the end of the
2551                      * string first against the float_real without the \n and
2552                      * then against the full float_real with the string.  We
2553                      * have to watch out for cases where the string might be
2554                      * smaller than the float_real or the float_real without
2555                      * the \n. */
2556                     char *checkpos= strend - len;
2557                     DEBUG_OPTIMISE_r(
2558                         PerlIO_printf(Perl_debug_log,
2559                             "%sChecking for float_real.%s\n",
2560                             PL_colors[4], PL_colors[5]));
2561                     if (checkpos + 1 < strbeg) {
2562                         /* can't match, even if we remove the trailing \n
2563                          * string is too short to match */
2564                         DEBUG_EXECUTE_r(
2565                             PerlIO_printf(Perl_debug_log,
2566                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2567                                 PL_colors[4], PL_colors[5]));
2568                         goto phooey;
2569                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2570                         /* can match, the end of the string matches without the
2571                          * "\n" */
2572                         last = checkpos + 1;
2573                     } else if (checkpos < strbeg) {
2574                         /* cant match, string is too short when the "\n" is
2575                          * included */
2576                         DEBUG_EXECUTE_r(
2577                             PerlIO_printf(Perl_debug_log,
2578                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2579                                 PL_colors[4], PL_colors[5]));
2580                         goto phooey;
2581                     } else if (!multiline) {
2582                         /* non multiline match, so compare with the "\n" at the
2583                          * end of the string */
2584                         if (memEQ(checkpos, little, len)) {
2585                             last= checkpos;
2586                         } else {
2587                             DEBUG_EXECUTE_r(
2588                                 PerlIO_printf(Perl_debug_log,
2589                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2590                                     PL_colors[4], PL_colors[5]));
2591                             goto phooey;
2592                         }
2593                     } else {
2594                         /* multiline match, so we have to search for a place
2595                          * where the full string is located */
2596                         goto find_last;
2597                     }
2598             } else {
2599                   find_last:
2600                     if (len)
2601                         last = rninstr(s, strend, little, little + len);
2602                     else
2603                         last = strend;  /* matching "$" */
2604             }
2605             if (!last) {
2606                 /* at one point this block contained a comment which was
2607                  * probably incorrect, which said that this was a "should not
2608                  * happen" case.  Even if it was true when it was written I am
2609                  * pretty sure it is not anymore, so I have removed the comment
2610                  * and replaced it with this one. Yves */
2611                 DEBUG_EXECUTE_r(
2612                     PerlIO_printf(Perl_debug_log,
2613                         "String does not contain required substring, cannot match.\n"
2614                     ));
2615                 goto phooey;
2616             }
2617             dontbother = strend - last + prog->float_min_offset;
2618         }
2619         if (minlen && (dontbother < minlen))
2620             dontbother = minlen - 1;
2621         strend -= dontbother;              /* this one's always in bytes! */
2622         /* We don't know much -- general case. */
2623         if (utf8_target) {
2624             for (;;) {
2625                 if (regtry(reginfo, &s))
2626                     goto got_it;
2627                 if (s >= strend)
2628                     break;
2629                 s += UTF8SKIP(s);
2630             };
2631         }
2632         else {
2633             do {
2634                 if (regtry(reginfo, &s))
2635                     goto got_it;
2636             } while (s++ < strend);
2637         }
2638     }
2639
2640     /* Failure. */
2641     goto phooey;
2642
2643 got_it:
2644     DEBUG_BUFFERS_r(
2645         if (swap)
2646             PerlIO_printf(Perl_debug_log,
2647                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2648                 PTR2UV(prog),
2649                 PTR2UV(swap)
2650             );
2651     );
2652     Safefree(swap);
2653
2654     /* clean up; this will trigger destructors that will free all slabs
2655      * above the current one, and cleanup the regmatch_info_aux
2656      * and regmatch_info_aux_eval sructs */
2657
2658     LEAVE_SCOPE(oldsave);
2659
2660     if (RXp_PAREN_NAMES(prog))
2661         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2662
2663     RX_MATCH_UTF8_set(rx, utf8_target);
2664
2665     /* make sure $`, $&, $', and $digit will work later */
2666     if ( !(flags & REXEC_NOT_FIRST) ) {
2667         if (flags & REXEC_COPY_STR) {
2668 #ifdef PERL_ANY_COW
2669             if (SvCANCOW(sv)) {
2670                 if (DEBUG_C_TEST) {
2671                     PerlIO_printf(Perl_debug_log,
2672                                   "Copy on write: regexp capture, type %d\n",
2673                                   (int) SvTYPE(sv));
2674                 }
2675                 RX_MATCH_COPY_FREE(rx);
2676                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2677                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2678                 assert (SvPOKp(prog->saved_copy));
2679                 prog->sublen  = reginfo->strend - strbeg;
2680                 prog->suboffset = 0;
2681                 prog->subcoffset = 0;
2682             } else
2683 #endif
2684             {
2685                 I32 min = 0;
2686                 I32 max = reginfo->strend - strbeg;
2687                 I32 sublen;
2688
2689                 if (    (flags & REXEC_COPY_SKIP_POST)
2690                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2691                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2692                 ) { /* don't copy $' part of string */
2693                     U32 n = 0;
2694                     max = -1;
2695                     /* calculate the right-most part of the string covered
2696                      * by a capture. Due to look-ahead, this may be to
2697                      * the right of $&, so we have to scan all captures */
2698                     while (n <= prog->lastparen) {
2699                         if (prog->offs[n].end > max)
2700                             max = prog->offs[n].end;
2701                         n++;
2702                     }
2703                     if (max == -1)
2704                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2705                                 ? prog->offs[0].start
2706                                 : 0;
2707                     assert(max >= 0 && max <= reginfo->strend - strbeg);
2708                 }
2709
2710                 if (    (flags & REXEC_COPY_SKIP_PRE)
2711                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2712                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2713                 ) { /* don't copy $` part of string */
2714                     U32 n = 0;
2715                     min = max;
2716                     /* calculate the left-most part of the string covered
2717                      * by a capture. Due to look-behind, this may be to
2718                      * the left of $&, so we have to scan all captures */
2719                     while (min && n <= prog->lastparen) {
2720                         if (   prog->offs[n].start != -1
2721                             && prog->offs[n].start < min)
2722                         {
2723                             min = prog->offs[n].start;
2724                         }
2725                         n++;
2726                     }
2727                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2728                         && min >  prog->offs[0].end
2729                     )
2730                         min = prog->offs[0].end;
2731
2732                 }
2733
2734                 assert(min >= 0 && min <= max
2735                     && min <= reginfo->strend - strbeg);
2736                 sublen = max - min;
2737
2738                 if (RX_MATCH_COPIED(rx)) {
2739                     if (sublen > prog->sublen)
2740                         prog->subbeg =
2741                                 (char*)saferealloc(prog->subbeg, sublen+1);
2742                 }
2743                 else
2744                     prog->subbeg = (char*)safemalloc(sublen+1);
2745                 Copy(strbeg + min, prog->subbeg, sublen, char);
2746                 prog->subbeg[sublen] = '\0';
2747                 prog->suboffset = min;
2748                 prog->sublen = sublen;
2749                 RX_MATCH_COPIED_on(rx);
2750             }
2751             prog->subcoffset = prog->suboffset;
2752             if (prog->suboffset && utf8_target) {
2753                 /* Convert byte offset to chars.
2754                  * XXX ideally should only compute this if @-/@+
2755                  * has been seen, a la PL_sawampersand ??? */
2756
2757                 /* If there's a direct correspondence between the
2758                  * string which we're matching and the original SV,
2759                  * then we can use the utf8 len cache associated with
2760                  * the SV. In particular, it means that under //g,
2761                  * sv_pos_b2u() will use the previously cached
2762                  * position to speed up working out the new length of
2763                  * subcoffset, rather than counting from the start of
2764                  * the string each time. This stops
2765                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2766                  * from going quadratic */
2767                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2768                     sv_pos_b2u(sv, &(prog->subcoffset));
2769                 else
2770                     prog->subcoffset = utf8_length((U8*)strbeg,
2771                                         (U8*)(strbeg+prog->suboffset));
2772             }
2773         }
2774         else {
2775             RX_MATCH_COPY_FREE(rx);
2776             prog->subbeg = strbeg;
2777             prog->suboffset = 0;
2778             prog->subcoffset = 0;
2779             /* use reginfo->strend, as strend may have been modified */
2780             prog->sublen = reginfo->strend - strbeg;
2781         }
2782     }
2783
2784     return 1;
2785
2786 phooey:
2787     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2788                           PL_colors[4], PL_colors[5]));
2789
2790     /* clean up; this will trigger destructors that will free all slabs
2791      * above the current one, and cleanup the regmatch_info_aux
2792      * and regmatch_info_aux_eval sructs */
2793
2794     LEAVE_SCOPE(oldsave);
2795
2796     if (swap) {
2797         /* we failed :-( roll it back */
2798         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2799             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2800             PTR2UV(prog),
2801             PTR2UV(prog->offs),
2802             PTR2UV(swap)
2803         ));
2804         Safefree(prog->offs);
2805         prog->offs = swap;
2806     }
2807     return 0;
2808 }
2809
2810
2811 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2812  * Do inc before dec, in case old and new rex are the same */
2813 #define SET_reg_curpm(Re2) \
2814     if (reginfo->info_aux_eval) {                   \
2815         (void)ReREFCNT_inc(Re2);                    \
2816         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2817         PM_SETRE((PL_reg_curpm), (Re2));            \
2818     }
2819
2820
2821 /*
2822  - regtry - try match at specific point
2823  */
2824 STATIC I32                      /* 0 failure, 1 success */
2825 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2826 {
2827     dVAR;
2828     CHECKPOINT lastcp;
2829     REGEXP *const rx = reginfo->prog;
2830     regexp *const prog = ReANY(rx);
2831     I32 result;
2832     RXi_GET_DECL(prog,progi);
2833     GET_RE_DEBUG_FLAGS_DECL;
2834
2835     PERL_ARGS_ASSERT_REGTRY;
2836
2837     reginfo->cutpoint=NULL;
2838
2839 #ifdef DEBUGGING
2840     PL_reg_starttry = *startposp;
2841 #endif
2842     prog->offs[0].start = *startposp - reginfo->strbeg;
2843     prog->lastparen = 0;
2844     prog->lastcloseparen = 0;
2845
2846     /* XXXX What this code is doing here?!!!  There should be no need
2847        to do this again and again, prog->lastparen should take care of
2848        this!  --ilya*/
2849
2850     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2851      * Actually, the code in regcppop() (which Ilya may be meaning by
2852      * prog->lastparen), is not needed at all by the test suite
2853      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2854      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2855      * Meanwhile, this code *is* needed for the
2856      * above-mentioned test suite tests to succeed.  The common theme
2857      * on those tests seems to be returning null fields from matches.
2858      * --jhi updated by dapm */
2859 #if 1
2860     if (prog->nparens) {
2861         regexp_paren_pair *pp = prog->offs;
2862         I32 i;
2863         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2864             ++pp;
2865             pp->start = -1;
2866             pp->end = -1;
2867         }
2868     }
2869 #endif
2870     REGCP_SET(lastcp);
2871     result = regmatch(reginfo, *startposp, progi->program + 1);
2872     if (result != -1) {
2873         prog->offs[0].end = result;
2874         return 1;
2875     }
2876     if (reginfo->cutpoint)
2877         *startposp= reginfo->cutpoint;
2878     REGCP_UNWIND(lastcp);
2879     return 0;
2880 }
2881
2882
2883 #define sayYES goto yes
2884 #define sayNO goto no
2885 #define sayNO_SILENT goto no_silent
2886
2887 /* we dont use STMT_START/END here because it leads to
2888    "unreachable code" warnings, which are bogus, but distracting. */
2889 #define CACHEsayNO \
2890     if (ST.cache_mask) \
2891        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2892     sayNO
2893
2894 /* this is used to determine how far from the left messages like
2895    'failed...' are printed. It should be set such that messages
2896    are inline with the regop output that created them.
2897 */
2898 #define REPORT_CODE_OFF 32
2899
2900
2901 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2902 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2903 #define CHRTEST_NOT_A_CP_1 -999
2904 #define CHRTEST_NOT_A_CP_2 -998
2905
2906 /* grab a new slab and return the first slot in it */
2907
2908 STATIC regmatch_state *
2909 S_push_slab(pTHX)
2910 {
2911 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2912     dMY_CXT;
2913 #endif
2914     regmatch_slab *s = PL_regmatch_slab->next;
2915     if (!s) {
2916         Newx(s, 1, regmatch_slab);
2917         s->prev = PL_regmatch_slab;
2918         s->next = NULL;
2919         PL_regmatch_slab->next = s;
2920     }
2921     PL_regmatch_slab = s;
2922     return SLAB_FIRST(s);
2923 }
2924
2925
2926 /* push a new state then goto it */
2927
2928 #define PUSH_STATE_GOTO(state, node, input) \
2929     pushinput = input; \
2930     scan = node; \
2931     st->resume_state = state; \
2932     goto push_state;
2933
2934 /* push a new state with success backtracking, then goto it */
2935
2936 #define PUSH_YES_STATE_GOTO(state, node, input) \
2937     pushinput = input; \
2938     scan = node; \
2939     st->resume_state = state; \
2940     goto push_yes_state;
2941
2942
2943
2944
2945 /*
2946
2947 regmatch() - main matching routine
2948
2949 This is basically one big switch statement in a loop. We execute an op,
2950 set 'next' to point the next op, and continue. If we come to a point which
2951 we may need to backtrack to on failure such as (A|B|C), we push a
2952 backtrack state onto the backtrack stack. On failure, we pop the top
2953 state, and re-enter the loop at the state indicated. If there are no more
2954 states to pop, we return failure.
2955
2956 Sometimes we also need to backtrack on success; for example /A+/, where
2957 after successfully matching one A, we need to go back and try to
2958 match another one; similarly for lookahead assertions: if the assertion
2959 completes successfully, we backtrack to the state just before the assertion
2960 and then carry on.  In these cases, the pushed state is marked as
2961 'backtrack on success too'. This marking is in fact done by a chain of
2962 pointers, each pointing to the previous 'yes' state. On success, we pop to
2963 the nearest yes state, discarding any intermediate failure-only states.
2964 Sometimes a yes state is pushed just to force some cleanup code to be
2965 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2966 it to free the inner regex.
2967
2968 Note that failure backtracking rewinds the cursor position, while
2969 success backtracking leaves it alone.
2970
2971 A pattern is complete when the END op is executed, while a subpattern
2972 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2973 ops trigger the "pop to last yes state if any, otherwise return true"
2974 behaviour.
2975
2976 A common convention in this function is to use A and B to refer to the two
2977 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2978 the subpattern to be matched possibly multiple times, while B is the entire
2979 rest of the pattern. Variable and state names reflect this convention.
2980
2981 The states in the main switch are the union of ops and failure/success of
2982 substates associated with with that op.  For example, IFMATCH is the op
2983 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2984 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2985 successfully matched A and IFMATCH_A_fail is a state saying that we have
2986 just failed to match A. Resume states always come in pairs. The backtrack
2987 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2988 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2989 on success or failure.
2990
2991 The struct that holds a backtracking state is actually a big union, with
2992 one variant for each major type of op. The variable st points to the
2993 top-most backtrack struct. To make the code clearer, within each
2994 block of code we #define ST to alias the relevant union.
2995
2996 Here's a concrete example of a (vastly oversimplified) IFMATCH
2997 implementation:
2998
2999     switch (state) {
3000     ....
3001
3002 #define ST st->u.ifmatch
3003
3004     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3005         ST.foo = ...; // some state we wish to save
3006         ...
3007         // push a yes backtrack state with a resume value of
3008         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3009         // first node of A:
3010         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3011         // NOTREACHED
3012
3013     case IFMATCH_A: // we have successfully executed A; now continue with B
3014         next = B;
3015         bar = ST.foo; // do something with the preserved value
3016         break;
3017
3018     case IFMATCH_A_fail: // A failed, so the assertion failed
3019         ...;   // do some housekeeping, then ...
3020         sayNO; // propagate the failure
3021
3022 #undef ST
3023
3024     ...
3025     }
3026
3027 For any old-timers reading this who are familiar with the old recursive
3028 approach, the code above is equivalent to:
3029
3030     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3031     {
3032         int foo = ...
3033         ...
3034         if (regmatch(A)) {
3035             next = B;
3036             bar = foo;
3037             break;
3038         }
3039         ...;   // do some housekeeping, then ...
3040         sayNO; // propagate the failure
3041     }
3042
3043 The topmost backtrack state, pointed to by st, is usually free. If you
3044 want to claim it, populate any ST.foo fields in it with values you wish to
3045 save, then do one of
3046
3047         PUSH_STATE_GOTO(resume_state, node, newinput);
3048         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3049
3050 which sets that backtrack state's resume value to 'resume_state', pushes a
3051 new free entry to the top of the backtrack stack, then goes to 'node'.
3052 On backtracking, the free slot is popped, and the saved state becomes the
3053 new free state. An ST.foo field in this new top state can be temporarily
3054 accessed to retrieve values, but once the main loop is re-entered, it
3055 becomes available for reuse.
3056
3057 Note that the depth of the backtrack stack constantly increases during the
3058 left-to-right execution of the pattern, rather than going up and down with
3059 the pattern nesting. For example the stack is at its maximum at Z at the
3060 end of the pattern, rather than at X in the following:
3061
3062     /(((X)+)+)+....(Y)+....Z/
3063
3064 The only exceptions to this are lookahead/behind assertions and the cut,
3065 (?>A), which pop all the backtrack states associated with A before
3066 continuing.
3067
3068 Backtrack state structs are allocated in slabs of about 4K in size.
3069 PL_regmatch_state and st always point to the currently active state,
3070 and PL_regmatch_slab points to the slab currently containing
3071 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3072 allocated, and is never freed until interpreter destruction. When the slab
3073 is full, a new one is allocated and chained to the end. At exit from
3074 regmatch(), slabs allocated since entry are freed.
3075
3076 */
3077
3078
3079 #define DEBUG_STATE_pp(pp)                                  \
3080     DEBUG_STATE_r({                                         \
3081         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3082         PerlIO_printf(Perl_debug_log,                       \
3083             "    %*s"pp" %s%s%s%s%s\n",                     \
3084             depth*2, "",                                    \
3085             PL_reg_name[st->resume_state],                     \
3086             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3087             ((st==yes_state) ? "Y" : ""),                   \
3088             ((st==mark_state) ? "M" : ""),                  \
3089             ((st==yes_state||st==mark_state) ? "]" : "")    \
3090         );                                                  \
3091     });
3092
3093
3094 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3095
3096 #ifdef DEBUGGING
3097
3098 STATIC void
3099 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3100     const char *start, const char *end, const char *blurb)
3101 {
3102     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3103
3104     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3105
3106     if (!PL_colorset)
3107             reginitcolors();
3108     {
3109         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3110             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3111
3112         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3113             start, end - start, 60);
3114
3115         PerlIO_printf(Perl_debug_log,
3116             "%s%s REx%s %s against %s\n",
3117                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3118
3119         if (utf8_target||utf8_pat)
3120             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3121                 utf8_pat ? "pattern" : "",
3122                 utf8_pat && utf8_target ? " and " : "",
3123                 utf8_target ? "string" : ""
3124             );
3125     }
3126 }
3127
3128 STATIC void
3129 S_dump_exec_pos(pTHX_ const char *locinput,
3130                       const regnode *scan,
3131                       const char *loc_regeol,
3132                       const char *loc_bostr,
3133                       const char *loc_reg_starttry,
3134                       const bool utf8_target)
3135 {
3136     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3137     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3138     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3139     /* The part of the string before starttry has one color
3140        (pref0_len chars), between starttry and current
3141        position another one (pref_len - pref0_len chars),
3142        after the current position the third one.
3143        We assume that pref0_len <= pref_len, otherwise we
3144        decrease pref0_len.  */
3145     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3146         ? (5 + taill) - l : locinput - loc_bostr;
3147     int pref0_len;
3148
3149     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3150
3151     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3152         pref_len++;
3153     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3154     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3155         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3156               ? (5 + taill) - pref_len : loc_regeol - locinput);
3157     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3158         l--;
3159     if (pref0_len < 0)
3160         pref0_len = 0;
3161     if (pref0_len > pref_len)
3162         pref0_len = pref_len;
3163     {
3164         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3165
3166         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3167             (locinput - pref_len),pref0_len, 60, 4, 5);
3168
3169         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3170                     (locinput - pref_len + pref0_len),
3171                     pref_len - pref0_len, 60, 2, 3);
3172
3173         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3174                     locinput, loc_regeol - locinput, 10, 0, 1);
3175
3176         const STRLEN tlen=len0+len1+len2;
3177         PerlIO_printf(Perl_debug_log,
3178                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3179                     (IV)(locinput - loc_bostr),
3180                     len0, s0,
3181                     len1, s1,
3182                     (docolor ? "" : "> <"),
3183                     len2, s2,
3184                     (int)(tlen > 19 ? 0 :  19 - tlen),
3185                     "");
3186     }
3187 }
3188
3189 #endif
3190
3191 /* reg_check_named_buff_matched()
3192  * Checks to see if a named buffer has matched. The data array of
3193  * buffer numbers corresponding to the buffer is expected to reside
3194  * in the regexp->data->data array in the slot stored in the ARG() of
3195  * node involved. Note that this routine doesn't actually care about the
3196  * name, that information is not preserved from compilation to execution.
3197  * Returns the index of the leftmost defined buffer with the given name
3198  * or 0 if non of the buffers matched.
3199  */
3200 STATIC I32
3201 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3202 {
3203     I32 n;
3204     RXi_GET_DECL(rex,rexi);
3205     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3206     I32 *nums=(I32*)SvPVX(sv_dat);
3207
3208     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3209
3210     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3211         if ((I32)rex->lastparen >= nums[n] &&
3212             rex->offs[nums[n]].end != -1)
3213         {
3214             return nums[n];
3215         }
3216     }
3217     return 0;
3218 }
3219
3220
3221 /* free all slabs above current one  - called during LEAVE_SCOPE */
3222
3223 STATIC void
3224 S_clear_backtrack_stack(pTHX_ void *p)
3225 {
3226     regmatch_slab *s = PL_regmatch_slab->next;
3227     PERL_UNUSED_ARG(p);
3228
3229     if (!s)
3230         return;
3231     PL_regmatch_slab->next = NULL;
3232     while (s) {
3233         regmatch_slab * const osl = s;
3234         s = s->next;
3235         Safefree(osl);
3236     }
3237 }
3238
3239
3240 static bool
3241 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3242         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3243 {
3244     /* This function determines if there are one or two characters that match
3245      * the first character of the passed-in EXACTish node <text_node>, and if
3246      * so, returns them in the passed-in pointers.
3247      *
3248      * If it determines that no possible character in the target string can
3249      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3250      * the first character in <text_node> requires UTF-8 to represent, and the
3251      * target string isn't in UTF-8.)
3252      *
3253      * If there are more than two characters that could match the beginning of
3254      * <text_node>, or if more context is required to determine a match or not,
3255      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3256      *
3257      * The motiviation behind this function is to allow the caller to set up
3258      * tight loops for matching.  If <text_node> is of type EXACT, there is
3259      * only one possible character that can match its first character, and so
3260      * the situation is quite simple.  But things get much more complicated if
3261      * folding is involved.  It may be that the first character of an EXACTFish
3262      * node doesn't participate in any possible fold, e.g., punctuation, so it
3263      * can be matched only by itself.  The vast majority of characters that are
3264      * in folds match just two things, their lower and upper-case equivalents.
3265      * But not all are like that; some have multiple possible matches, or match
3266      * sequences of more than one character.  This function sorts all that out.
3267      *
3268      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3269      * loop of trying to match A*, we know we can't exit where the thing
3270      * following it isn't a B.  And something can't be a B unless it is the
3271      * beginning of B.  By putting a quick test for that beginning in a tight
3272      * loop, we can rule out things that can't possibly be B without having to
3273      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3274      * character, we can make a tight loop matching A*, using the outputs of
3275      * this function.
3276      *
3277      * If the target string to match isn't in UTF-8, and there aren't
3278      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3279      * the one or two possible octets (which are characters in this situation)
3280      * that can match.  In all cases, if there is only one character that can
3281      * match, *<c1p> and *<c2p> will be identical.
3282      *
3283      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3284      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3285      * can match the beginning of <text_node>.  They should be declared with at
3286      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3287      * undefined what these contain.)  If one or both of the buffers are
3288      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3289      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3290      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3291      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3292      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3293
3294     const bool utf8_target = reginfo->is_utf8_target;
3295
3296     UV c1 = CHRTEST_NOT_A_CP_1;
3297     UV c2 = CHRTEST_NOT_A_CP_2;
3298     bool use_chrtest_void = FALSE;
3299     const bool is_utf8_pat = reginfo->is_utf8_pat;
3300
3301     /* Used when we have both utf8 input and utf8 output, to avoid converting
3302      * to/from code points */
3303     bool utf8_has_been_setup = FALSE;
3304
3305     dVAR;
3306
3307     U8 *pat = (U8*)STRING(text_node);
3308
3309     if (OP(text_node) == EXACT) {
3310
3311         /* In an exact node, only one thing can be matched, that first
3312          * character.  If both the pat and the target are UTF-8, we can just
3313          * copy the input to the output, avoiding finding the code point of
3314          * that character */
3315         if (!is_utf8_pat) {
3316             c2 = c1 = *pat;
3317         }
3318         else if (utf8_target) {
3319             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3320             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3321             utf8_has_been_setup = TRUE;
3322         }
3323         else {
3324             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3325         }
3326     }
3327     else /* an EXACTFish node */
3328          if ((is_utf8_pat
3329                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3330                                                     pat + STR_LEN(text_node)))
3331              || (!is_utf8_pat
3332                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3333                                                     pat + STR_LEN(text_node))))
3334     {
3335         /* Multi-character folds require more context to sort out.  Also
3336          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3337          * handled outside this routine */
3338         use_chrtest_void = TRUE;
3339     }
3340     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3341         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3342         if (c1 > 256) {
3343             /* Load the folds hash, if not already done */
3344             SV** listp;
3345             if (! PL_utf8_foldclosures) {
3346                 if (! PL_utf8_tofold) {
3347                     U8 dummy[UTF8_MAXBYTES+1];
3348
3349                     /* Force loading this by folding an above-Latin1 char */
3350                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3351                     assert(PL_utf8_tofold); /* Verify that worked */
3352                 }
3353                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3354             }
3355
3356             /* The fold closures data structure is a hash with the keys being
3357              * the UTF-8 of every character that is folded to, like 'k', and
3358              * the values each an array of all code points that fold to its
3359              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3360              * not included */
3361             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3362                                      (char *) pat,
3363                                      UTF8SKIP(pat),
3364                                      FALSE))))
3365             {
3366                 /* Not found in the hash, therefore there are no folds
3367                  * containing it, so there is only a single character that
3368                  * could match */
3369                 c2 = c1;
3370             }
3371             else {  /* Does participate in folds */
3372                 AV* list = (AV*) *listp;
3373                 if (av_len(list) != 1) {
3374
3375                     /* If there aren't exactly two folds to this, it is outside
3376                      * the scope of this function */
3377                     use_chrtest_void = TRUE;
3378                 }
3379                 else {  /* There are two.  Get them */
3380                     SV** c_p = av_fetch(list, 0, FALSE);
3381                     if (c_p == NULL) {
3382                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3383                     }
3384                     c1 = SvUV(*c_p);
3385
3386                     c_p = av_fetch(list, 1, FALSE);
3387                     if (c_p == NULL) {
3388                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3389                     }
3390                     c2 = SvUV(*c_p);
3391
3392                     /* Folds that cross the 255/256 boundary are forbidden if
3393                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3394                      * pattern character is above 256, and its only other match
3395                      * is below 256, the only legal match will be to itself.
3396                      * We have thrown away the original, so have to compute
3397                      * which is the one above 255 */
3398                     if ((c1 < 256) != (c2 < 256)) {
3399                         if (OP(text_node) == EXACTFL
3400                             || (OP(text_node) == EXACTFA
3401                                 && (isASCII(c1) || isASCII(c2))))
3402                         {
3403                             if (c1 < 256) {
3404                                 c1 = c2;
3405                             }
3406                             else {
3407                                 c2 = c1;
3408                             }
3409                         }
3410                     }
3411                 }
3412             }
3413         }
3414         else /* Here, c1 is < 255 */
3415              if (utf8_target
3416                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3417                  && OP(text_node) != EXACTFL
3418                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3419         {
3420             /* Here, there could be something above Latin1 in the target which
3421              * folds to this character in the pattern.  All such cases except
3422              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3423              * involved in their folds, so are outside the scope of this
3424              * function */
3425             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3426                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3427             }
3428             else {
3429                 use_chrtest_void = TRUE;
3430             }
3431         }
3432         else { /* Here nothing above Latin1 can fold to the pattern character */
3433             switch (OP(text_node)) {
3434
3435                 case EXACTFL:   /* /l rules */
3436                     c2 = PL_fold_locale[c1];
3437                     break;
3438
3439                 case EXACTF:
3440                     if (! utf8_target) {    /* /d rules */
3441                         c2 = PL_fold[c1];
3442                         break;
3443                     }
3444                     /* FALLTHROUGH */
3445                     /* /u rules for all these.  This happens to work for
3446                      * EXACTFA as nothing in Latin1 folds to ASCII */
3447                 case EXACTFA:
3448                 case EXACTFU_TRICKYFOLD:
3449                 case EXACTFU_SS:
3450                 case EXACTFU:
3451                     c2 = PL_fold_latin1[c1];
3452                     break;
3453
3454                 default:
3455                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3456                     assert(0); /* NOTREACHED */
3457             }
3458         }
3459     }
3460
3461     /* Here have figured things out.  Set up the returns */
3462     if (use_chrtest_void) {
3463         *c2p = *c1p = CHRTEST_VOID;
3464     }
3465     else if (utf8_target) {
3466         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3467             uvchr_to_utf8(c1_utf8, c1);
3468             uvchr_to_utf8(c2_utf8, c2);
3469         }
3470
3471         /* Invariants are stored in both the utf8 and byte outputs; Use
3472          * negative numbers otherwise for the byte ones.  Make sure that the
3473          * byte ones are the same iff the utf8 ones are the same */
3474         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3475         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3476                 ? *c2_utf8
3477                 : (c1 == c2)
3478                   ? CHRTEST_NOT_A_CP_1
3479                   : CHRTEST_NOT_A_CP_2;
3480     }
3481     else if (c1 > 255) {
3482        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3483                            can represent */
3484            return FALSE;
3485        }
3486
3487        *c1p = *c2p = c2;    /* c2 is the only representable value */
3488     }
3489     else {  /* c1 is representable; see about c2 */
3490        *c1p = c1;
3491        *c2p = (c2 < 256) ? c2 : c1;
3492     }
3493
3494     return TRUE;
3495 }
3496
3497 /* returns -1 on failure, $+[0] on success */
3498 STATIC I32
3499 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3500 {
3501 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3502     dMY_CXT;
3503 #endif
3504     dVAR;
3505     const bool utf8_target = reginfo->is_utf8_target;
3506     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3507     REGEXP *rex_sv = reginfo->prog;
3508     regexp *rex = ReANY(rex_sv);
3509     RXi_GET_DECL(rex,rexi);
3510     /* the current state. This is a cached copy of PL_regmatch_state */
3511     regmatch_state *st;
3512     /* cache heavy used fields of st in registers */
3513     regnode *scan;
3514     regnode *next;
3515     U32 n = 0;  /* general value; init to avoid compiler warning */
3516     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3517     char *locinput = startpos;
3518     char *pushinput; /* where to continue after a PUSH */
3519     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3520
3521     bool result = 0;        /* return value of S_regmatch */
3522     int depth = 0;          /* depth of backtrack stack */
3523     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3524     const U32 max_nochange_depth =
3525         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3526         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3527     regmatch_state *yes_state = NULL; /* state to pop to on success of
3528                                                             subpattern */
3529     /* mark_state piggy backs on the yes_state logic so that when we unwind
3530        the stack on success we can update the mark_state as we go */
3531     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3532     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3533     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3534     U32 state_num;
3535     bool no_final = 0;      /* prevent failure from backtracking? */
3536     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3537     char *startpoint = locinput;
3538     SV *popmark = NULL;     /* are we looking for a mark? */
3539     SV *sv_commit = NULL;   /* last mark name seen in failure */
3540     SV *sv_yes_mark = NULL; /* last mark name we have seen
3541                                during a successful match */
3542     U32 lastopen = 0;       /* last open we saw */
3543     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3544     SV* const oreplsv = GvSV(PL_replgv);
3545     /* these three flags are set by various ops to signal information to
3546      * the very next op. They have a useful lifetime of exactly one loop
3547      * iteration, and are not preserved or restored by state pushes/pops
3548      */
3549     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3550     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3551     int logical = 0;        /* the following EVAL is:
3552                                 0: (?{...})
3553                                 1: (?(?{...})X|Y)
3554                                 2: (??{...})
3555                                or the following IFMATCH/UNLESSM is:
3556                                 false: plain (?=foo)
3557                                 true:  used as a condition: (?(?=foo))
3558                             */
3559     PAD* last_pad = NULL;
3560     dMULTICALL;
3561     I32 gimme = G_SCALAR;
3562     CV *caller_cv = NULL;       /* who called us */
3563     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3564     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3565     U32 maxopenparen = 0;       /* max '(' index seen so far */
3566     int to_complement;  /* Invert the result? */
3567     _char_class_number classnum;
3568     bool is_utf8_pat = reginfo->is_utf8_pat;
3569
3570 #ifdef DEBUGGING
3571     GET_RE_DEBUG_FLAGS_DECL;
3572 #endif
3573
3574     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3575     multicall_oldcatch = 0;
3576     multicall_cv = NULL;
3577     cx = NULL;
3578     PERL_UNUSED_VAR(multicall_cop);
3579     PERL_UNUSED_VAR(newsp);
3580
3581
3582     PERL_ARGS_ASSERT_REGMATCH;
3583
3584     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3585             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3586     }));
3587
3588     /* grab next free state slot */
3589     st = ++PL_regmatch_state;
3590     if (st >  SLAB_LAST(PL_regmatch_slab))
3591         st = PL_regmatch_state = S_push_slab(aTHX);
3592
3593     /* Note that nextchr is a byte even in UTF */
3594     SET_nextchr;
3595     scan = prog;
3596     while (scan != NULL) {
3597
3598         DEBUG_EXECUTE_r( {
3599             SV * const prop = sv_newmortal();
3600             regnode *rnext=regnext(scan);
3601             DUMP_EXEC_POS( locinput, scan, utf8_target );
3602             regprop(rex, prop, scan);
3603
3604             PerlIO_printf(Perl_debug_log,
3605                     "%3"IVdf":%*s%s(%"IVdf")\n",
3606                     (IV)(scan - rexi->program), depth*2, "",
3607                     SvPVX_const(prop),
3608                     (PL_regkind[OP(scan)] == END || !rnext) ?
3609                         0 : (IV)(rnext - rexi->program));
3610         });
3611
3612         next = scan + NEXT_OFF(scan);
3613         if (next == scan)
3614             next = NULL;
3615         state_num = OP(scan);
3616
3617       reenter_switch:
3618         to_complement = 0;
3619
3620         SET_nextchr;
3621         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3622
3623         switch (state_num) {
3624         case BOL: /*  /^../  */
3625             if (locinput == reginfo->strbeg)
3626                 break;
3627             sayNO;
3628
3629         case MBOL: /*  /^../m  */
3630             if (locinput == reginfo->strbeg ||
3631                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3632             {
3633                 break;
3634             }
3635             sayNO;
3636
3637         case SBOL: /*  /^../s  */
3638             if (locinput == reginfo->strbeg)
3639                 break;
3640             sayNO;
3641
3642         case GPOS: /*  \G  */
3643             if (locinput == reginfo->ganch)
3644                 break;
3645             sayNO;
3646
3647         case KEEPS: /*   \K  */
3648             /* update the startpoint */
3649             st->u.keeper.val = rex->offs[0].start;
3650             rex->offs[0].start = locinput - reginfo->strbeg;
3651             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3652             assert(0); /*NOTREACHED*/
3653         case KEEPS_next_fail:
3654             /* rollback the start point change */
3655             rex->offs[0].start = st->u.keeper.val;
3656             sayNO_SILENT;
3657             assert(0); /*NOTREACHED*/
3658
3659         case EOL: /* /..$/  */
3660                 goto seol;
3661
3662         case MEOL: /* /..$/m  */
3663             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3664                 sayNO;
3665             break;
3666
3667         case SEOL: /* /..$/s  */
3668           seol:
3669             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3670                 sayNO;
3671             if (reginfo->strend - locinput > 1)
3672                 sayNO;
3673             break;
3674
3675         case EOS: /*  \z  */
3676             if (!NEXTCHR_IS_EOS)
3677                 sayNO;
3678             break;
3679
3680         case SANY: /*  /./s  */
3681             if (NEXTCHR_IS_EOS)
3682                 sayNO;
3683             goto increment_locinput;
3684
3685         case CANY: /*  \C  */
3686             if (NEXTCHR_IS_EOS)
3687                 sayNO;
3688             locinput++;
3689             break;
3690
3691         case REG_ANY: /*  /./  */
3692             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3693                 sayNO;
3694             goto increment_locinput;
3695
3696
3697 #undef  ST
3698 #define ST st->u.trie
3699         case TRIEC: /* (ab|cd) with known charclass */
3700             /* In this case the charclass data is available inline so
3701                we can fail fast without a lot of extra overhead.
3702              */
3703             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3704                 DEBUG_EXECUTE_r(
3705                     PerlIO_printf(Perl_debug_log,
3706                               "%*s  %sfailed to match trie start class...%s\n",
3707                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3708                 );
3709                 sayNO_SILENT;
3710                 assert(0); /* NOTREACHED */
3711             }
3712             /* FALL THROUGH */
3713         case TRIE:  /* (ab|cd)  */
3714             /* the basic plan of execution of the trie is:
3715              * At the beginning, run though all the states, and
3716              * find the longest-matching word. Also remember the position
3717              * of the shortest matching word. For example, this pattern:
3718              *    1  2 3 4    5
3719              *    ab|a|x|abcd|abc
3720              * when matched against the string "abcde", will generate
3721              * accept states for all words except 3, with the longest
3722              * matching word being 4, and the shortest being 2 (with
3723              * the position being after char 1 of the string).
3724              *
3725              * Then for each matching word, in word order (i.e. 1,2,4,5),
3726              * we run the remainder of the pattern; on each try setting
3727              * the current position to the character following the word,
3728              * returning to try the next word on failure.
3729              *
3730              * We avoid having to build a list of words at runtime by
3731              * using a compile-time structure, wordinfo[].prev, which
3732              * gives, for each word, the previous accepting word (if any).
3733              * In the case above it would contain the mappings 1->2, 2->0,
3734              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3735              * the longest word (4 above), a list of all words, by
3736              * following the list of prev pointers; this gives us the
3737              * unordered list 4,5,1,2. Then given the current word we have
3738              * just tried, we can go through the list and find the
3739              * next-biggest word to try (so if we just failed on word 2,
3740              * the next in the list is 4).
3741              *
3742              * Since at runtime we don't record the matching position in
3743              * the string for each word, we have to work that out for
3744              * each word we're about to process. The wordinfo table holds
3745              * the character length of each word; given that we recorded
3746              * at the start: the position of the shortest word and its
3747              * length in chars, we just need to move the pointer the
3748              * difference between the two char lengths. Depending on
3749              * Unicode status and folding, that's cheap or expensive.
3750              *
3751              * This algorithm is optimised for the case where are only a
3752              * small number of accept states, i.e. 0,1, or maybe 2.
3753              * With lots of accepts states, and having to try all of them,
3754              * it becomes quadratic on number of accept states to find all
3755              * the next words.
3756              */
3757
3758             {
3759                 /* what type of TRIE am I? (utf8 makes this contextual) */
3760                 DECL_TRIE_TYPE(scan);
3761
3762                 /* what trie are we using right now */
3763                 reg_trie_data * const trie
3764                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3765                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3766                 U32 state = trie->startstate;
3767
3768                 if (   trie->bitmap
3769                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3770                 {
3771                     if (trie->states[ state ].wordnum) {
3772                          DEBUG_EXECUTE_r(
3773                             PerlIO_printf(Perl_debug_log,
3774                                           "%*s  %smatched empty string...%s\n",
3775                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3776                         );
3777                         if (!trie->jump)
3778                             break;
3779                     } else {
3780                         DEBUG_EXECUTE_r(
3781                             PerlIO_printf(Perl_debug_log,
3782                                           "%*s  %sfailed to match trie start class...%s\n",
3783                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3784                         );
3785                         sayNO_SILENT;
3786                    }
3787                 }
3788
3789             {
3790                 U8 *uc = ( U8* )locinput;
3791
3792                 STRLEN len = 0;
3793                 STRLEN foldlen = 0;
3794                 U8 *uscan = (U8*)NULL;
3795                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3796                 U32 charcount = 0; /* how many input chars we have matched */
3797                 U32 accepted = 0; /* have we seen any accepting states? */
3798
3799                 ST.jump = trie->jump;
3800                 ST.me = scan;
3801                 ST.firstpos = NULL;
3802                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3803                 ST.nextword = 0;
3804
3805                 /* fully traverse the TRIE; note the position of the
3806                    shortest accept state and the wordnum of the longest
3807                    accept state */
3808
3809                 while ( state && uc <= (U8*)(reginfo->strend) ) {
3810                     U32 base = trie->states[ state ].trans.base;
3811                     UV uvc = 0;
3812                     U16 charid = 0;
3813                     U16 wordnum;
3814                     wordnum = trie->states[ state ].wordnum;
3815
3816                     if (wordnum) { /* it's an accept state */
3817                         if (!accepted) {
3818                             accepted = 1;
3819                             /* record first match position */
3820                             if (ST.longfold) {
3821                                 ST.firstpos = (U8*)locinput;
3822                                 ST.firstchars = 0;
3823                             }
3824                             else {
3825                                 ST.firstpos = uc;
3826                                 ST.firstchars = charcount;
3827                             }
3828                         }
3829                         if (!ST.nextword || wordnum < ST.nextword)
3830                             ST.nextword = wordnum;
3831                         ST.topword = wordnum;
3832                     }
3833
3834                     DEBUG_TRIE_EXECUTE_r({
3835                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3836                                 PerlIO_printf( Perl_debug_log,
3837                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3838                                     2+depth * 2, "", PL_colors[4],
3839                                     (UV)state, (accepted ? 'Y' : 'N'));
3840                     });
3841
3842                     /* read a char and goto next state */
3843                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3844                         I32 offset;
3845                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3846                                              uscan, len, uvc, charid, foldlen,
3847                                              foldbuf, uniflags);
3848                         charcount++;
3849                         if (foldlen>0)
3850                             ST.longfold = TRUE;
3851                         if (charid &&
3852                              ( ((offset =
3853                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3854
3855                              && ((U32)offset < trie->lasttrans)
3856                              && trie->trans[offset].check == state)
3857                         {
3858                             state = trie->trans[offset].next;
3859                         }
3860                         else {
3861                             state = 0;
3862                         }
3863                         uc += len;
3864
3865                     }
3866                     else {
3867                         state = 0;
3868                     }
3869                     DEBUG_TRIE_EXECUTE_r(
3870                         PerlIO_printf( Perl_debug_log,
3871                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3872                             charid, uvc, (UV)state, PL_colors[5] );
3873                     );
3874                 }
3875                 if (!accepted)
3876                    sayNO;
3877
3878                 /* calculate total number of accept states */
3879                 {
3880                     U16 w = ST.topword;
3881                     accepted = 0;
3882                     while (w) {
3883                         w = trie->wordinfo[w].prev;
3884                         accepted++;
3885                     }
3886                     ST.accepted = accepted;
3887                 }
3888
3889                 DEBUG_EXECUTE_r(
3890                     PerlIO_printf( Perl_debug_log,
3891                         "%*s  %sgot %"IVdf" possible matches%s\n",
3892                         REPORT_CODE_OFF + depth * 2, "",
3893                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3894                 );
3895                 goto trie_first_try; /* jump into the fail handler */
3896             }}
3897             assert(0); /* NOTREACHED */
3898
3899         case TRIE_next_fail: /* we failed - try next alternative */
3900         {
3901             U8 *uc;
3902             if ( ST.jump) {
3903                 REGCP_UNWIND(ST.cp);
3904                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3905             }
3906             if (!--ST.accepted) {
3907                 DEBUG_EXECUTE_r({
3908                     PerlIO_printf( Perl_debug_log,
3909                         "%*s  %sTRIE failed...%s\n",
3910                         REPORT_CODE_OFF+depth*2, "",
3911                         PL_colors[4],
3912                         PL_colors[5] );
3913                 });
3914                 sayNO_SILENT;
3915             }
3916             {
3917                 /* Find next-highest word to process.  Note that this code
3918                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3919                 U16 min = 0;
3920                 U16 word;
3921                 U16 const nextword = ST.nextword;
3922                 reg_trie_wordinfo * const wordinfo
3923                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3924                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3925                     if (word > nextword && (!min || word < min))
3926                         min = word;
3927                 }
3928                 ST.nextword = min;
3929             }
3930
3931           trie_first_try:
3932             if (do_cutgroup) {
3933                 do_cutgroup = 0;
3934                 no_final = 0;
3935             }
3936
3937             if ( ST.jump) {
3938                 ST.lastparen = rex->lastparen;
3939                 ST.lastcloseparen = rex->lastcloseparen;
3940                 REGCP_SET(ST.cp);
3941             }
3942
3943             /* find start char of end of current word */
3944             {
3945                 U32 chars; /* how many chars to skip */
3946                 reg_trie_data * const trie
3947                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3948
3949                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3950                             >=  ST.firstchars);
3951                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3952                             - ST.firstchars;
3953                 uc = ST.firstpos;
3954
3955                 if (ST.longfold) {
3956                     /* the hard option - fold each char in turn and find
3957                      * its folded length (which may be different */
3958                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3959                     STRLEN foldlen;
3960                     STRLEN len;
3961                     UV uvc;
3962                     U8 *uscan;
3963
3964                     while (chars) {
3965                         if (utf8_target) {
3966                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3967                                                     uniflags);
3968                             uc += len;
3969                         }
3970                         else {
3971                             uvc = *uc;
3972                             uc++;
3973                         }
3974                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3975                         uscan = foldbuf;
3976                         while (foldlen) {
3977                             if (!--chars)
3978                                 break;
3979                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3980                                             uniflags);
3981                             uscan += len;
3982                             foldlen -= len;
3983                         }
3984                     }
3985                 }
3986                 else {
3987                     if (utf8_target)
3988                         while (chars--)
3989                             uc += UTF8SKIP(uc);
3990                     else
3991                         uc += chars;
3992                 }
3993             }
3994
3995             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3996                             ? ST.jump[ST.nextword]
3997                             : NEXT_OFF(ST.me));
3998
3999             DEBUG_EXECUTE_r({
4000                 PerlIO_printf( Perl_debug_log,
4001                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4002                     REPORT_CODE_OFF+depth*2, "",
4003                     PL_colors[4],
4004                     ST.nextword,
4005                     PL_colors[5]
4006                     );
4007             });
4008
4009             if (ST.accepted > 1 || has_cutgroup) {
4010                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4011                 assert(0); /* NOTREACHED */
4012             }
4013             /* only one choice left - just continue */
4014             DEBUG_EXECUTE_r({
4015                 AV *const trie_words
4016                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4017                 SV ** const tmp = av_fetch( trie_words,
4018                     ST.nextword-1, 0 );
4019                 SV *sv= tmp ? sv_newmortal() : NULL;
4020
4021                 PerlIO_printf( Perl_debug_log,
4022                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4023                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4024                     ST.nextword,
4025                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4026                             PL_colors[0], PL_colors[1],
4027                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4028                         )
4029                     : "not compiled under -Dr",
4030                     PL_colors[5] );
4031             });
4032
4033             locinput = (char*)uc;
4034             continue; /* execute rest of RE */
4035             assert(0); /* NOTREACHED */
4036         }
4037 #undef  ST
4038
4039         case EXACT: {            /*  /abc/        */
4040             char *s = STRING(scan);
4041             ln = STR_LEN(scan);
4042             if (utf8_target != is_utf8_pat) {
4043                 /* The target and the pattern have differing utf8ness. */
4044                 char *l = locinput;
4045                 const char * const e = s + ln;
4046
4047                 if (utf8_target) {
4048                     /* The target is utf8, the pattern is not utf8.
4049                      * Above-Latin1 code points can't match the pattern;
4050                      * invariants match exactly, and the other Latin1 ones need
4051                      * to be downgraded to a single byte in order to do the
4052                      * comparison.  (If we could be confident that the target
4053                      * is not malformed, this could be refactored to have fewer
4054                      * tests by just assuming that if the first bytes match, it
4055                      * is an invariant, but there are tests in the test suite
4056                      * dealing with (??{...}) which violate this) */
4057                     while (s < e) {
4058                         if (l >= reginfo->strend
4059                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4060                         {
4061                             sayNO;
4062                         }
4063                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4064                             if (*l != *s) {
4065                                 sayNO;
4066                             }
4067                             l++;
4068                         }
4069                         else {
4070                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4071                                 sayNO;
4072                             }
4073                             l += 2;
4074                         }
4075                         s++;
4076                     }
4077                 }
4078                 else {
4079                     /* The target is not utf8, the pattern is utf8. */
4080                     while (s < e) {
4081                         if (l >= reginfo->strend
4082                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4083                         {
4084                             sayNO;
4085                         }
4086                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4087                             if (*s != *l) {
4088                                 sayNO;
4089                             }
4090                             s++;
4091                         }
4092                         else {
4093                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4094                                 sayNO;
4095                             }
4096                             s += 2;
4097                         }
4098                         l++;
4099                     }
4100                 }
4101                 locinput = l;
4102             }
4103             else {
4104                 /* The target and the pattern have the same utf8ness. */
4105                 /* Inline the first character, for speed. */
4106                 if (reginfo->strend - locinput < ln
4107                     || UCHARAT(s) != nextchr
4108                     || (ln > 1 && memNE(s, locinput, ln)))
4109                 {
4110                     sayNO;
4111                 }
4112                 locinput += ln;
4113             }
4114             break;
4115             }
4116
4117         case EXACTFL: {          /*  /abc/il      */
4118             re_fold_t folder;
4119             const U8 * fold_array;
4120             const char * s;
4121             U32 fold_utf8_flags;
4122
4123             RX_MATCH_TAINTED_on(reginfo->prog);
4124             folder = foldEQ_locale;
4125             fold_array = PL_fold_locale;
4126             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4127             goto do_exactf;
4128
4129         case EXACTFU_SS:         /*  /\x{df}/iu   */
4130         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4131         case EXACTFU:            /*  /abc/iu      */
4132             folder = foldEQ_latin1;
4133             fold_array = PL_fold_latin1;
4134             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4135             goto do_exactf;
4136
4137         case EXACTFA:            /*  /abc/iaa     */
4138             folder = foldEQ_latin1;
4139             fold_array = PL_fold_latin1;
4140             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4141             goto do_exactf;
4142
4143         case EXACTF:             /*  /abc/i       */
4144             folder = foldEQ;
4145             fold_array = PL_fold;
4146             fold_utf8_flags = 0;
4147
4148           do_exactf:
4149             s = STRING(scan);
4150             ln = STR_LEN(scan);
4151
4152             if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4153               /* Either target or the pattern are utf8, or has the issue where
4154                * the fold lengths may differ. */
4155                 const char * const l = locinput;
4156                 char *e = reginfo->strend;
4157
4158                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4159                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4160                 {
4161                     sayNO;
4162                 }
4163                 locinput = e;
4164                 break;
4165             }
4166
4167             /* Neither the target nor the pattern are utf8 */
4168             if (UCHARAT(s) != nextchr
4169                 && !NEXTCHR_IS_EOS
4170                 && UCHARAT(s) != fold_array[nextchr])
4171             {
4172                 sayNO;
4173             }
4174             if (reginfo->strend - locinput < ln)
4175                 sayNO;
4176             if (ln > 1 && ! folder(s, locinput, ln))
4177                 sayNO;
4178             locinput += ln;
4179             break;
4180         }
4181
4182         /* XXX Could improve efficiency by separating these all out using a
4183          * macro or in-line function.  At that point regcomp.c would no longer
4184          * have to set the FLAGS fields of these */
4185         case BOUNDL:  /*  /\b/l  */
4186         case NBOUNDL: /*  /\B/l  */
4187             RX_MATCH_TAINTED_on(reginfo->prog);
4188             /* FALL THROUGH */
4189         case BOUND:   /*  /\b/   */
4190         case BOUNDU:  /*  /\b/u  */
4191         case BOUNDA:  /*  /\b/a  */
4192         case NBOUND:  /*  /\B/   */
4193         case NBOUNDU: /*  /\B/u  */
4194         case NBOUNDA: /*  /\B/a  */
4195             /* was last char in word? */
4196             if (utf8_target
4197                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4198                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4199             {
4200                 if (locinput == reginfo->strbeg)
4201                     ln = '\n';
4202                 else {
4203                     const U8 * const r =
4204                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4205
4206                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4207                 }
4208                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4209                     ln = isWORDCHAR_uni(ln);
4210                     if (NEXTCHR_IS_EOS)
4211                         n = 0;
4212                     else {
4213                         LOAD_UTF8_CHARCLASS_ALNUM();
4214                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4215                                                                 utf8_target);
4216                     }
4217                 }
4218                 else {
4219                     ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4220                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4221                 }
4222             }
4223             else {
4224
4225                 /* Here the string isn't utf8, or is utf8 and only ascii
4226                  * characters are to match \w.  In the latter case looking at
4227                  * the byte just prior to the current one may be just the final
4228                  * byte of a multi-byte character.  This is ok.  There are two
4229                  * cases:
4230                  * 1) it is a single byte character, and then the test is doing
4231                  *      just what it's supposed to.
4232                  * 2) it is a multi-byte character, in which case the final
4233                  *      byte is never mistakable for ASCII, and so the test
4234                  *      will say it is not a word character, which is the
4235                  *      correct answer. */
4236                 ln = (locinput != reginfo->strbeg) ?
4237                     UCHARAT(locinput - 1) : '\n';
4238                 switch (FLAGS(scan)) {
4239                     case REGEX_UNICODE_CHARSET:
4240                         ln = isWORDCHAR_L1(ln);
4241                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4242                         break;
4243                     case REGEX_LOCALE_CHARSET:
4244                         ln = isWORDCHAR_LC(ln);
4245                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4246                         break;
4247                     case REGEX_DEPENDS_CHARSET:
4248                         ln = isWORDCHAR(ln);
4249                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4250                         break;
4251                     case REGEX_ASCII_RESTRICTED_CHARSET:
4252                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4253                         ln = isWORDCHAR_A(ln);
4254                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4255                         break;
4256                     default:
4257                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4258                         break;
4259                 }
4260             }
4261             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4262              * regcomp.sym */
4263             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4264                     sayNO;
4265             break;
4266
4267         case ANYOF:  /*  /[abc]/       */
4268         case ANYOF_WARN_SUPER:
4269             if (NEXTCHR_IS_EOS)
4270                 sayNO;
4271             if (utf8_target) {
4272                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4273                     sayNO;
4274                 locinput += UTF8SKIP(locinput);
4275             }
4276             else {
4277                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4278                     sayNO;
4279                 locinput++;
4280             }
4281             break;
4282
4283         /* The argument (FLAGS) to all the POSIX node types is the class number
4284          * */
4285
4286         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4287             to_complement = 1;
4288             /* FALLTHROUGH */
4289
4290         case POSIXL:    /* \w or [:punct:] etc. under /l */
4291             if (NEXTCHR_IS_EOS)
4292                 sayNO;
4293
4294             /* The locale hasn't influenced the outcome before this, so defer
4295              * tainting until now */
4296             RX_MATCH_TAINTED_on(reginfo->prog);
4297
4298             /* Use isFOO_lc() for characters within Latin1.  (Note that
4299              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4300              * wouldn't be invariant) */
4301             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4302                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4303                     sayNO;
4304                 }
4305             }
4306             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4307                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4308                                         (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4309                                                             *(locinput + 1))))))
4310                 {
4311                     sayNO;
4312                 }
4313             }
4314             else { /* Here, must be an above Latin-1 code point */
4315                 goto utf8_posix_not_eos;
4316             }
4317
4318             /* Here, must be utf8 */
4319             locinput += UTF8SKIP(locinput);
4320             break;
4321
4322         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4323             to_complement = 1;
4324             /* FALLTHROUGH */
4325
4326         case POSIXD:    /* \w or [:punct:] etc. under /d */
4327             if (utf8_target) {
4328                 goto utf8_posix;
4329             }
4330             goto posixa;
4331
4332         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4333
4334             if (NEXTCHR_IS_EOS) {
4335                 sayNO;
4336             }
4337
4338             /* All UTF-8 variants match */
4339             if (! UTF8_IS_INVARIANT(nextchr)) {
4340                 goto increment_locinput;
4341             }
4342
4343             to_complement = 1;
4344             /* FALLTHROUGH */
4345
4346         case POSIXA:    /* \w or [:punct:] etc. under /a */
4347
4348           posixa:
4349             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4350              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4351              * character is a single byte */
4352
4353             if (NEXTCHR_IS_EOS
4354                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4355                                                             FLAGS(scan)))))
4356             {
4357                 sayNO;
4358             }
4359
4360             /* Here we are either not in utf8, or we matched a utf8-invariant,
4361              * so the next char is the next byte */
4362             locinput++;
4363             break;
4364
4365         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4366             to_complement = 1;
4367             /* FALLTHROUGH */
4368
4369         case POSIXU:    /* \w or [:punct:] etc. under /u */
4370           utf8_posix:
4371             if (NEXTCHR_IS_EOS) {
4372                 sayNO;
4373             }
4374           utf8_posix_not_eos:
4375
4376             /* Use _generic_isCC() for characters within Latin1.  (Note that
4377              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4378              * wouldn't be invariant) */
4379             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4380                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4381                                                            FLAGS(scan)))))
4382                 {
4383                     sayNO;
4384                 }
4385                 locinput++;
4386             }
4387             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4388                 if (! (to_complement
4389                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4390                                                                *(locinput + 1)),
4391                                               FLAGS(scan)))))
4392                 {
4393                     sayNO;
4394                 }
4395                 locinput += 2;
4396             }
4397             else {  /* Handle above Latin-1 code points */
4398                 classnum = (_char_class_number) FLAGS(scan);
4399                 if (classnum < _FIRST_NON_SWASH_CC) {
4400
4401                     /* Here, uses a swash to find such code points.  Load if if
4402                      * not done already */
4403                     if (! PL_utf8_swash_ptrs[classnum]) {
4404                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4405                         PL_utf8_swash_ptrs[classnum]
4406                                 = _core_swash_init("utf8",
4407                                         swash_property_names[classnum],
4408                                         &PL_sv_undef, 1, 0, NULL, &flags);
4409                     }
4410                     if (! (to_complement
4411                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4412                                                (U8 *) locinput, TRUE))))
4413                     {
4414                         sayNO;
4415                     }
4416                 }
4417                 else {  /* Here, uses macros to find above Latin-1 code points */
4418                     switch (classnum) {
4419                         case _CC_ENUM_SPACE:    /* XXX would require separate
4420                                                    code if we revert the change
4421                                                    of \v matching this */
4422                         case _CC_ENUM_PSXSPC:
4423                             if (! (to_complement
4424                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4425                             {
4426                                 sayNO;
4427                             }
4428                             break;
4429                         case _CC_ENUM_BLANK:
4430                             if (! (to_complement
4431                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4432                             {
4433                                 sayNO;
4434                             }
4435                             break;
4436                         case _CC_ENUM_XDIGIT:
4437                             if (! (to_complement
4438                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4439                             {
4440                                 sayNO;
4441                             }
4442                             break;
4443                         case _CC_ENUM_VERTSPACE:
4444                             if (! (to_complement
4445                                             ^ cBOOL(is_VERTWS_high(locinput))))
4446                             {
4447                                 sayNO;
4448                             }
4449                             break;
4450                         default:    /* The rest, e.g. [:cntrl:], can't match
4451                                        above Latin1 */
4452                             if (! to_complement) {
4453                                 sayNO;
4454                             }
4455                             break;
4456                     }
4457                 }
4458                 locinput += UTF8SKIP(locinput);
4459             }
4460             break;
4461
4462         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4463                        a Unicode extended Grapheme Cluster */
4464             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4465               extended Grapheme Cluster is:
4466
4467             CR LF
4468             | Prepend* Begin Extend*
4469             | .
4470
4471             Begin is:           ( Special_Begin | ! Control )
4472             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4473             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4474             Control is:         [ GCB_Control | CR | LF ]
4475             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4476
4477                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4478                we can rewrite
4479
4480                    Begin is ( Regular_Begin + Special Begin )
4481
4482                It turns out that 98.4% of all Unicode code points match
4483                Regular_Begin.  Doing it this way eliminates a table match in
4484                the previous implementation for almost all Unicode code points.
4485
4486                There is a subtlety with Prepend* which showed up in testing.
4487                Note that the Begin, and only the Begin is required in:
4488                 | Prepend* Begin Extend*
4489                Also, Begin contains '! Control'.  A Prepend must be a
4490                '!  Control', which means it must also be a Begin.  What it
4491                comes down to is that if we match Prepend* and then find no
4492                suitable Begin afterwards, that if we backtrack the last
4493                Prepend, that one will be a suitable Begin.
4494             */
4495
4496             if (NEXTCHR_IS_EOS)
4497                 sayNO;
4498             if  (! utf8_target) {
4499
4500                 /* Match either CR LF  or '.', as all the other possibilities
4501                  * require utf8 */
4502                 locinput++;         /* Match the . or CR */
4503                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4504                                        match the LF */
4505                     && locinput < reginfo->strend
4506                     && UCHARAT(locinput) == '\n')
4507                 {
4508                     locinput++;
4509                 }
4510             }
4511             else {
4512
4513                 /* Utf8: See if is ( CR LF ); already know that locinput <
4514                  * reginfo->strend, so locinput+1 is in bounds */
4515                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4516                      && UCHARAT(locinput + 1) == '\n')
4517                 {
4518                     locinput += 2;
4519                 }
4520                 else {
4521                     STRLEN len;
4522
4523                     /* In case have to backtrack to beginning, then match '.' */
4524                     char *starting = locinput;
4525
4526                     /* In case have to backtrack the last prepend */
4527                     char *previous_prepend = NULL;
4528
4529                     LOAD_UTF8_CHARCLASS_GCB();
4530
4531                     /* Match (prepend)*   */
4532                     while (locinput < reginfo->strend
4533                            && (len = is_GCB_Prepend_utf8(locinput)))
4534                     {
4535                         previous_prepend = locinput;
4536                         locinput += len;
4537                     }
4538
4539                     /* As noted above, if we matched a prepend character, but
4540                      * the next thing won't match, back off the last prepend we
4541                      * matched, as it is guaranteed to match the begin */
4542                     if (previous_prepend
4543                         && (locinput >=  reginfo->strend
4544                             || (! swash_fetch(PL_utf8_X_regular_begin,
4545                                              (U8*)locinput, utf8_target)
4546                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4547                         )
4548                     {
4549                         locinput = previous_prepend;
4550                     }
4551
4552                     /* Note that here we know reginfo->strend > locinput, as we
4553                      * tested that upon input to this switch case, and if we
4554                      * moved locinput forward, we tested the result just above
4555                      * and it either passed, or we backed off so that it will
4556                      * now pass */
4557                     if (swash_fetch(PL_utf8_X_regular_begin,
4558                                     (U8*)locinput, utf8_target)) {
4559                         locinput += UTF8SKIP(locinput);
4560                     }
4561                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4562
4563                         /* Here did not match the required 'Begin' in the
4564                          * second term.  So just match the very first
4565                          * character, the '.' of the final term of the regex */
4566                         locinput = starting + UTF8SKIP(starting);
4567                         goto exit_utf8;
4568                     } else {
4569
4570                         /* Here is a special begin.  It can be composed of
4571                          * several individual characters.  One possibility is
4572                          * RI+ */
4573                         if ((len = is_GCB_RI_utf8(locinput))) {
4574                             locinput += len;
4575                             while (locinput < reginfo->strend
4576                                    && (len = is_GCB_RI_utf8(locinput)))
4577                             {
4578                                 locinput += len;
4579                             }
4580                         } else if ((len = is_GCB_T_utf8(locinput))) {
4581                             /* Another possibility is T+ */
4582                             locinput += len;
4583                             while (locinput < reginfo->strend
4584                                 && (len = is_GCB_T_utf8(locinput)))
4585                             {
4586                                 locinput += len;
4587                             }
4588                         } else {
4589
4590                             /* Here, neither RI+ nor T+; must be some other
4591                              * Hangul.  That means it is one of the others: L,
4592                              * LV, LVT or V, and matches:
4593                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4594
4595                             /* Match L*           */
4596                             while (locinput < reginfo->strend
4597                                    && (len = is_GCB_L_utf8(locinput)))
4598                             {
4599                                 locinput += len;
4600                             }
4601
4602                             /* Here, have exhausted L*.  If the next character
4603                              * is not an LV, LVT nor V, it means we had to have
4604                              * at least one L, so matches L+ in the original
4605                              * equation, we have a complete hangul syllable.
4606                              * Are done. */
4607
4608                             if (locinput < reginfo->strend
4609                                 && is_GCB_LV_LVT_V_utf8(locinput))
4610                             {
4611                                 /* Otherwise keep going.  Must be LV, LVT or V.
4612                                  * See if LVT, by first ruling out V, then LV */
4613                                 if (! is_GCB_V_utf8(locinput)
4614                                         /* All but every TCount one is LV */
4615                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4616                                                                          NULL)
4617                                                                         - SBASE)
4618                                         % TCount != 0)
4619                                 {
4620                                     locinput += UTF8SKIP(locinput);
4621                                 } else {
4622
4623                                     /* Must be  V or LV.  Take it, then match
4624                                      * V*     */
4625                                     locinput += UTF8SKIP(locinput);
4626                                     while (locinput < reginfo->strend
4627                                            && (len = is_GCB_V_utf8(locinput)))
4628                                     {
4629                                         locinput += len;
4630                                     }
4631                                 }
4632
4633                                 /* And any of LV, LVT, or V can be followed
4634                                  * by T*            */
4635                                 while (locinput < reginfo->strend
4636                                        && (len = is_GCB_T_utf8(locinput)))
4637                                 {
4638                                     locinput += len;
4639                                 }
4640                             }
4641                         }
4642                     }
4643
4644                     /* Match any extender */
4645                     while (locinput < reginfo->strend
4646                             && swash_fetch(PL_utf8_X_extend,
4647                                             (U8*)locinput, utf8_target))
4648                     {
4649                         locinput += UTF8SKIP(locinput);
4650                     }
4651                 }
4652             exit_utf8:
4653                 if (locinput > reginfo->strend) sayNO;
4654             }
4655             break;
4656
4657         case NREFFL:  /*  /\g{name}/il  */
4658         {   /* The capture buffer cases.  The ones beginning with N for the
4659                named buffers just convert to the equivalent numbered and
4660                pretend they were called as the corresponding numbered buffer
4661                op.  */
4662             /* don't initialize these in the declaration, it makes C++
4663                unhappy */
4664             const char *s;
4665             char type;
4666             re_fold_t folder;
4667             const U8 *fold_array;
4668             UV utf8_fold_flags;
4669
4670             RX_MATCH_TAINTED_on(reginfo->prog);
4671             folder = foldEQ_locale;
4672             fold_array = PL_fold_locale;
4673             type = REFFL;
4674             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4675             goto do_nref;
4676
4677         case NREFFA:  /*  /\g{name}/iaa  */
4678             folder = foldEQ_latin1;
4679             fold_array = PL_fold_latin1;
4680             type = REFFA;
4681             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4682             goto do_nref;
4683
4684         case NREFFU:  /*  /\g{name}/iu  */
4685             folder = foldEQ_latin1;
4686             fold_array = PL_fold_latin1;
4687             type = REFFU;
4688             utf8_fold_flags = 0;
4689             goto do_nref;
4690
4691         case NREFF:  /*  /\g{name}/i  */
4692             folder = foldEQ;
4693             fold_array = PL_fold;
4694             type = REFF;
4695             utf8_fold_flags = 0;
4696             goto do_nref;
4697
4698         case NREF:  /*  /\g{name}/   */
4699             type = REF;
4700             folder = NULL;
4701             fold_array = NULL;
4702             utf8_fold_flags = 0;
4703           do_nref:
4704
4705             /* For the named back references, find the corresponding buffer
4706              * number */
4707             n = reg_check_named_buff_matched(rex,scan);
4708
4709             if ( ! n ) {
4710                 sayNO;
4711             }
4712             goto do_nref_ref_common;
4713
4714         case REFFL:  /*  /\1/il  */
4715             RX_MATCH_TAINTED_on(reginfo->prog);
4716             folder = foldEQ_locale;
4717             fold_array = PL_fold_locale;
4718             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4719             goto do_ref;
4720
4721         case REFFA:  /*  /\1/iaa  */
4722             folder = foldEQ_latin1;
4723             fold_array = PL_fold_latin1;
4724             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4725             goto do_ref;
4726
4727         case REFFU:  /*  /\1/iu  */
4728             folder = foldEQ_latin1;
4729             fold_array = PL_fold_latin1;
4730             utf8_fold_flags = 0;
4731             goto do_ref;
4732
4733         case REFF:  /*  /\1/i  */
4734             folder = foldEQ;
4735             fold_array = PL_fold;
4736             utf8_fold_flags = 0;
4737             goto do_ref;
4738
4739         case REF:  /*  /\1/    */
4740             folder = NULL;
4741             fold_array = NULL;
4742             utf8_fold_flags = 0;
4743
4744           do_ref:
4745             type = OP(scan);
4746             n = ARG(scan);  /* which paren pair */
4747
4748           do_nref_ref_common:
4749             ln = rex->offs[n].start;
4750             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4751             if (rex->lastparen < n || ln == -1)
4752                 sayNO;                  /* Do not match unless seen CLOSEn. */
4753             if (ln == rex->offs[n].end)
4754                 break;
4755
4756             s = reginfo->strbeg + ln;
4757             if (type != REF     /* REF can do byte comparison */
4758                 && (utf8_target || type == REFFU))
4759             { /* XXX handle REFFL better */
4760                 char * limit = reginfo->strend;
4761
4762                 /* This call case insensitively compares the entire buffer
4763                     * at s, with the current input starting at locinput, but
4764                     * not going off the end given by reginfo->strend, and
4765                     * returns in <limit> upon success, how much of the
4766                     * current input was matched */
4767                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4768                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4769                 {
4770                     sayNO;
4771                 }
4772                 locinput = limit;
4773                 break;
4774             }
4775
4776             /* Not utf8:  Inline the first character, for speed. */
4777             if (!NEXTCHR_IS_EOS &&
4778                 UCHARAT(s) != nextchr &&
4779                 (type == REF ||
4780                  UCHARAT(s) != fold_array[nextchr]))
4781                 sayNO;
4782             ln = rex->offs[n].end - ln;
4783             if (locinput + ln > reginfo->strend)
4784                 sayNO;
4785             if (ln > 1 && (type == REF
4786                            ? memNE(s, locinput, ln)
4787                            : ! folder(s, locinput, ln)))
4788                 sayNO;
4789             locinput += ln;
4790             break;
4791         }
4792
4793         case NOTHING: /* null op; e.g. the 'nothing' following
4794                        * the '*' in m{(a+|b)*}' */
4795             break;
4796         case TAIL: /* placeholder while compiling (A|B|C) */
4797             break;
4798
4799         case BACK: /* ??? doesn't appear to be used ??? */
4800             break;
4801
4802 #undef  ST
4803 #define ST st->u.eval
4804         {
4805             SV *ret;
4806             REGEXP *re_sv;
4807             regexp *re;
4808             regexp_internal *rei;
4809             regnode *startpoint;
4810
4811         case GOSTART: /*  (?R)  */
4812         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4813             if (cur_eval && cur_eval->locinput==locinput) {
4814                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4815                     Perl_croak(aTHX_ "Infinite recursion in regex");
4816                 if ( ++nochange_depth > max_nochange_depth )
4817                     Perl_croak(aTHX_
4818                         "Pattern subroutine nesting without pos change"
4819                         " exceeded limit in regex");
4820             } else {
4821                 nochange_depth = 0;
4822             }
4823             re_sv = rex_sv;
4824             re = rex;
4825             rei = rexi;
4826             if (OP(scan)==GOSUB) {
4827                 startpoint = scan + ARG2L(scan);
4828                 ST.close_paren = ARG(scan);
4829             } else {
4830                 startpoint = rei->program+1;
4831                 ST.close_paren = 0;
4832             }
4833             goto eval_recurse_doit;
4834             assert(0); /* NOTREACHED */
4835
4836         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4837             if (cur_eval && cur_eval->locinput==locinput) {
4838                 if ( ++nochange_depth > max_nochange_depth )
4839                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4840             } else {
4841                 nochange_depth = 0;
4842             }
4843             {
4844                 /* execute the code in the {...} */
4845
4846                 dSP;
4847                 IV before;
4848                 OP * const oop = PL_op;
4849                 COP * const ocurcop = PL_curcop;
4850                 OP *nop;
4851                 struct re_save_state saved_state;
4852                 CV *newcv;
4853
4854                 /* save *all* paren positions */
4855                 regcppush(rex, 0, maxopenparen);
4856                 REGCP_SET(runops_cp);
4857
4858                 /* To not corrupt the existing regex state while executing the
4859                  * eval we would normally put it on the save stack, like with
4860                  * save_re_context. However, re-evals have a weird scoping so we
4861                  * can't just add ENTER/LEAVE here. With that, things like
4862                  *
4863                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4864                  *
4865                  * would break, as they expect the localisation to be unwound
4866                  * only when the re-engine backtracks through the bit that
4867                  * localised it.
4868                  *
4869                  * What we do instead is just saving the state in a local c
4870                  * variable.
4871                  */
4872                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4873
4874                 if (!caller_cv)
4875                     caller_cv = find_runcv(NULL);
4876
4877                 n = ARG(scan);
4878
4879                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4880                     newcv = (ReANY(
4881                                                 (REGEXP*)(rexi->data->data[n])
4882                                             ))->qr_anoncv
4883                                         ;
4884                     nop = (OP*)rexi->data->data[n+1];
4885                 }
4886                 else if (rexi->data->what[n] == 'l') { /* literal code */
4887                     newcv = caller_cv;
4888                     nop = (OP*)rexi->data->data[n];
4889                     assert(CvDEPTH(newcv));
4890                 }
4891                 else {
4892                     /* literal with own CV */
4893                     assert(rexi->data->what[n] == 'L');
4894                     newcv = rex->qr_anoncv;
4895                     nop = (OP*)rexi->data->data[n];
4896                 }
4897
4898                 /* normally if we're about to execute code from the same
4899                  * CV that we used previously, we just use the existing
4900                  * CX stack entry. However, its possible that in the
4901                  * meantime we may have backtracked, popped from the save
4902                  * stack, and undone the SAVECOMPPAD(s) associated with
4903                  * PUSH_MULTICALL; in which case PL_comppad no longer
4904                  * points to newcv's pad. */
4905                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4906                 {
4907                     U8 flags = (CXp_SUB_RE |
4908                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4909                     if (last_pushed_cv) {
4910                         CHANGE_MULTICALL_FLAGS(newcv, flags);
4911                     }
4912                     else {
4913                         PUSH_MULTICALL_FLAGS(newcv, flags);
4914                     }
4915                     last_pushed_cv = newcv;
4916                 }
4917                 else {
4918                     /* these assignments are just to silence compiler
4919                      * warnings */
4920                     multicall_cop = NULL;
4921                     newsp = NULL;
4922                 }
4923                 last_pad = PL_comppad;
4924
4925                 /* the initial nextstate you would normally execute
4926                  * at the start of an eval (which would cause error
4927                  * messages to come from the eval), may be optimised
4928                  * away from the execution path in the regex code blocks;
4929                  * so manually set PL_curcop to it initially */
4930                 {
4931                     OP *o = cUNOPx(nop)->op_first;
4932                     assert(o->op_type == OP_NULL);
4933                     if (o->op_targ == OP_SCOPE) {
4934                         o = cUNOPo->op_first;
4935                     }
4936                     else {
4937                         assert(o->op_targ == OP_LEAVE);
4938                         o = cUNOPo->op_first;
4939                         assert(o->op_type == OP_ENTER);
4940                         o = o->op_sibling;
4941                     }
4942
4943                     if (o->op_type != OP_STUB) {
4944                         assert(    o->op_type == OP_NEXTSTATE
4945                                 || o->op_type == OP_DBSTATE
4946                                 || (o->op_type == OP_NULL
4947                                     &&  (  o->op_targ == OP_NEXTSTATE
4948                                         || o->op_targ == OP_DBSTATE
4949                                         )
4950                                     )
4951                         );
4952                         PL_curcop = (COP*)o;
4953                     }
4954                 }
4955                 nop = nop->op_next;
4956
4957                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4958                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4959
4960                 rex->offs[0].end = locinput - reginfo->strbeg;
4961                 if (reginfo->info_aux_eval->pos_magic)
4962                         reginfo->info_aux_eval->pos_magic->mg_len
4963                                         = locinput - reginfo->strbeg;
4964
4965                 if (sv_yes_mark) {
4966                     SV *sv_mrk = get_sv("REGMARK", 1);
4967                     sv_setsv(sv_mrk, sv_yes_mark);
4968                 }
4969
4970                 /* we don't use MULTICALL here as we want to call the
4971                  * first op of the block of interest, rather than the
4972                  * first op of the sub */
4973                 before = (IV)(SP-PL_stack_base);
4974                 PL_op = nop;
4975                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4976                 SPAGAIN;
4977                 if ((IV)(SP-PL_stack_base) == before)
4978                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4979                 else {
4980                     ret = POPs;
4981                     PUTBACK;
4982                 }
4983
4984                 /* before restoring everything, evaluate the returned
4985                  * value, so that 'uninit' warnings don't use the wrong
4986                  * PL_op or pad. Also need to process any magic vars
4987                  * (e.g. $1) *before* parentheses are restored */
4988
4989                 PL_op = NULL;
4990
4991                 re_sv = NULL;
4992                 if (logical == 0)        /*   (?{})/   */
4993                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4994                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4995                     sw = cBOOL(SvTRUE(ret));
4996                     logical = 0;
4997                 }
4998                 else {                   /*  /(??{})  */
4999                     /*  if its overloaded, let the regex compiler handle
5000                      *  it; otherwise extract regex, or stringify  */
5001                     if (!SvAMAGIC(ret)) {
5002                         SV *sv = ret;
5003                         if (SvROK(sv))
5004                             sv = SvRV(sv);
5005                         if (SvTYPE(sv) == SVt_REGEXP)
5006                             re_sv = (REGEXP*) sv;
5007                         else if (SvSMAGICAL(sv)) {
5008                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5009                             if (mg)
5010                                 re_sv = (REGEXP *) mg->mg_obj;
5011                         }
5012
5013                         /* force any magic, undef warnings here */
5014                         if (!re_sv) {
5015                             ret = sv_mortalcopy(ret);
5016                             (void) SvPV_force_nolen(ret);
5017                         }
5018                     }
5019
5020                 }
5021
5022                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5023
5024                 /* *** Note that at this point we don't restore
5025                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5026                  * be used again soon. This is safe as long as nothing
5027                  * in the regexp code uses the pad ! */
5028                 PL_op = oop;
5029                 PL_curcop = ocurcop;
5030                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5031
5032                 if (logical != 2)
5033                     break;
5034             }
5035
5036                 /* only /(??{})/  from now on */
5037                 logical = 0;
5038                 {
5039                     /* extract RE object from returned value; compiling if
5040                      * necessary */
5041
5042                     if (re_sv) {
5043                         re_sv = reg_temp_copy(NULL, re_sv);
5044                     }
5045                     else {
5046                         U32 pm_flags = 0;
5047
5048                         if (SvUTF8(ret) && IN_BYTES) {
5049                             /* In use 'bytes': make a copy of the octet
5050                              * sequence, but without the flag on */
5051                             STRLEN len;
5052                             const char *const p = SvPV(ret, len);
5053                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5054                         }
5055                         if (rex->intflags & PREGf_USE_RE_EVAL)
5056                             pm_flags |= PMf_USE_RE_EVAL;
5057
5058                         /* if we got here, it should be an engine which
5059                          * supports compiling code blocks and stuff */
5060                         assert(rex->engine && rex->engine->op_comp);
5061                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5062                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5063                                     rex->engine, NULL, NULL,
5064                                     /* copy /msix etc to inner pattern */
5065                                     scan->flags,
5066                                     pm_flags);
5067
5068                         if (!(SvFLAGS(ret)
5069                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5070                                  | SVs_GMG))) {
5071                             /* This isn't a first class regexp. Instead, it's
5072                                caching a regexp onto an existing, Perl visible
5073                                scalar.  */
5074                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5075                         }
5076                         /* safe to do now that any $1 etc has been
5077                          * interpolated into the new pattern string and
5078                          * compiled */
5079                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5080                     }
5081                     SAVEFREESV(re_sv);
5082                     re = ReANY(re_sv);
5083                 }
5084                 RXp_MATCH_COPIED_off(re);
5085                 re->subbeg = rex->subbeg;
5086                 re->sublen = rex->sublen;
5087                 re->suboffset = rex->suboffset;
5088                 re->subcoffset = rex->subcoffset;
5089                 rei = RXi_GET(re);
5090                 DEBUG_EXECUTE_r(
5091                     debug_start_match(re_sv, utf8_target, locinput,
5092                                     reginfo->strend, "Matching embedded");
5093                 );
5094                 startpoint = rei->program + 1;
5095                 ST.close_paren = 0; /* only used for GOSUB */
5096
5097         eval_recurse_doit: /* Share code with GOSUB below this line */
5098                 /* run the pattern returned from (??{...}) */
5099
5100                 /* Save *all* the positions. */
5101                 ST.cp = regcppush(rex, 0, maxopenparen);
5102                 REGCP_SET(ST.lastcp);
5103
5104                 re->lastparen = 0;
5105                 re->lastcloseparen = 0;
5106
5107                 maxopenparen = 0;
5108
5109                 /* XXXX This is too dramatic a measure... */
5110                 reginfo->poscache_maxiter = 0;
5111
5112                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5113
5114                 ST.prev_rex = rex_sv;
5115                 ST.prev_curlyx = cur_curlyx;
5116                 rex_sv = re_sv;
5117                 SET_reg_curpm(rex_sv);
5118                 rex = re;
5119                 rexi = rei;
5120                 cur_curlyx = NULL;
5121                 ST.B = next;
5122                 ST.prev_eval = cur_eval;
5123                 cur_eval = st;
5124                 /* now continue from first node in postoned RE */
5125                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5126                 assert(0); /* NOTREACHED */
5127         }
5128
5129         case EVAL_AB: /* cleanup after a successful (??{A})B */
5130             /* note: this is called twice; first after popping B, then A */
5131             rex_sv = ST.prev_rex;
5132             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5133             SET_reg_curpm(rex_sv);
5134             rex = ReANY(rex_sv);
5135             rexi = RXi_GET(rex);
5136             regcpblow(ST.cp);
5137             cur_eval = ST.prev_eval;
5138             cur_curlyx = ST.prev_curlyx;
5139
5140             /* XXXX This is too dramatic a measure... */
5141             reginfo->poscache_maxiter = 0;
5142             if ( nochange_depth )
5143                 nochange_depth--;
5144             sayYES;
5145
5146
5147         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5148             /* note: this is called twice; first after popping B, then A */
5149             rex_sv = ST.prev_rex;
5150             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5151             SET_reg_curpm(rex_sv);
5152             rex = ReANY(rex_sv);
5153             rexi = RXi_GET(rex);
5154
5155             REGCP_UNWIND(ST.lastcp);
5156             regcppop(rex, &maxopenparen);
5157             cur_eval = ST.prev_eval;
5158             cur_curlyx = ST.prev_curlyx;
5159             /* XXXX This is too dramatic a measure... */
5160             reginfo->poscache_maxiter = 0;
5161             if ( nochange_depth )
5162                 nochange_depth--;
5163             sayNO_SILENT;
5164 #undef ST
5165
5166         case OPEN: /*  (  */
5167             n = ARG(scan);  /* which paren pair */
5168             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5169             if (n > maxopenparen)
5170                 maxopenparen = n;
5171             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5172                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5173                 PTR2UV(rex),
5174                 PTR2UV(rex->offs),
5175                 (UV)n,
5176                 (IV)rex->offs[n].start_tmp,
5177                 (UV)maxopenparen
5178             ));
5179             lastopen = n;
5180             break;
5181
5182 /* XXX really need to log other places start/end are set too */
5183 #define CLOSE_CAPTURE \
5184     rex->offs[n].start = rex->offs[n].start_tmp; \
5185     rex->offs[n].end = locinput - reginfo->strbeg; \
5186     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5187         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5188         PTR2UV(rex), \
5189         PTR2UV(rex->offs), \
5190         (UV)n, \
5191         (IV)rex->offs[n].start, \
5192         (IV)rex->offs[n].end \
5193     ))
5194
5195         case CLOSE:  /*  )  */
5196             n = ARG(scan);  /* which paren pair */
5197             CLOSE_CAPTURE;
5198             if (n > rex->lastparen)
5199                 rex->lastparen = n;
5200             rex->lastcloseparen = n;
5201             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5202                 goto fake_end;
5203             }
5204             break;
5205
5206         case ACCEPT:  /*  (*ACCEPT)  */
5207             if (ARG(scan)){
5208                 regnode *cursor;
5209                 for (cursor=scan;
5210                      cursor && OP(cursor)!=END;
5211                      cursor=regnext(cursor))
5212                 {
5213                     if ( OP(cursor)==CLOSE ){
5214                         n = ARG(cursor);
5215                         if ( n <= lastopen ) {
5216                             CLOSE_CAPTURE;
5217                             if (n > rex->lastparen)
5218                                 rex->lastparen = n;
5219                             rex->lastcloseparen = n;
5220                             if ( n == ARG(scan) || (cur_eval &&
5221                                 cur_eval->u.eval.close_paren == n))
5222                                 break;
5223                         }
5224                     }
5225                 }
5226             }
5227             goto fake_end;
5228             /*NOTREACHED*/
5229
5230         case GROUPP:  /*  (?(1))  */
5231             n = ARG(scan);  /* which paren pair */
5232             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5233             break;
5234
5235         case NGROUPP:  /*  (?(<name>))  */
5236             /* reg_check_named_buff_matched returns 0 for no match */
5237             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5238             break;
5239
5240         case INSUBP:   /*  (?(R))  */
5241             n = ARG(scan);
5242             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5243             break;
5244
5245         case DEFINEP:  /*  (?(DEFINE))  */
5246             sw = 0;
5247             break;
5248
5249         case IFTHEN:   /*  (?(cond)A|B)  */
5250             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5251             if (sw)
5252                 next = NEXTOPER(NEXTOPER(scan));
5253             else {
5254                 next = scan + ARG(scan);
5255                 if (OP(next) == IFTHEN) /* Fake one. */
5256                     next = NEXTOPER(NEXTOPER(next));
5257             }
5258             break;
5259
5260         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5261             logical = scan->flags;
5262             break;
5263
5264 /*******************************************************************
5265
5266 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5267 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5268 STAR/PLUS/CURLY/CURLYN are used instead.)
5269
5270 A*B is compiled as <CURLYX><A><WHILEM><B>
5271
5272 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5273 state, which contains the current count, initialised to -1. It also sets
5274 cur_curlyx to point to this state, with any previous value saved in the
5275 state block.
5276
5277 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5278 since the pattern may possibly match zero times (i.e. it's a while {} loop
5279 rather than a do {} while loop).
5280
5281 Each entry to WHILEM represents a successful match of A. The count in the
5282 CURLYX block is incremented, another WHILEM state is pushed, and execution
5283 passes to A or B depending on greediness and the current count.
5284
5285 For example, if matching against the string a1a2a3b (where the aN are
5286 substrings that match /A/), then the match progresses as follows: (the
5287 pushed states are interspersed with the bits of strings matched so far):
5288
5289     <CURLYX cnt=-1>
5290     <CURLYX cnt=0><WHILEM>
5291     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5292     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5293     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5294     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5295
5296 (Contrast this with something like CURLYM, which maintains only a single
5297 backtrack state:
5298
5299     <CURLYM cnt=0> a1
5300     a1 <CURLYM cnt=1> a2
5301     a1 a2 <CURLYM cnt=2> a3
5302     a1 a2 a3 <CURLYM cnt=3> b
5303 )
5304
5305 Each WHILEM state block marks a point to backtrack to upon partial failure
5306 of A or B, and also contains some minor state data related to that
5307 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5308 overall state, such as the count, and pointers to the A and B ops.
5309
5310 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5311 must always point to the *current* CURLYX block, the rules are:
5312
5313 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5314 and set cur_curlyx to point the new block.
5315
5316 When popping the CURLYX block after a successful or unsuccessful match,
5317 restore the previous cur_curlyx.
5318
5319 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5320 to the outer one saved in the CURLYX block.
5321
5322 When popping the WHILEM block after a successful or unsuccessful B match,
5323 restore the previous cur_curlyx.
5324
5325 Here's an example for the pattern (AI* BI)*BO
5326 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5327
5328 cur_
5329 curlyx backtrack stack
5330 ------ ---------------
5331 NULL
5332 CO     <CO prev=NULL> <WO>
5333 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5334 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5335 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5336
5337 At this point the pattern succeeds, and we work back down the stack to
5338 clean up, restoring as we go:
5339
5340 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5341 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5342 CO     <CO prev=NULL> <WO>
5343 NULL
5344
5345 *******************************************************************/
5346
5347 #define ST st->u.curlyx
5348
5349         case CURLYX:    /* start of /A*B/  (for complex A) */
5350         {
5351             /* No need to save/restore up to this paren */
5352             I32 parenfloor = scan->flags;
5353
5354             assert(next); /* keep Coverity happy */
5355             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5356                 next += ARG(next);
5357
5358             /* XXXX Probably it is better to teach regpush to support
5359                parenfloor > maxopenparen ... */
5360             if (parenfloor > (I32)rex->lastparen)
5361                 parenfloor = rex->lastparen; /* Pessimization... */
5362
5363             ST.prev_curlyx= cur_curlyx;
5364             cur_curlyx = st;
5365             ST.cp = PL_savestack_ix;
5366
5367             /* these fields contain the state of the current curly.
5368              * they are accessed by subsequent WHILEMs */
5369             ST.parenfloor = parenfloor;
5370             ST.me = scan;
5371             ST.B = next;
5372             ST.minmod = minmod;
5373             minmod = 0;
5374             ST.count = -1;      /* this will be updated by WHILEM */
5375             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5376
5377             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5378             assert(0); /* NOTREACHED */
5379         }
5380
5381         case CURLYX_end: /* just finished matching all of A*B */
5382             cur_curlyx = ST.prev_curlyx;
5383             sayYES;
5384             assert(0); /* NOTREACHED */
5385
5386         case CURLYX_end_fail: /* just failed to match all of A*B */
5387             regcpblow(ST.cp);
5388             cur_curlyx = ST.prev_curlyx;
5389             sayNO;
5390             assert(0); /* NOTREACHED */
5391
5392
5393 #undef ST
5394 #define ST st->u.whilem
5395
5396         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5397         {
5398             /* see the discussion above about CURLYX/WHILEM */
5399             I32 n;
5400             int min = ARG1(cur_curlyx->u.curlyx.me);
5401             int max = ARG2(cur_curlyx->u.curlyx.me);
5402             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5403
5404             assert(cur_curlyx); /* keep Coverity happy */
5405             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5406             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5407             ST.cache_offset = 0;
5408             ST.cache_mask = 0;
5409
5410
5411             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5412                   "%*s  whilem: matched %ld out of %d..%d\n",
5413                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5414             );
5415
5416             /* First just match a string of min A's. */
5417
5418             if (n < min) {
5419                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5420                                     maxopenparen);
5421                 cur_curlyx->u.curlyx.lastloc = locinput;
5422                 REGCP_SET(ST.lastcp);
5423
5424                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5425                 assert(0); /* NOTREACHED */
5426             }
5427
5428             /* If degenerate A matches "", assume A done. */
5429
5430             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5431                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5432                    "%*s  whilem: empty match detected, trying continuation...\n",
5433                    REPORT_CODE_OFF+depth*2, "")
5434                 );
5435                 goto do_whilem_B_max;
5436             }
5437
5438             /* super-linear cache processing */
5439
5440             if (scan->flags) {
5441
5442                 if (!reginfo->poscache_maxiter) {
5443                     /* start the countdown: Postpone detection until we
5444                      * know the match is not *that* much linear. */
5445                     reginfo->poscache_maxiter
5446                         =    (reginfo->strend - reginfo->strbeg + 1)
5447                            * (scan->flags>>4);
5448                     /* possible overflow for long strings and many CURLYX's */
5449                     if (reginfo->poscache_maxiter < 0)
5450                         reginfo->poscache_maxiter = I32_MAX;
5451                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5452                 }
5453
5454                 if (reginfo->poscache_iter-- == 0) {
5455                     /* initialise cache */
5456                     const I32 size = (reginfo->poscache_maxiter + 7)/8;
5457                     if (PL_reg_poscache) {
5458                         if ((I32)PL_reg_poscache_size < size) {
5459                             Renew(PL_reg_poscache, size, char);
5460                             PL_reg_poscache_size = size;
5461                         }
5462                         Zero(PL_reg_poscache, size, char);
5463                     }
5464                     else {
5465                         PL_reg_poscache_size = size;
5466                         Newxz(PL_reg_poscache, size, char);
5467                     }
5468                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5469       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5470                               PL_colors[4], PL_colors[5])
5471                     );
5472                 }
5473
5474                 if (reginfo->poscache_iter < 0) {
5475                     /* have we already failed at this position? */
5476                     I32 offset, mask;
5477                     offset  = (scan->flags & 0xf) - 1
5478                                 +   (locinput - reginfo->strbeg)
5479                                   * (scan->flags>>4);
5480                     mask    = 1 << (offset % 8);
5481                     offset /= 8;
5482                     if (PL_reg_poscache[offset] & mask) {
5483                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5484                             "%*s  whilem: (cache) already tried at this position...\n",
5485                             REPORT_CODE_OFF+depth*2, "")
5486                         );
5487                         sayNO; /* cache records failure */
5488                     }
5489                     ST.cache_offset = offset;
5490                     ST.cache_mask   = mask;
5491                 }
5492             }
5493
5494             /* Prefer B over A for minimal matching. */
5495
5496             if (cur_curlyx->u.curlyx.minmod) {
5497                 ST.save_curlyx = cur_curlyx;
5498                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5499                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5500                             maxopenparen);
5501                 REGCP_SET(ST.lastcp);
5502                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5503                                     locinput);
5504                 assert(0); /* NOTREACHED */
5505             }
5506
5507             /* Prefer A over B for maximal matching. */
5508
5509             if (n < max) { /* More greed allowed? */
5510                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5511                             maxopenparen);
5512                 cur_curlyx->u.curlyx.lastloc = locinput;
5513                 REGCP_SET(ST.lastcp);
5514                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5515                 assert(0); /* NOTREACHED */
5516             }
5517             goto do_whilem_B_max;
5518         }
5519         assert(0); /* NOTREACHED */
5520
5521         case WHILEM_B_min: /* just matched B in a minimal match */
5522         case WHILEM_B_max: /* just matched B in a maximal match */
5523             cur_curlyx = ST.save_curlyx;
5524             sayYES;
5525             assert(0); /* NOTREACHED */
5526
5527         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5528             cur_curlyx = ST.save_curlyx;
5529             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5530             cur_curlyx->u.curlyx.count--;
5531             CACHEsayNO;
5532             assert(0); /* NOTREACHED */
5533
5534         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5535             /* FALL THROUGH */
5536         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5537             REGCP_UNWIND(ST.lastcp);
5538             regcppop(rex, &maxopenparen);
5539             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5540             cur_curlyx->u.curlyx.count--;
5541             CACHEsayNO;
5542             assert(0); /* NOTREACHED */
5543
5544         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5545             REGCP_UNWIND(ST.lastcp);
5546             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5547             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5548                 "%*s  whilem: failed, trying continuation...\n",
5549                 REPORT_CODE_OFF+depth*2, "")
5550             );
5551           do_whilem_B_max:
5552             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5553                 && ckWARN(WARN_REGEXP)
5554                 && !reginfo->warned)
5555             {
5556                 reginfo->warned = TRUE;
5557                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5558                      "Complex regular subexpression recursion limit (%d) "
5559                      "exceeded",
5560                      REG_INFTY - 1);
5561             }
5562
5563             /* now try B */
5564             ST.save_curlyx = cur_curlyx;
5565             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5566             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5567                                 locinput);
5568             assert(0); /* NOTREACHED */
5569
5570         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5571             cur_curlyx = ST.save_curlyx;
5572             REGCP_UNWIND(ST.lastcp);
5573             regcppop(rex, &maxopenparen);
5574
5575             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5576                 /* Maximum greed exceeded */
5577                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5578                     && ckWARN(WARN_REGEXP)
5579                     && !reginfo->warned)
5580                 {
5581                     reginfo->warned     = TRUE;
5582                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5583                         "Complex regular subexpression recursion "
5584                         "limit (%d) exceeded",
5585                         REG_INFTY - 1);
5586                 }
5587                 cur_curlyx->u.curlyx.count--;
5588                 CACHEsayNO;
5589             }
5590
5591             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5592                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5593             );
5594             /* Try grabbing another A and see if it helps. */
5595             cur_curlyx->u.curlyx.lastloc = locinput;
5596             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5597                             maxopenparen);
5598             REGCP_SET(ST.lastcp);
5599             PUSH_STATE_GOTO(WHILEM_A_min,
5600                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5601                 locinput);
5602             assert(0); /* NOTREACHED */
5603
5604 #undef  ST
5605 #define ST st->u.branch
5606
5607         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5608             next = scan + ARG(scan);
5609             if (next == scan)
5610                 next = NULL;
5611             scan = NEXTOPER(scan);
5612             /* FALL THROUGH */
5613
5614         case BRANCH:        /*  /(...|A|...)/ */
5615             scan = NEXTOPER(scan); /* scan now points to inner node */
5616             ST.lastparen = rex->lastparen;
5617             ST.lastcloseparen = rex->lastcloseparen;
5618             ST.next_branch = next;
5619             REGCP_SET(ST.cp);
5620
5621             /* Now go into the branch */
5622             if (has_cutgroup) {
5623                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5624             } else {
5625                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5626             }
5627             assert(0); /* NOTREACHED */
5628
5629         case CUTGROUP:  /*  /(*THEN)/  */
5630             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5631                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5632             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5633             assert(0); /* NOTREACHED */
5634
5635         case CUTGROUP_next_fail:
5636             do_cutgroup = 1;
5637             no_final = 1;
5638             if (st->u.mark.mark_name)
5639                 sv_commit = st->u.mark.mark_name;
5640             sayNO;
5641             assert(0); /* NOTREACHED */
5642
5643         case BRANCH_next:
5644             sayYES;
5645             assert(0); /* NOTREACHED */
5646
5647         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5648             if (do_cutgroup) {
5649                 do_cutgroup = 0;
5650                 no_final = 0;
5651             }
5652             REGCP_UNWIND(ST.cp);
5653             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5654             scan = ST.next_branch;
5655             /* no more branches? */
5656             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5657                 DEBUG_EXECUTE_r({
5658                     PerlIO_printf( Perl_debug_log,
5659                         "%*s  %sBRANCH failed...%s\n",
5660                         REPORT_CODE_OFF+depth*2, "",
5661                         PL_colors[4],
5662                         PL_colors[5] );
5663                 });
5664                 sayNO_SILENT;
5665             }
5666             continue; /* execute next BRANCH[J] op */
5667             assert(0); /* NOTREACHED */
5668
5669         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5670             minmod = 1;
5671             break;
5672
5673 #undef  ST
5674 #define ST st->u.curlym
5675
5676         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5677
5678             /* This is an optimisation of CURLYX that enables us to push
5679              * only a single backtracking state, no matter how many matches
5680              * there are in {m,n}. It relies on the pattern being constant
5681              * length, with no parens to influence future backrefs
5682              */
5683
5684             ST.me = scan;
5685             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5686
5687             ST.lastparen      = rex->lastparen;
5688             ST.lastcloseparen = rex->lastcloseparen;
5689
5690             /* if paren positive, emulate an OPEN/CLOSE around A */
5691             if (ST.me->flags) {
5692                 U32 paren = ST.me->flags;
5693                 if (paren > maxopenparen)
5694                     maxopenparen = paren;
5695                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5696             }
5697             ST.A = scan;
5698             ST.B = next;
5699             ST.alen = 0;
5700             ST.count = 0;
5701             ST.minmod = minmod;
5702             minmod = 0;
5703             ST.c1 = CHRTEST_UNINIT;
5704             REGCP_SET(ST.cp);
5705
5706             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5707                 goto curlym_do_B;
5708
5709           curlym_do_A: /* execute the A in /A{m,n}B/  */
5710             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5711             assert(0); /* NOTREACHED */
5712
5713         case CURLYM_A: /* we've just matched an A */
5714             ST.count++;
5715             /* after first match, determine A's length: u.curlym.alen */
5716             if (ST.count == 1) {
5717                 if (reginfo->is_utf8_target) {
5718                     char *s = st->locinput;
5719                     while (s < locinput) {
5720                         ST.alen++;
5721                         s += UTF8SKIP(s);
5722                     }
5723                 }
5724                 else {
5725                     ST.alen = locinput - st->locinput;
5726                 }
5727                 if (ST.alen == 0)
5728                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5729             }
5730             DEBUG_EXECUTE_r(
5731                 PerlIO_printf(Perl_debug_log,
5732                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5733                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5734                           (IV) ST.count, (IV)ST.alen)
5735             );
5736
5737             if (cur_eval && cur_eval->u.eval.close_paren &&
5738                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5739                 goto fake_end;
5740
5741             {
5742                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5743                 if ( max == REG_INFTY || ST.count < max )
5744                     goto curlym_do_A; /* try to match another A */
5745             }
5746             goto curlym_do_B; /* try to match B */
5747
5748         case CURLYM_A_fail: /* just failed to match an A */
5749             REGCP_UNWIND(ST.cp);
5750
5751             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5752                 || (cur_eval && cur_eval->u.eval.close_paren &&
5753                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5754                 sayNO;
5755
5756           curlym_do_B: /* execute the B in /A{m,n}B/  */
5757             if (ST.c1 == CHRTEST_UNINIT) {
5758                 /* calculate c1 and c2 for possible match of 1st char
5759                  * following curly */
5760                 ST.c1 = ST.c2 = CHRTEST_VOID;
5761                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5762                     regnode *text_node = ST.B;
5763                     if (! HAS_TEXT(text_node))
5764                         FIND_NEXT_IMPT(text_node);
5765                     /* this used to be
5766
5767                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5768
5769                         But the former is redundant in light of the latter.
5770
5771                         if this changes back then the macro for
5772                         IS_TEXT and friends need to change.
5773                      */
5774                     if (PL_regkind[OP(text_node)] == EXACT) {
5775                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5776                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5777                            reginfo))
5778                         {
5779                             sayNO;
5780                         }
5781                     }
5782                 }
5783             }
5784
5785             DEBUG_EXECUTE_r(
5786                 PerlIO_printf(Perl_debug_log,
5787                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5788                     (int)(REPORT_CODE_OFF+(depth*2)),
5789                     "", (IV)ST.count)
5790                 );
5791             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5792                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5793                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5794                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5795                     {
5796                         /* simulate B failing */
5797                         DEBUG_OPTIMISE_r(
5798                             PerlIO_printf(Perl_debug_log,
5799                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5800                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5801                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5802                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5803                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5804                         );
5805                         state_num = CURLYM_B_fail;
5806                         goto reenter_switch;
5807                     }
5808                 }
5809                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5810                     /* simulate B failing */
5811                     DEBUG_OPTIMISE_r(
5812                         PerlIO_printf(Perl_debug_log,
5813                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5814                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5815                             (int) nextchr, ST.c1, ST.c2)
5816                     );
5817                     state_num = CURLYM_B_fail;
5818                     goto reenter_switch;
5819                 }
5820             }
5821
5822             if (ST.me->flags) {
5823                 /* emulate CLOSE: mark current A as captured */
5824                 I32 paren = ST.me->flags;
5825                 if (ST.count) {
5826                     rex->offs[paren].start
5827                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5828                     rex->offs[paren].end = locinput - reginfo->strbeg;
5829                     if ((U32)paren > rex->lastparen)
5830                         rex->lastparen = paren;
5831                     rex->lastcloseparen = paren;
5832                 }
5833                 else
5834                     rex->offs[paren].end = -1;
5835                 if (cur_eval && cur_eval->u.eval.close_paren &&
5836                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5837                 {
5838                     if (ST.count)
5839                         goto fake_end;
5840                     else
5841                         sayNO;
5842                 }
5843             }
5844
5845             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5846             assert(0); /* NOTREACHED */
5847
5848         case CURLYM_B_fail: /* just failed to match a B */
5849             REGCP_UNWIND(ST.cp);
5850             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5851             if (ST.minmod) {
5852                 I32 max = ARG2(ST.me);
5853                 if (max != REG_INFTY && ST.count == max)
5854                     sayNO;
5855                 goto curlym_do_A; /* try to match a further A */
5856             }
5857             /* backtrack one A */
5858             if (ST.count == ARG1(ST.me) /* min */)
5859                 sayNO;
5860             ST.count--;
5861             SET_locinput(HOPc(locinput, -ST.alen));
5862             goto curlym_do_B; /* try to match B */
5863
5864 #undef ST
5865 #define ST st->u.curly
5866
5867 #define CURLY_SETPAREN(paren, success) \
5868     if (paren) { \
5869         if (success) { \
5870             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5871             rex->offs[paren].end = locinput - reginfo->strbeg; \
5872             if (paren > rex->lastparen) \
5873                 rex->lastparen = paren; \
5874             rex->lastcloseparen = paren; \
5875         } \
5876         else { \
5877             rex->offs[paren].end = -1; \
5878             rex->lastparen      = ST.lastparen; \
5879             rex->lastcloseparen = ST.lastcloseparen; \
5880         } \
5881     }
5882
5883         case STAR:              /*  /A*B/ where A is width 1 char */
5884             ST.paren = 0;
5885             ST.min = 0;
5886             ST.max = REG_INFTY;
5887             scan = NEXTOPER(scan);
5888             goto repeat;
5889
5890         case PLUS:              /*  /A+B/ where A is width 1 char */
5891             ST.paren = 0;
5892             ST.min = 1;
5893             ST.max = REG_INFTY;
5894             scan = NEXTOPER(scan);
5895             goto repeat;
5896
5897         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5898             ST.paren = scan->flags;     /* Which paren to set */
5899             ST.lastparen      = rex->lastparen;
5900             ST.lastcloseparen = rex->lastcloseparen;
5901             if (ST.paren > maxopenparen)
5902                 maxopenparen = ST.paren;
5903             ST.min = ARG1(scan);  /* min to match */
5904             ST.max = ARG2(scan);  /* max to match */
5905             if (cur_eval && cur_eval->u.eval.close_paren &&
5906                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5907                 ST.min=1;
5908                 ST.max=1;
5909             }
5910             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5911             goto repeat;
5912
5913         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5914             ST.paren = 0;
5915             ST.min = ARG1(scan);  /* min to match */
5916             ST.max = ARG2(scan);  /* max to match */
5917             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5918           repeat:
5919             /*
5920             * Lookahead to avoid useless match attempts
5921             * when we know what character comes next.
5922             *
5923             * Used to only do .*x and .*?x, but now it allows
5924             * for )'s, ('s and (?{ ... })'s to be in the way
5925             * of the quantifier and the EXACT-like node.  -- japhy
5926             */
5927
5928             assert(ST.min <= ST.max);
5929             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5930                 ST.c1 = ST.c2 = CHRTEST_VOID;
5931             }
5932             else {
5933                 regnode *text_node = next;
5934
5935                 if (! HAS_TEXT(text_node))
5936                     FIND_NEXT_IMPT(text_node);
5937
5938                 if (! HAS_TEXT(text_node))
5939                     ST.c1 = ST.c2 = CHRTEST_VOID;
5940                 else {
5941                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5942                         ST.c1 = ST.c2 = CHRTEST_VOID;
5943                     }
5944                     else {
5945
5946                     /*  Currently we only get here when
5947
5948                         PL_rekind[OP(text_node)] == EXACT
5949
5950                         if this changes back then the macro for IS_TEXT and
5951                         friends need to change. */
5952                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5953                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5954                            reginfo))
5955                         {
5956                             sayNO;
5957                         }
5958                     }
5959                 }
5960             }
5961
5962             ST.A = scan;
5963             ST.B = next;
5964             if (minmod) {
5965                 char *li = locinput;
5966                 minmod = 0;
5967                 if (ST.min &&
5968                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
5969                             < ST.min)
5970                     sayNO;
5971                 SET_locinput(li);
5972                 ST.count = ST.min;
5973                 REGCP_SET(ST.cp);
5974                 if (ST.c1 == CHRTEST_VOID)
5975                     goto curly_try_B_min;
5976
5977                 ST.oldloc = locinput;
5978
5979                 /* set ST.maxpos to the furthest point along the
5980                  * string that could possibly match */
5981                 if  (ST.max == REG_INFTY) {
5982                     ST.maxpos = reginfo->strend - 1;
5983                     if (utf8_target)
5984                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5985                             ST.maxpos--;
5986                 }
5987                 else if (utf8_target) {
5988                     int m = ST.max - ST.min;
5989                     for (ST.maxpos = locinput;
5990                          m >0 && ST.maxpos < reginfo->strend; m--)
5991                         ST.maxpos += UTF8SKIP(ST.maxpos);
5992                 }
5993                 else {
5994                     ST.maxpos = locinput + ST.max - ST.min;
5995                     if (ST.maxpos >= reginfo->strend)
5996                         ST.maxpos = reginfo->strend - 1;
5997                 }
5998                 goto curly_try_B_min_known;
5999
6000             }
6001             else {
6002                 /* avoid taking address of locinput, so it can remain
6003                  * a register var */
6004                 char *li = locinput;
6005                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6006                 if (ST.count < ST.min)
6007                     sayNO;
6008                 SET_locinput(li);
6009                 if ((ST.count > ST.min)
6010                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6011                 {
6012                     /* A{m,n} must come at the end of the string, there's
6013                      * no point in backing off ... */
6014                     ST.min = ST.count;
6015                     /* ...except that $ and \Z can match before *and* after
6016                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6017                        We may back off by one in this case. */
6018                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6019                         ST.min--;
6020                 }
6021                 REGCP_SET(ST.cp);
6022                 goto curly_try_B_max;
6023             }
6024             assert(0); /* NOTREACHED */
6025
6026
6027         case CURLY_B_min_known_fail:
6028             /* failed to find B in a non-greedy match where c1,c2 valid */
6029
6030             REGCP_UNWIND(ST.cp);
6031             if (ST.paren) {
6032                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6033             }
6034             /* Couldn't or didn't -- move forward. */
6035             ST.oldloc = locinput;
6036             if (utf8_target)
6037                 locinput += UTF8SKIP(locinput);
6038             else
6039                 locinput++;
6040             ST.count++;
6041           curly_try_B_min_known:
6042              /* find the next place where 'B' could work, then call B */
6043             {
6044                 int n;
6045                 if (utf8_target) {
6046                     n = (ST.oldloc == locinput) ? 0 : 1;
6047                     if (ST.c1 == ST.c2) {
6048                         /* set n to utf8_distance(oldloc, locinput) */
6049                         while (locinput <= ST.maxpos
6050                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6051                         {
6052                             locinput += UTF8SKIP(locinput);
6053                             n++;
6054                         }
6055                     }
6056                     else {
6057                         /* set n to utf8_distance(oldloc, locinput) */
6058                         while (locinput <= ST.maxpos
6059                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6060                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6061                         {
6062                             locinput += UTF8SKIP(locinput);
6063                             n++;
6064                         }
6065                     }
6066                 }
6067                 else {  /* Not utf8_target */
6068                     if (ST.c1 == ST.c2) {
6069                         while (locinput <= ST.maxpos &&
6070                                UCHARAT(locinput) != ST.c1)
6071                             locinput++;
6072                     }
6073                     else {
6074                         while (locinput <= ST.maxpos
6075                                && UCHARAT(locinput) != ST.c1
6076                                && UCHARAT(locinput) != ST.c2)
6077                             locinput++;
6078                     }
6079                     n = locinput - ST.oldloc;
6080                 }
6081                 if (locinput > ST.maxpos)
6082                     sayNO;
6083                 if (n) {
6084                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6085                      * at b; check that everything between oldloc and
6086                      * locinput matches */
6087                     char *li = ST.oldloc;
6088                     ST.count += n;
6089                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6090                         sayNO;
6091                     assert(n == REG_INFTY || locinput == li);
6092                 }
6093                 CURLY_SETPAREN(ST.paren, ST.count);
6094                 if (cur_eval && cur_eval->u.eval.close_paren &&
6095                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6096                     goto fake_end;
6097                 }
6098                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6099             }
6100             assert(0); /* NOTREACHED */
6101
6102
6103         case CURLY_B_min_fail:
6104             /* failed to find B in a non-greedy match where c1,c2 invalid */
6105
6106             REGCP_UNWIND(ST.cp);
6107             if (ST.paren) {
6108                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6109             }
6110             /* failed -- move forward one */
6111             {
6112                 char *li = locinput;
6113                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6114                     sayNO;
6115                 }
6116                 locinput = li;
6117             }
6118             {
6119                 ST.count++;
6120                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6121                         ST.count > 0)) /* count overflow ? */
6122                 {
6123                   curly_try_B_min:
6124                     CURLY_SETPAREN(ST.paren, ST.count);
6125                     if (cur_eval && cur_eval->u.eval.close_paren &&
6126                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6127                         goto fake_end;
6128                     }
6129                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6130                 }
6131             }
6132             sayNO;
6133             assert(0); /* NOTREACHED */
6134
6135
6136         curly_try_B_max:
6137             /* a successful greedy match: now try to match B */
6138             if (cur_eval && cur_eval->u.eval.close_paren &&
6139                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6140                 goto fake_end;
6141             }
6142             {
6143                 bool could_match = locinput < reginfo->strend;
6144
6145                 /* If it could work, try it. */
6146                 if (ST.c1 != CHRTEST_VOID && could_match) {
6147                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6148                     {
6149                         could_match = memEQ(locinput,
6150                                             ST.c1_utf8,
6151                                             UTF8SKIP(locinput))
6152                                     || memEQ(locinput,
6153                                              ST.c2_utf8,
6154                                              UTF8SKIP(locinput));
6155                     }
6156                     else {
6157                         could_match = UCHARAT(locinput) == ST.c1
6158                                       || UCHARAT(locinput) == ST.c2;
6159                     }
6160                 }
6161                 if (ST.c1 == CHRTEST_VOID || could_match) {
6162                     CURLY_SETPAREN(ST.paren, ST.count);
6163                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6164                     assert(0); /* NOTREACHED */
6165                 }
6166             }
6167             /* FALL THROUGH */
6168
6169         case CURLY_B_max_fail:
6170             /* failed to find B in a greedy match */
6171
6172             REGCP_UNWIND(ST.cp);
6173             if (ST.paren) {
6174                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6175             }
6176             /*  back up. */
6177             if (--ST.count < ST.min)
6178                 sayNO;
6179             locinput = HOPc(locinput, -1);
6180             goto curly_try_B_max;
6181
6182 #undef ST
6183
6184         case END: /*  last op of main pattern  */
6185             fake_end:
6186             if (cur_eval) {
6187                 /* we've just finished A in /(??{A})B/; now continue with B */
6188
6189                 st->u.eval.prev_rex = rex_sv;           /* inner */
6190
6191                 /* Save *all* the positions. */
6192                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6193                 rex_sv = cur_eval->u.eval.prev_rex;
6194                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6195                 SET_reg_curpm(rex_sv);
6196                 rex = ReANY(rex_sv);
6197                 rexi = RXi_GET(rex);
6198                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6199
6200                 REGCP_SET(st->u.eval.lastcp);
6201
6202                 /* Restore parens of the outer rex without popping the
6203                  * savestack */
6204                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6205                                         &maxopenparen);
6206
6207                 st->u.eval.prev_eval = cur_eval;
6208                 cur_eval = cur_eval->u.eval.prev_eval;
6209                 DEBUG_EXECUTE_r(
6210                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6211                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6212                 if ( nochange_depth )
6213                     nochange_depth--;
6214
6215                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6216                                     locinput); /* match B */
6217             }
6218
6219             if (locinput < reginfo->till) {
6220                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6221                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6222                                       PL_colors[4],
6223                                       (long)(locinput - PL_reg_starttry),
6224                                       (long)(reginfo->till - PL_reg_starttry),
6225                                       PL_colors[5]));
6226
6227                 sayNO_SILENT;           /* Cannot match: too short. */
6228             }
6229             sayYES;                     /* Success! */
6230
6231         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6232             DEBUG_EXECUTE_r(
6233             PerlIO_printf(Perl_debug_log,
6234                 "%*s  %ssubpattern success...%s\n",
6235                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6236             sayYES;                     /* Success! */
6237
6238 #undef  ST
6239 #define ST st->u.ifmatch
6240
6241         {
6242             char *newstart;
6243
6244         case SUSPEND:   /* (?>A) */
6245             ST.wanted = 1;
6246             newstart = locinput;
6247             goto do_ifmatch;
6248
6249         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6250             ST.wanted = 0;
6251             goto ifmatch_trivial_fail_test;
6252
6253         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6254             ST.wanted = 1;
6255           ifmatch_trivial_fail_test:
6256             if (scan->flags) {
6257                 char * const s = HOPBACKc(locinput, scan->flags);
6258                 if (!s) {
6259                     /* trivial fail */
6260                     if (logical) {
6261                         logical = 0;
6262                         sw = 1 - cBOOL(ST.wanted);
6263                     }
6264                     else if (ST.wanted)
6265                         sayNO;
6266                     next = scan + ARG(scan);
6267                     if (next == scan)
6268                         next = NULL;
6269                     break;
6270                 }
6271                 newstart = s;
6272             }
6273             else
6274                 newstart = locinput;
6275
6276           do_ifmatch:
6277             ST.me = scan;
6278             ST.logical = logical;
6279             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6280
6281             /* execute body of (?...A) */
6282             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6283             assert(0); /* NOTREACHED */
6284         }
6285
6286         case IFMATCH_A_fail: /* body of (?...A) failed */
6287             ST.wanted = !ST.wanted;
6288             /* FALL THROUGH */
6289
6290         case IFMATCH_A: /* body of (?...A) succeeded */
6291             if (ST.logical) {
6292                 sw = cBOOL(ST.wanted);
6293             }
6294             else if (!ST.wanted)
6295                 sayNO;
6296
6297             if (OP(ST.me) != SUSPEND) {
6298                 /* restore old position except for (?>...) */
6299                 locinput = st->locinput;
6300             }
6301             scan = ST.me + ARG(ST.me);
6302             if (scan == ST.me)
6303                 scan = NULL;
6304             continue; /* execute B */
6305
6306 #undef ST
6307
6308         case LONGJMP: /*  alternative with many branches compiles to
6309                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6310             next = scan + ARG(scan);
6311             if (next == scan)
6312                 next = NULL;
6313             break;
6314
6315         case COMMIT:  /*  (*COMMIT)  */
6316             reginfo->cutpoint = reginfo->strend;
6317             /* FALLTHROUGH */
6318
6319         case PRUNE:   /*  (*PRUNE)   */
6320             if (!scan->flags)
6321                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6322             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6323             assert(0); /* NOTREACHED */
6324
6325         case COMMIT_next_fail:
6326             no_final = 1;
6327             /* FALLTHROUGH */
6328
6329         case OPFAIL:   /* (*FAIL)  */
6330             sayNO;
6331             assert(0); /* NOTREACHED */
6332
6333 #define ST st->u.mark
6334         case MARKPOINT: /*  (*MARK:foo)  */
6335             ST.prev_mark = mark_state;
6336             ST.mark_name = sv_commit = sv_yes_mark
6337                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6338             mark_state = st;
6339             ST.mark_loc = locinput;
6340             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6341             assert(0); /* NOTREACHED */
6342
6343         case MARKPOINT_next:
6344             mark_state = ST.prev_mark;
6345             sayYES;
6346             assert(0); /* NOTREACHED */
6347
6348         case MARKPOINT_next_fail:
6349             if (popmark && sv_eq(ST.mark_name,popmark))
6350             {
6351                 if (ST.mark_loc > startpoint)
6352                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6353                 popmark = NULL; /* we found our mark */
6354                 sv_commit = ST.mark_name;
6355
6356                 DEBUG_EXECUTE_r({
6357                         PerlIO_printf(Perl_debug_log,
6358                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6359                             REPORT_CODE_OFF+depth*2, "",
6360                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6361                 });
6362             }
6363             mark_state = ST.prev_mark;
6364             sv_yes_mark = mark_state ?
6365                 mark_state->u.mark.mark_name : NULL;
6366             sayNO;
6367             assert(0); /* NOTREACHED */
6368
6369         case SKIP:  /*  (*SKIP)  */
6370             if (scan->flags) {
6371                 /* (*SKIP) : if we fail we cut here*/
6372                 ST.mark_name = NULL;
6373                 ST.mark_loc = locinput;
6374                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6375             } else {
6376                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6377                    otherwise do nothing.  Meaning we need to scan
6378                  */
6379                 regmatch_state *cur = mark_state;
6380                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6381
6382                 while (cur) {
6383                     if ( sv_eq( cur->u.mark.mark_name,
6384                                 find ) )
6385                     {
6386                         ST.mark_name = find;
6387                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6388                     }
6389                     cur = cur->u.mark.prev_mark;
6390                 }
6391             }
6392             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6393             break;
6394
6395         case SKIP_next_fail:
6396             if (ST.mark_name) {
6397                 /* (*CUT:NAME) - Set up to search for the name as we
6398                    collapse the stack*/
6399                 popmark = ST.mark_name;
6400             } else {
6401                 /* (*CUT) - No name, we cut here.*/
6402                 if (ST.mark_loc > startpoint)
6403                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6404                 /* but we set sv_commit to latest mark_name if there
6405                    is one so they can test to see how things lead to this
6406                    cut */
6407                 if (mark_state)
6408                     sv_commit=mark_state->u.mark.mark_name;
6409             }
6410             no_final = 1;
6411             sayNO;
6412             assert(0); /* NOTREACHED */
6413 #undef ST
6414
6415         case LNBREAK: /* \R */
6416             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6417                 locinput += n;
6418             } else
6419                 sayNO;
6420             break;
6421
6422         default:
6423             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6424                           PTR2UV(scan), OP(scan));
6425             Perl_croak(aTHX_ "regexp memory corruption");
6426
6427         /* this is a point to jump to in order to increment
6428          * locinput by one character */
6429         increment_locinput:
6430             assert(!NEXTCHR_IS_EOS);
6431             if (utf8_target) {
6432                 locinput += PL_utf8skip[nextchr];
6433                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6434                 if (locinput > reginfo->strend)
6435                     sayNO;
6436             }
6437             else
6438                 locinput++;
6439             break;
6440
6441         } /* end switch */
6442
6443         /* switch break jumps here */
6444         scan = next; /* prepare to execute the next op and ... */
6445         continue;    /* ... jump back to the top, reusing st */
6446         assert(0); /* NOTREACHED */
6447
6448       push_yes_state:
6449         /* push a state that backtracks on success */
6450         st->u.yes.prev_yes_state = yes_state;
6451         yes_state = st;
6452         /* FALL THROUGH */
6453       push_state:
6454         /* push a new regex state, then continue at scan  */
6455         {
6456             regmatch_state *newst;
6457
6458             DEBUG_STACK_r({
6459                 regmatch_state *cur = st;
6460                 regmatch_state *curyes = yes_state;
6461                 int curd = depth;
6462                 regmatch_slab *slab = PL_regmatch_slab;
6463                 for (;curd > -1;cur--,curd--) {
6464                     if (cur < SLAB_FIRST(slab)) {
6465                         slab = slab->prev;
6466                         cur = SLAB_LAST(slab);
6467                     }
6468                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6469                         REPORT_CODE_OFF + 2 + depth * 2,"",
6470                         curd, PL_reg_name[cur->resume_state],
6471                         (curyes == cur) ? "yes" : ""
6472                     );
6473                     if (curyes == cur)
6474                         curyes = cur->u.yes.prev_yes_state;
6475                 }
6476             } else
6477                 DEBUG_STATE_pp("push")
6478             );
6479             depth++;
6480             st->locinput = locinput;
6481             newst = st+1;
6482             if (newst >  SLAB_LAST(PL_regmatch_slab))
6483                 newst = S_push_slab(aTHX);
6484             PL_regmatch_state = newst;
6485
6486             locinput = pushinput;
6487             st = newst;
6488             continue;
6489             assert(0); /* NOTREACHED */
6490         }
6491     }
6492
6493     /*
6494     * We get here only if there's trouble -- normally "case END" is
6495     * the terminating point.
6496     */
6497     Perl_croak(aTHX_ "corrupted regexp pointers");
6498     /*NOTREACHED*/
6499     sayNO;
6500
6501 yes:
6502     if (yes_state) {
6503         /* we have successfully completed a subexpression, but we must now
6504          * pop to the state marked by yes_state and continue from there */
6505         assert(st != yes_state);
6506 #ifdef DEBUGGING
6507         while (st != yes_state) {
6508             st--;
6509             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6510                 PL_regmatch_slab = PL_regmatch_slab->prev;
6511                 st = SLAB_LAST(PL_regmatch_slab);
6512             }
6513             DEBUG_STATE_r({
6514                 if (no_final) {
6515                     DEBUG_STATE_pp("pop (no final)");
6516                 } else {
6517                     DEBUG_STATE_pp("pop (yes)");
6518                 }
6519             });
6520             depth--;
6521         }
6522 #else
6523         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6524             || yes_state > SLAB_LAST(PL_regmatch_slab))
6525         {
6526             /* not in this slab, pop slab */
6527             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6528             PL_regmatch_slab = PL_regmatch_slab->prev;
6529             st = SLAB_LAST(PL_regmatch_slab);
6530         }
6531         depth -= (st - yes_state);
6532 #endif
6533         st = yes_state;
6534         yes_state = st->u.yes.prev_yes_state;
6535         PL_regmatch_state = st;
6536
6537         if (no_final)
6538             locinput= st->locinput;
6539         state_num = st->resume_state + no_final;
6540         goto reenter_switch;
6541     }
6542
6543     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6544                           PL_colors[4], PL_colors[5]));
6545
6546     if (reginfo->info_aux_eval) {
6547         /* each successfully executed (?{...}) block does the equivalent of
6548          *   local $^R = do {...}
6549          * When popping the save stack, all these locals would be undone;
6550          * bypass this by setting the outermost saved $^R to the latest
6551          * value */
6552         if (oreplsv != GvSV(PL_replgv))
6553             sv_setsv(oreplsv, GvSV(PL_replgv));
6554     }
6555     result = 1;
6556     goto final_exit;
6557
6558 no:
6559     DEBUG_EXECUTE_r(
6560         PerlIO_printf(Perl_debug_log,
6561             "%*s  %sfailed...%s\n",
6562             REPORT_CODE_OFF+depth*2, "",
6563             PL_colors[4], PL_colors[5])
6564         );
6565
6566 no_silent:
6567     if (no_final) {
6568         if (yes_state) {
6569             goto yes;
6570         } else {
6571             goto final_exit;
6572         }
6573     }
6574     if (depth) {
6575         /* there's a previous state to backtrack to */
6576         st--;
6577         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6578             PL_regmatch_slab = PL_regmatch_slab->prev;
6579             st = SLAB_LAST(PL_regmatch_slab);
6580         }
6581         PL_regmatch_state = st;
6582         locinput= st->locinput;
6583
6584         DEBUG_STATE_pp("pop");
6585         depth--;
6586         if (yes_state == st)
6587             yes_state = st->u.yes.prev_yes_state;
6588
6589         state_num = st->resume_state + 1; /* failure = success + 1 */
6590         goto reenter_switch;
6591     }
6592     result = 0;
6593
6594   final_exit:
6595     if (rex->intflags & PREGf_VERBARG_SEEN) {
6596         SV *sv_err = get_sv("REGERROR", 1);
6597         SV *sv_mrk = get_sv("REGMARK", 1);
6598         if (result) {
6599             sv_commit = &PL_sv_no;
6600             if (!sv_yes_mark)
6601                 sv_yes_mark = &PL_sv_yes;
6602         } else {
6603             if (!sv_commit)
6604                 sv_commit = &PL_sv_yes;
6605             sv_yes_mark = &PL_sv_no;
6606         }
6607         sv_setsv(sv_err, sv_commit);
6608         sv_setsv(sv_mrk, sv_yes_mark);
6609     }
6610
6611
6612     if (last_pushed_cv) {
6613         dSP;
6614         POP_MULTICALL;
6615         PERL_UNUSED_VAR(SP);
6616     }
6617
6618     assert(!result ||  locinput - reginfo->strbeg >= 0);
6619     return result ?  locinput - reginfo->strbeg : -1;
6620 }
6621
6622 /*
6623  - regrepeat - repeatedly match something simple, report how many
6624  *
6625  * What 'simple' means is a node which can be the operand of a quantifier like
6626  * '+', or {1,3}
6627  *
6628  * startposp - pointer a pointer to the start position.  This is updated
6629  *             to point to the byte following the highest successful
6630  *             match.
6631  * p         - the regnode to be repeatedly matched against.
6632  * reginfo   - struct holding match state, such as strend
6633  * max       - maximum number of things to match.
6634  * depth     - (for debugging) backtracking depth.
6635  */
6636 STATIC I32
6637 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6638             regmatch_info *const reginfo, I32 max, int depth)
6639 {
6640     dVAR;
6641     char *scan;     /* Pointer to current position in target string */
6642     I32 c;
6643     char *loceol = reginfo->strend;   /* local version */
6644     I32 hardcount = 0;  /* How many matches so far */
6645     bool utf8_target = reginfo->is_utf8_target;
6646     int to_complement = 0;  /* Invert the result? */
6647     UV utf8_flags;
6648     _char_class_number classnum;
6649 #ifndef DEBUGGING
6650     PERL_UNUSED_ARG(depth);
6651 #endif
6652
6653     PERL_ARGS_ASSERT_REGREPEAT;
6654
6655     scan = *startposp;
6656     if (max == REG_INFTY)
6657         max = I32_MAX;
6658     else if (! utf8_target && scan + max < loceol)
6659         loceol = scan + max;
6660
6661     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6662      * to the maximum of how far we should go in it (leaving it set to the real
6663      * end, if the maximum permissible would take us beyond that).  This allows
6664      * us to make the loop exit condition that we haven't gone past <loceol> to
6665      * also mean that we haven't exceeded the max permissible count, saving a
6666      * test each time through the loop.  But it assumes that the OP matches a
6667      * single byte, which is true for most of the OPs below when applied to a
6668      * non-UTF-8 target.  Those relatively few OPs that don't have this
6669      * characteristic will have to compensate.
6670      *
6671      * There is no adjustment for UTF-8 targets, as the number of bytes per
6672      * character varies.  OPs will have to test both that the count is less
6673      * than the max permissible (using <hardcount> to keep track), and that we
6674      * are still within the bounds of the string (using <loceol>.  A few OPs
6675      * match a single byte no matter what the encoding.  They can omit the max
6676      * test if, for the UTF-8 case, they do the adjustment that was skipped
6677      * above.
6678      *
6679      * Thus, the code above sets things up for the common case; and exceptional
6680      * cases need extra work; the common case is to make sure <scan> doesn't
6681      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6682      * count doesn't exceed the maximum permissible */
6683
6684     switch (OP(p)) {
6685     case REG_ANY:
6686         if (utf8_target) {
6687             while (scan < loceol && hardcount < max && *scan != '\n') {
6688                 scan += UTF8SKIP(scan);
6689                 hardcount++;
6690             }
6691         } else {
6692             while (scan < loceol && *scan != '\n')
6693                 scan++;
6694         }
6695         break;
6696     case SANY:
6697         if (utf8_target) {
6698             while (scan < loceol && hardcount < max) {
6699                 scan += UTF8SKIP(scan);
6700                 hardcount++;
6701             }
6702         }
6703         else
6704             scan = loceol;
6705         break;
6706     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6707         if (utf8_target && scan + max < loceol) {
6708
6709             /* <loceol> hadn't been adjusted in the UTF-8 case */
6710             scan +=  max;
6711         }
6712         else {
6713             scan = loceol;
6714         }
6715         break;
6716     case EXACT:
6717         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6718
6719         c = (U8)*STRING(p);
6720
6721         /* Can use a simple loop if the pattern char to match on is invariant
6722          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6723          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6724          * true iff it doesn't matter if the argument is in UTF-8 or not */
6725         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6726             if (utf8_target && scan + max < loceol) {
6727                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6728                  * since here, to match at all, 1 char == 1 byte */
6729                 loceol = scan + max;
6730             }
6731             while (scan < loceol && UCHARAT(scan) == c) {
6732                 scan++;
6733             }
6734         }
6735         else if (reginfo->is_utf8_pat) {
6736             if (utf8_target) {
6737                 STRLEN scan_char_len;
6738
6739                 /* When both target and pattern are UTF-8, we have to do
6740                  * string EQ */
6741                 while (hardcount < max
6742                        && scan < loceol
6743                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6744                        && memEQ(scan, STRING(p), scan_char_len))
6745                 {
6746                     scan += scan_char_len;
6747                     hardcount++;
6748                 }
6749             }
6750             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6751
6752                 /* Target isn't utf8; convert the character in the UTF-8
6753                  * pattern to non-UTF8, and do a simple loop */
6754                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6755                 while (scan < loceol && UCHARAT(scan) == c) {
6756                     scan++;
6757                 }
6758             } /* else pattern char is above Latin1, can't possibly match the
6759                  non-UTF-8 target */
6760         }
6761         else {
6762
6763             /* Here, the string must be utf8; pattern isn't, and <c> is
6764              * different in utf8 than not, so can't compare them directly.
6765              * Outside the loop, find the two utf8 bytes that represent c, and
6766              * then look for those in sequence in the utf8 string */
6767             U8 high = UTF8_TWO_BYTE_HI(c);
6768             U8 low = UTF8_TWO_BYTE_LO(c);
6769
6770             while (hardcount < max
6771                     && scan + 1 < loceol
6772                     && UCHARAT(scan) == high
6773                     && UCHARAT(scan + 1) == low)
6774             {
6775                 scan += 2;
6776                 hardcount++;
6777             }
6778         }
6779         break;
6780
6781     case EXACTFA:
6782         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6783         goto do_exactf;
6784
6785     case EXACTFL:
6786         RXp_MATCH_TAINTED_on(prog);
6787         utf8_flags = FOLDEQ_UTF8_LOCALE;
6788         goto do_exactf;
6789
6790     case EXACTF:
6791             utf8_flags = 0;
6792             goto do_exactf;
6793
6794     case EXACTFU_SS:
6795     case EXACTFU_TRICKYFOLD:
6796     case EXACTFU:
6797         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6798
6799     do_exactf: {
6800         int c1, c2;
6801         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6802
6803         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6804
6805         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6806                                         reginfo))
6807         {
6808             if (c1 == CHRTEST_VOID) {
6809                 /* Use full Unicode fold matching */
6810                 char *tmpeol = reginfo->strend;
6811                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6812                 while (hardcount < max
6813                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6814                                              STRING(p), NULL, pat_len,
6815                                              reginfo->is_utf8_pat, utf8_flags))
6816                 {
6817                     scan = tmpeol;
6818                     tmpeol = reginfo->strend;
6819                     hardcount++;
6820                 }
6821             }
6822             else if (utf8_target) {
6823                 if (c1 == c2) {
6824                     while (scan < loceol
6825                            && hardcount < max
6826                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6827                     {
6828                         scan += UTF8SKIP(scan);
6829                         hardcount++;
6830                     }
6831                 }
6832                 else {
6833                     while (scan < loceol
6834                            && hardcount < max
6835                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6836                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6837                     {
6838                         scan += UTF8SKIP(scan);
6839                         hardcount++;
6840                     }
6841                 }
6842             }
6843             else if (c1 == c2) {
6844                 while (scan < loceol && UCHARAT(scan) == c1) {
6845                     scan++;
6846                 }
6847             }
6848             else {
6849                 while (scan < loceol &&
6850                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6851                 {
6852                     scan++;
6853                 }
6854             }
6855         }
6856         break;
6857     }
6858     case ANYOF:
6859     case ANYOF_WARN_SUPER:
6860         if (utf8_target) {
6861             while (hardcount < max
6862                    && scan < loceol
6863                    && reginclass(prog, p, (U8*)scan, utf8_target))
6864             {
6865                 scan += UTF8SKIP(scan);
6866                 hardcount++;
6867             }
6868         } else {
6869             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6870                 scan++;
6871         }
6872         break;
6873
6874     /* The argument (FLAGS) to all the POSIX node types is the class number */
6875
6876     case NPOSIXL:
6877         to_complement = 1;
6878         /* FALLTHROUGH */
6879
6880     case POSIXL:
6881         RXp_MATCH_TAINTED_on(prog);
6882         if (! utf8_target) {
6883             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6884                                                                    *scan)))
6885             {
6886                 scan++;
6887             }
6888         } else {
6889             while (hardcount < max && scan < loceol
6890                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6891                                                                   (U8 *) scan)))
6892             {
6893                 scan += UTF8SKIP(scan);
6894                 hardcount++;
6895             }
6896         }
6897         break;
6898
6899     case POSIXD:
6900         if (utf8_target) {
6901             goto utf8_posix;
6902         }
6903         /* FALLTHROUGH */
6904
6905     case POSIXA:
6906         if (utf8_target && scan + max < loceol) {
6907
6908             /* We didn't adjust <loceol> at the beginning of this routine
6909              * because is UTF-8, but it is actually ok to do so, since here, to
6910              * match, 1 char == 1 byte. */
6911             loceol = scan + max;
6912         }
6913         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6914             scan++;
6915         }
6916         break;
6917
6918     case NPOSIXD:
6919         if (utf8_target) {
6920             to_complement = 1;
6921             goto utf8_posix;
6922         }
6923         /* FALL THROUGH */
6924
6925     case NPOSIXA:
6926         if (! utf8_target) {
6927             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6928                 scan++;
6929             }
6930         }
6931         else {
6932
6933             /* The complement of something that matches only ASCII matches all
6934              * UTF-8 variant code points, plus everything in ASCII that isn't
6935              * in the class. */
6936             while (hardcount < max && scan < loceol
6937                    && (! UTF8_IS_INVARIANT(*scan)
6938                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
6939             {
6940                 scan += UTF8SKIP(scan);
6941                 hardcount++;
6942             }
6943         }
6944         break;
6945
6946     case NPOSIXU:
6947         to_complement = 1;
6948         /* FALLTHROUGH */
6949
6950     case POSIXU:
6951         if (! utf8_target) {
6952             while (scan < loceol && to_complement
6953                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
6954             {
6955                 scan++;
6956             }
6957         }
6958         else {
6959       utf8_posix:
6960             classnum = (_char_class_number) FLAGS(p);
6961             if (classnum < _FIRST_NON_SWASH_CC) {
6962
6963                 /* Here, a swash is needed for above-Latin1 code points.
6964                  * Process as many Latin1 code points using the built-in rules.
6965                  * Go to another loop to finish processing upon encountering
6966                  * the first Latin1 code point.  We could do that in this loop
6967                  * as well, but the other way saves having to test if the swash
6968                  * has been loaded every time through the loop: extra space to
6969                  * save a test. */
6970                 while (hardcount < max && scan < loceol) {
6971                     if (UTF8_IS_INVARIANT(*scan)) {
6972                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6973                                                                    classnum))))
6974                         {
6975                             break;
6976                         }
6977                         scan++;
6978                     }
6979                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6980                         if (! (to_complement
6981                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6982                                                                    *(scan + 1)),
6983                                                     classnum))))
6984                         {
6985                             break;
6986                         }
6987                         scan += 2;
6988                     }
6989                     else {
6990                         goto found_above_latin1;
6991                     }
6992
6993                     hardcount++;
6994                 }
6995             }
6996             else {
6997                 /* For these character classes, the knowledge of how to handle
6998                  * every code point is compiled in to Perl via a macro.  This
6999                  * code is written for making the loops as tight as possible.
7000                  * It could be refactored to save space instead */
7001                 switch (classnum) {
7002                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7003                                                if we revert the change of \v
7004                                                matching this */
7005                         /* FALL THROUGH */
7006                     case _CC_ENUM_PSXSPC:
7007                         while (hardcount < max
7008                                && scan < loceol
7009                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7010                         {
7011                             scan += UTF8SKIP(scan);
7012                             hardcount++;
7013                         }
7014                         break;
7015                     case _CC_ENUM_BLANK:
7016                         while (hardcount < max
7017                                && scan < loceol
7018                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7019                         {
7020                             scan += UTF8SKIP(scan);
7021                             hardcount++;
7022                         }
7023                         break;
7024                     case _CC_ENUM_XDIGIT:
7025                         while (hardcount < max
7026                                && scan < loceol
7027                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7028                         {
7029                             scan += UTF8SKIP(scan);
7030                             hardcount++;
7031                         }
7032                         break;
7033                     case _CC_ENUM_VERTSPACE:
7034                         while (hardcount < max
7035                                && scan < loceol
7036                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7037                         {
7038                             scan += UTF8SKIP(scan);
7039                             hardcount++;
7040                         }
7041                         break;
7042                     case _CC_ENUM_CNTRL:
7043                         while (hardcount < max
7044                                && scan < loceol
7045                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7046                         {
7047                             scan += UTF8SKIP(scan);
7048                             hardcount++;
7049                         }
7050                         break;
7051                     default:
7052                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7053                 }
7054             }
7055         }
7056         break;
7057
7058       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7059
7060         /* Load the swash if not already present */
7061         if (! PL_utf8_swash_ptrs[classnum]) {
7062             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7063             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7064                                         "utf8", swash_property_names[classnum],
7065                                         &PL_sv_undef, 1, 0, NULL, &flags);
7066         }
7067
7068         while (hardcount < max && scan < loceol
7069                && to_complement ^ cBOOL(_generic_utf8(
7070                                        classnum,
7071                                        scan,
7072                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7073                                                    (U8 *) scan,
7074                                                    TRUE))))
7075         {
7076             scan += UTF8SKIP(scan);
7077             hardcount++;
7078         }
7079         break;
7080
7081     case LNBREAK:
7082         if (utf8_target) {
7083             while (hardcount < max && scan < loceol &&
7084                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7085                 scan += c;
7086                 hardcount++;
7087             }
7088         } else {
7089             /* LNBREAK can match one or two latin chars, which is ok, but we
7090              * have to use hardcount in this situation, and throw away the
7091              * adjustment to <loceol> done before the switch statement */
7092             loceol = reginfo->strend;
7093             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7094                 scan+=c;
7095                 hardcount++;
7096             }
7097         }
7098         break;
7099
7100     case BOUND:
7101     case BOUNDA:
7102     case BOUNDL:
7103     case BOUNDU:
7104     case EOS:
7105     case GPOS:
7106     case KEEPS:
7107     case NBOUND:
7108     case NBOUNDA:
7109     case NBOUNDL:
7110     case NBOUNDU:
7111     case OPFAIL:
7112     case SBOL:
7113     case SEOL:
7114         /* These are all 0 width, so match right here or not at all. */
7115         break;
7116
7117     default:
7118         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7119         assert(0); /* NOTREACHED */
7120
7121     }
7122
7123     if (hardcount)
7124         c = hardcount;
7125     else
7126         c = scan - *startposp;
7127     *startposp = scan;
7128
7129     DEBUG_r({
7130         GET_RE_DEBUG_FLAGS_DECL;
7131         DEBUG_EXECUTE_r({
7132             SV * const prop = sv_newmortal();
7133             regprop(prog, prop, p);
7134             PerlIO_printf(Perl_debug_log,
7135                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7136                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7137         });
7138     });
7139
7140     return(c);
7141 }
7142
7143
7144 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7145 /*
7146 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7147 create a copy so that changes the caller makes won't change the shared one.
7148 If <altsvp> is non-null, will return NULL in it, for back-compat.
7149  */
7150 SV *
7151 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7152 {
7153     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7154
7155     if (altsvp) {
7156         *altsvp = NULL;
7157     }
7158
7159     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7160 }
7161 #endif
7162
7163 STATIC SV *
7164 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7165 {
7166     /* Returns the swash for the input 'node' in the regex 'prog'.
7167      * If <doinit> is true, will attempt to create the swash if not already
7168      *    done.
7169      * If <listsvp> is non-null, will return the swash initialization string in
7170      *    it.
7171      * Tied intimately to how regcomp.c sets up the data structure */
7172
7173     dVAR;
7174     SV *sw  = NULL;
7175     SV *si  = NULL;
7176     SV*  invlist = NULL;
7177
7178     RXi_GET_DECL(prog,progi);
7179     const struct reg_data * const data = prog ? progi->data : NULL;
7180
7181     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7182
7183     assert(ANYOF_NONBITMAP(node));
7184
7185     if (data && data->count) {
7186         const U32 n = ARG(node);
7187
7188         if (data->what[n] == 's') {
7189             SV * const rv = MUTABLE_SV(data->data[n]);
7190             AV * const av = MUTABLE_AV(SvRV(rv));
7191             SV **const ary = AvARRAY(av);
7192             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7193
7194             si = *ary;  /* ary[0] = the string to initialize the swash with */
7195
7196             /* Elements 2 and 3 are either both present or both absent. [2] is
7197              * any inversion list generated at compile time; [3] indicates if
7198              * that inversion list has any user-defined properties in it. */
7199             if (av_len(av) >= 2) {
7200                 invlist = ary[2];
7201                 if (SvUV(ary[3])) {
7202                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7203                 }
7204             }
7205             else {
7206                 invlist = NULL;
7207             }
7208
7209             /* Element [1] is reserved for the set-up swash.  If already there,
7210              * return it; if not, create it and store it there */
7211             if (SvROK(ary[1])) {
7212                 sw = ary[1];
7213             }
7214             else if (si && doinit) {
7215
7216                 sw = _core_swash_init("utf8", /* the utf8 package */
7217                                       "", /* nameless */
7218                                       si,
7219                                       1, /* binary */
7220                                       0, /* not from tr/// */
7221                                       invlist,
7222                                       &swash_init_flags);
7223                 (void)av_store(av, 1, sw);
7224             }
7225         }
7226     }
7227
7228     if (listsvp) {
7229         SV* matches_string = newSVpvn("", 0);
7230
7231         /* Use the swash, if any, which has to have incorporated into it all
7232          * possibilities */
7233         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7234             && (si && si != &PL_sv_undef))
7235         {
7236
7237             /* If no swash, use the input initialization string, if available */
7238             sv_catsv(matches_string, si);
7239         }
7240
7241         /* Add the inversion list to whatever we have.  This may have come from
7242          * the swash, or from an input parameter */
7243         if (invlist) {
7244             sv_catsv(matches_string, _invlist_contents(invlist));
7245         }
7246         *listsvp = matches_string;
7247     }
7248
7249     return sw;
7250 }
7251
7252 /*
7253  - reginclass - determine if a character falls into a character class
7254
7255   n is the ANYOF regnode
7256   p is the target string
7257   utf8_target tells whether p is in UTF-8.
7258
7259   Returns true if matched; false otherwise.
7260
7261   Note that this can be a synthetic start class, a combination of various
7262   nodes, so things you think might be mutually exclusive, such as locale,
7263   aren't.  It can match both locale and non-locale
7264
7265  */
7266
7267 STATIC bool
7268 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7269 {
7270     dVAR;
7271     const char flags = ANYOF_FLAGS(n);
7272     bool match = FALSE;
7273     UV c = *p;
7274
7275     PERL_ARGS_ASSERT_REGINCLASS;
7276
7277     /* If c is not already the code point, get it.  Note that
7278      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7279     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7280         STRLEN c_len = 0;
7281         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7282                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7283                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7284                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7285                  * UTF8_ALLOW_FFFF */
7286         if (c_len == (STRLEN)-1)
7287             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7288     }
7289
7290     /* If this character is potentially in the bitmap, check it */
7291     if (c < 256) {
7292         if (ANYOF_BITMAP_TEST(n, c))
7293             match = TRUE;
7294         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7295                 && ! utf8_target
7296                 && ! isASCII(c))
7297         {
7298             match = TRUE;
7299         }
7300         else if (flags & ANYOF_LOCALE) {
7301             RXp_MATCH_TAINTED_on(prog);
7302
7303             if ((flags & ANYOF_LOC_FOLD)
7304                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7305             {
7306                 match = TRUE;
7307             }
7308             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7309
7310                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7311                  * if the class includes the Posix character class given by
7312                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7313                  * complemented Posix class given by int(bit/2).  So we loop
7314                  * through the bits, each time changing whether we complement
7315                  * the result or not.  Suppose for the sake of illustration
7316                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7317                  * is set, it means there is a match for this ANYOF node if the
7318                  * character is in the class given by the expression (0 / 2 = 0
7319                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7320                  * and since 'to_complement' is 0, the result will stay TRUE,
7321                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7322                  * bit 1 is 1.  That means there is a match if the character
7323                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7324                  * but will on bit 1.  On the second iteration 'to_complement'
7325                  * will be 1, so the exclusive or will reverse things, so we
7326                  * are testing for \W.  On the third iteration, 'to_complement'
7327                  * will be 0, and we would be testing for \s; the fourth
7328                  * iteration would test for \S, etc.
7329                  *
7330                  * Note that this code assumes that all the classes are closed
7331                  * under folding.  For example, if a character matches \w, then
7332                  * its fold does too; and vice versa.  This should be true for
7333                  * any well-behaved locale for all the currently defined Posix
7334                  * classes, except for :lower: and :upper:, which are handled
7335                  * by the pseudo-class :cased: which matches if either of the
7336                  * other two does.  To get rid of this assumption, an outer
7337                  * loop could be used below to iterate over both the source
7338                  * character, and its fold (if different) */
7339
7340                 int count = 0;
7341                 int to_complement = 0;
7342                 while (count < ANYOF_MAX) {
7343                     if (ANYOF_CLASS_TEST(n, count)
7344                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7345                     {
7346                         match = TRUE;
7347                         break;
7348                     }
7349                     count++;
7350                     to_complement ^= 1;
7351                 }
7352             }
7353         }
7354     }
7355
7356     /* If the bitmap didn't (or couldn't) match, and something outside the
7357      * bitmap could match, try that.  Locale nodes specify completely the
7358      * behavior of code points in the bit map (otherwise, a utf8 target would
7359      * cause them to be treated as Unicode and not locale), except in
7360      * the very unlikely event when this node is a synthetic start class, which
7361      * could be a combination of locale and non-locale nodes.  So allow locale
7362      * to match for the synthetic start class, which will give a false
7363      * positive that will be resolved when the match is done again as not part
7364      * of the synthetic start class */
7365     if (!match) {
7366         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7367             match = TRUE;       /* Everything above 255 matches */
7368         }
7369         else if (ANYOF_NONBITMAP(n)
7370                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7371                      || (utf8_target
7372                          && (c >=256
7373                              || (! (flags & ANYOF_LOCALE))
7374                              || OP(n) == ANYOF_SYNTHETIC))))
7375         {
7376             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7377             if (sw) {
7378                 U8 * utf8_p;
7379                 if (utf8_target) {
7380                     utf8_p = (U8 *) p;
7381                 } else { /* Convert to utf8 */
7382                     STRLEN len = 1;
7383                     utf8_p = bytes_to_utf8(p, &len);
7384                 }
7385
7386                 if (swash_fetch(sw, utf8_p, TRUE)) {
7387                     match = TRUE;
7388                 }
7389
7390                 /* If we allocated a string above, free it */
7391                 if (! utf8_target) Safefree(utf8_p);
7392             }
7393         }
7394
7395         if (UNICODE_IS_SUPER(c)
7396             && OP(n) == ANYOF_WARN_SUPER
7397             && ckWARN_d(WARN_NON_UNICODE))
7398         {
7399             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7400                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7401         }
7402     }
7403
7404     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7405     return cBOOL(flags & ANYOF_INVERT) ^ match;
7406 }
7407
7408 STATIC U8 *
7409 S_reghop3(U8 *s, I32 off, const U8* lim)
7410 {
7411     /* return the position 'off' UTF-8 characters away from 's', forward if
7412      * 'off' >= 0, backwards if negative.  But don't go outside of position
7413      * 'lim', which better be < s  if off < 0 */
7414
7415     dVAR;
7416
7417     PERL_ARGS_ASSERT_REGHOP3;
7418
7419     if (off >= 0) {
7420         while (off-- && s < lim) {
7421             /* XXX could check well-formedness here */
7422             s += UTF8SKIP(s);
7423         }
7424     }
7425     else {
7426         while (off++ && s > lim) {
7427             s--;
7428             if (UTF8_IS_CONTINUED(*s)) {
7429                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7430                     s--;
7431             }
7432             /* XXX could check well-formedness here */
7433         }
7434     }
7435     return s;
7436 }
7437
7438 #ifdef XXX_dmq
7439 /* there are a bunch of places where we use two reghop3's that should
7440    be replaced with this routine. but since thats not done yet
7441    we ifdef it out - dmq
7442 */
7443 STATIC U8 *
7444 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7445 {
7446     dVAR;
7447
7448     PERL_ARGS_ASSERT_REGHOP4;
7449
7450     if (off >= 0) {
7451         while (off-- && s < rlim) {
7452             /* XXX could check well-formedness here */
7453             s += UTF8SKIP(s);
7454         }
7455     }
7456     else {
7457         while (off++ && s > llim) {
7458             s--;
7459             if (UTF8_IS_CONTINUED(*s)) {
7460                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7461                     s--;
7462             }
7463             /* XXX could check well-formedness here */
7464         }
7465     }
7466     return s;
7467 }
7468 #endif
7469
7470 STATIC U8 *
7471 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7472 {
7473     dVAR;
7474
7475     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7476
7477     if (off >= 0) {
7478         while (off-- && s < lim) {
7479             /* XXX could check well-formedness here */
7480             s += UTF8SKIP(s);
7481         }
7482         if (off >= 0)
7483             return NULL;
7484     }
7485     else {
7486         while (off++ && s > lim) {
7487             s--;
7488             if (UTF8_IS_CONTINUED(*s)) {
7489                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7490                     s--;
7491             }
7492             /* XXX could check well-formedness here */
7493         }
7494         if (off <= 0)
7495             return NULL;
7496     }
7497     return s;
7498 }
7499
7500
7501 /* when executing a regex that may have (?{}), extra stuff needs setting
7502    up that will be visible to the called code, even before the current
7503    match has finished. In particular:
7504
7505    * $_ is localised to the SV currently being matched;
7506    * pos($_) is created if necessary, ready to be updated on each call-out
7507      to code;
7508    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7509      isn't set until the current pattern is successfully finished), so that
7510      $1 etc of the match-so-far can be seen;
7511    * save the old values of subbeg etc of the current regex, and  set then
7512      to the current string (again, this is normally only done at the end
7513      of execution)
7514 */
7515
7516 static void
7517 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7518 {
7519     MAGIC *mg;
7520     regexp *const rex = ReANY(reginfo->prog);
7521     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7522
7523     eval_state->rex = rex;
7524
7525     if (reginfo->sv) {
7526         /* Make $_ available to executed code. */
7527         if (reginfo->sv != DEFSV) {
7528             SAVE_DEFSV;
7529             DEFSV_set(reginfo->sv);
7530         }
7531
7532         if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
7533               && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
7534             /* prepare for quick setting of pos */
7535 #ifdef PERL_OLD_COPY_ON_WRITE
7536             if (SvIsCOW(reginfo->sv))
7537                 sv_force_normal_flags(reginfo->sv, 0);
7538 #endif
7539             mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
7540                              &PL_vtbl_mglob, NULL, 0);
7541             mg->mg_len = -1;
7542         }
7543         eval_state->pos_magic = mg;
7544         eval_state->pos       = mg->mg_len;
7545     }
7546     else
7547         eval_state->pos_magic = NULL;
7548
7549     if (!PL_reg_curpm) {
7550         Newxz(PL_reg_curpm, 1, PMOP);
7551 #ifdef USE_ITHREADS
7552         {
7553             SV* const repointer = &PL_sv_undef;
7554             /* this regexp is also owned by the new PL_reg_curpm, which
7555                will try to free it.  */
7556             av_push(PL_regex_padav, repointer);
7557             PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7558             PL_regex_pad = AvARRAY(PL_regex_padav);
7559         }
7560 #endif
7561     }
7562     SET_reg_curpm(reginfo->prog);
7563     eval_state->curpm = PL_curpm;
7564     PL_curpm = PL_reg_curpm;
7565     if (RXp_MATCH_COPIED(rex)) {
7566         /*  Here is a serious problem: we cannot rewrite subbeg,
7567             since it may be needed if this match fails.  Thus
7568             $` inside (?{}) could fail... */
7569         eval_state->subbeg     = rex->subbeg;
7570         eval_state->sublen     = rex->sublen;
7571         eval_state->suboffset  = rex->suboffset;
7572         eval_state->subcoffset = rex->subcoffset;
7573 #ifdef PERL_ANY_COW
7574         eval_state->saved_copy = rex->saved_copy;
7575 #endif
7576         RXp_MATCH_COPIED_off(rex);
7577     }
7578     else
7579         eval_state->subbeg = NULL;
7580     rex->subbeg = (char *)reginfo->strbeg;
7581     rex->suboffset = 0;
7582     rex->subcoffset = 0;
7583     rex->sublen = reginfo->strend - reginfo->strbeg;
7584 }
7585
7586
7587 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7588
7589 static void
7590 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7591 {
7592     dVAR;
7593     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7594     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7595
7596     if (!eval_state)
7597         return;
7598
7599     /* undo the effects of S_setup_eval_state() */
7600
7601     if (eval_state->subbeg) {
7602         regexp * const rex = eval_state->rex;
7603         rex->subbeg     = eval_state->subbeg;
7604         rex->sublen     = eval_state->sublen;
7605         rex->suboffset  = eval_state->suboffset;
7606         rex->subcoffset = eval_state->subcoffset;
7607 #ifdef PERL_ANY_COW
7608         rex->saved_copy = eval_state->saved_copy;
7609 #endif
7610         RXp_MATCH_COPIED_on(rex);
7611     }
7612     if (eval_state->pos_magic)
7613         eval_state->pos_magic->mg_len = eval_state->pos;
7614
7615     PL_curpm = eval_state->curpm;
7616 }
7617
7618
7619 STATIC void
7620 S_to_utf8_substr(pTHX_ regexp *prog)
7621 {
7622     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7623      * on the converted value */
7624
7625     int i = 1;
7626
7627     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7628
7629     do {
7630         if (prog->substrs->data[i].substr
7631             && !prog->substrs->data[i].utf8_substr) {
7632             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7633             prog->substrs->data[i].utf8_substr = sv;
7634             sv_utf8_upgrade(sv);
7635             if (SvVALID(prog->substrs->data[i].substr)) {
7636                 if (SvTAIL(prog->substrs->data[i].substr)) {
7637                     /* Trim the trailing \n that fbm_compile added last
7638                        time.  */
7639                     SvCUR_set(sv, SvCUR(sv) - 1);
7640                     /* Whilst this makes the SV technically "invalid" (as its
7641                        buffer is no longer followed by "\0") when fbm_compile()
7642                        adds the "\n" back, a "\0" is restored.  */
7643                     fbm_compile(sv, FBMcf_TAIL);
7644                 } else
7645                     fbm_compile(sv, 0);
7646             }
7647             if (prog->substrs->data[i].substr == prog->check_substr)
7648                 prog->check_utf8 = sv;
7649         }
7650     } while (i--);
7651 }
7652
7653 STATIC bool
7654 S_to_byte_substr(pTHX_ regexp *prog)
7655 {
7656     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7657      * on the converted value; returns FALSE if can't be converted. */
7658
7659     dVAR;
7660     int i = 1;
7661
7662     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7663
7664     do {
7665         if (prog->substrs->data[i].utf8_substr
7666             && !prog->substrs->data[i].substr) {
7667             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7668             if (! sv_utf8_downgrade(sv, TRUE)) {
7669                 return FALSE;
7670             }
7671             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7672                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7673                     /* Trim the trailing \n that fbm_compile added last
7674                         time.  */
7675                     SvCUR_set(sv, SvCUR(sv) - 1);
7676                     fbm_compile(sv, FBMcf_TAIL);
7677                 } else
7678                     fbm_compile(sv, 0);
7679             }
7680             prog->substrs->data[i].substr = sv;
7681             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7682                 prog->check_substr = sv;
7683         }
7684     } while (i--);
7685
7686     return TRUE;
7687 }
7688
7689 /*
7690  * Local variables:
7691  * c-indentation-style: bsd
7692  * c-basic-offset: 4
7693  * indent-tabs-mode: nil
7694  * End:
7695  *
7696  * ex: set ts=8 sts=4 sw=4 et:
7697  */