regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102 /* Valid for non-utf8 strings: avoids the reginclass
 103  * call if there are no complications: i.e., if everything matchable is
 104  * straight forward in the bitmap */
 105 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 106                                               : ANYOF_BITMAP_TEST(p,*(c)))
 107
 108 /*
 109  * Forwards.
 110  */
 111
 112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 114
 115 #define HOPc(pos,off) \
 116         (char *)(reginfo->is_utf8_target \
 117             ? reghop3((U8*)pos, off, \
 118                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 119             : (U8*)(pos + off))
 120 #define HOPBACKc(pos, off) \
 121         (char*)(reginfo->is_utf8_target \
 122             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 123             : (pos - off >= reginfo->strbeg)    \
 124                 ? (U8*)pos - off                \
 125                 : NULL)
 126
 127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 129
 130
 131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 132 #define NEXTCHR_IS_EOS (nextchr < 0)
 133
 134 #define SET_nextchr \
 135     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 136
 137 #define SET_locinput(p) \
 138     locinput = (p);  \
 139     SET_nextchr
 140
 141
 142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 143         if (!swash_ptr) {                                                     \
 144             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 145             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 146                                          1, 0, NULL, &flags);                 \
 147             assert(swash_ptr);                                                \
 148         }                                                                     \
 149     } STMT_END
 150
 151 /* If in debug mode, we test that a known character properly matches */
 152 #ifdef DEBUGGING
 153 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 154                                           property_name,                      \
 155                                           utf8_char_in_property)              \
 156         LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 157         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 158 #else
 159 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 160                                           property_name,                      \
 161                                           utf8_char_in_property)              \
 162         LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 163 #endif
 164
 165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 166                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 167                                         swash_property_names[_CC_WORDCHAR],   \
 168                                         LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 169
 170 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 171     STMT_START {                                                              \
 172         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 173                                        "_X_regular_begin",                    \
 174                                        LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 175         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 176                                        "_X_extend",                           \
 177                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 178     } STMT_END
 179
 180 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 182
 183 /* for use after a quantifier and before an EXACT-like node -- japhy */
 184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 185  *
 186  * NOTE that *nothing* that affects backtracking should be in here, specifically
 187  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 188  * node that is in between two EXACT like nodes when ascertaining what the required
 189  * "follow" character is. This should probably be moved to regex compile time
 190  * although it may be done at run time beause of the REF possibility - more
 191  * investigation required. -- demerphq
 192 */
 193 #define JUMPABLE(rn) (      \
 194     OP(rn) == OPEN ||       \
 195     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 196     OP(rn) == EVAL ||   \
 197     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 198     OP(rn) == PLUS || OP(rn) == MINMOD || \
 199     OP(rn) == KEEPS || \
 200     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 201 )
 202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 203
 204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 205
 206 #if 0
 207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 208    we don't need this definition. */
 209 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 210 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 212
 213 #else
 214 /* ... so we use this as its faster. */
 215 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 216 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 217 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 219
 220 #endif
 221
 222 /*
 223   Search for mandatory following text node; for lookahead, the text must
 224   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 225 */
 226 #define FIND_NEXT_IMPT(rn) STMT_START { \
 227     while (JUMPABLE(rn)) { \
 228         const OPCODE type = OP(rn); \
 229         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 230             rn = NEXTOPER(NEXTOPER(rn)); \
 231         else if (type == PLUS) \
 232             rn = NEXTOPER(rn); \
 233         else if (type == IFMATCH) \
 234             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 235         else rn += NEXT_OFF(rn); \
 236     } \
 237 } STMT_END
 238
 239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 240  * These are for the pre-composed Hangul syllables, which are all in a
 241  * contiguous block and arranged there in such a way so as to facilitate
 242  * alorithmic determination of their characteristics.  As such, they don't need
 243  * a swash, but can be determined by simple arithmetic.  Almost all are
 244  * GCB=LVT, but every 28th one is a GCB=LV */
 245 #define SBASE 0xAC00    /* Start of block */
 246 #define SCount 11172    /* Length of block */
 247 #define TCount 28
 248
 249 #define SLAB_FIRST(s) (&(s)->states[0])
 250 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 251
 252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 254 static regmatch_state * S_push_slab(pTHX);
 255
 256 #define REGCP_PAREN_ELEMS 3
 257 #define REGCP_OTHER_ELEMS 3
 258 #define REGCP_FRAME_ELEMS 1
 259 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 260  * are needed for the regexp context stack bookkeeping. */
 261
 262 STATIC CHECKPOINT
 263 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 264 {
 265     dVAR;
 266     const int retval = PL_savestack_ix;
 267     const int paren_elems_to_push =
 268                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 269     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 270     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 271     I32 p;
 272     GET_RE_DEBUG_FLAGS_DECL;
 273
 274     PERL_ARGS_ASSERT_REGCPPUSH;
 275
 276     if (paren_elems_to_push < 0)
 277         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 278                    paren_elems_to_push);
 279
 280     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 281         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 282                    " out of range (%lu-%ld)",
 283                    total_elems,
 284                    (unsigned long)maxopenparen,
 285                    (long)parenfloor);
 286
 287     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 288
 289     DEBUG_BUFFERS_r(
 290         if ((int)maxopenparen > (int)parenfloor)
 291             PerlIO_printf(Perl_debug_log,
 292                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 293                 PTR2UV(rex),
 294                 PTR2UV(rex->offs)
 295             );
 296     );
 297     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 298 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 299         SSPUSHIV(rex->offs[p].end);
 300         SSPUSHIV(rex->offs[p].start);
 301         SSPUSHINT(rex->offs[p].start_tmp);
 302         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 303             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 304             (UV)p,
 305             (IV)rex->offs[p].start,
 306             (IV)rex->offs[p].start_tmp,
 307             (IV)rex->offs[p].end
 308         ));
 309     }
 310 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 311     SSPUSHINT(maxopenparen);
 312     SSPUSHINT(rex->lastparen);
 313     SSPUSHINT(rex->lastcloseparen);
 314     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 315
 316     return retval;
 317 }
 318
 319 /* These are needed since we do not localize EVAL nodes: */
 320 #define REGCP_SET(cp)                                           \
 321     DEBUG_STATE_r(                                              \
 322             PerlIO_printf(Perl_debug_log,                       \
 323                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 324                 (IV)PL_savestack_ix));                          \
 325     cp = PL_savestack_ix
 326
 327 #define REGCP_UNWIND(cp)                                        \
 328     DEBUG_STATE_r(                                              \
 329         if (cp != PL_savestack_ix)                              \
 330             PerlIO_printf(Perl_debug_log,                       \
 331                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 332                 (IV)(cp), (IV)PL_savestack_ix));                \
 333     regcpblow(cp)
 334
 335 #define UNWIND_PAREN(lp, lcp)               \
 336     for (n = rex->lastparen; n > lp; n--)   \
 337         rex->offs[n].end = -1;              \
 338     rex->lastparen = n;                     \
 339     rex->lastcloseparen = lcp;
 340
 341
 342 STATIC void
 343 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 344 {
 345     dVAR;
 346     UV i;
 347     U32 paren;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     PERL_ARGS_ASSERT_REGCPPOP;
 351
 352     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 353     i = SSPOPUV;
 354     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 355     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 356     rex->lastcloseparen = SSPOPINT;
 357     rex->lastparen = SSPOPINT;
 358     *maxopenparen_p = SSPOPINT;
 359
 360     i -= REGCP_OTHER_ELEMS;
 361     /* Now restore the parentheses context. */
 362     DEBUG_BUFFERS_r(
 363         if (i || rex->lastparen + 1 <= rex->nparens)
 364             PerlIO_printf(Perl_debug_log,
 365                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 366                 PTR2UV(rex),
 367                 PTR2UV(rex->offs)
 368             );
 369     );
 370     paren = *maxopenparen_p;
 371     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 372         SSize_t tmps;
 373         rex->offs[paren].start_tmp = SSPOPINT;
 374         rex->offs[paren].start = SSPOPIV;
 375         tmps = SSPOPIV;
 376         if (paren <= rex->lastparen)
 377             rex->offs[paren].end = tmps;
 378         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 379             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 380             (UV)paren,
 381             (IV)rex->offs[paren].start,
 382             (IV)rex->offs[paren].start_tmp,
 383             (IV)rex->offs[paren].end,
 384             (paren > rex->lastparen ? "(skipped)" : ""));
 385         );
 386         paren--;
 387     }
 388 #if 1
 389     /* It would seem that the similar code in regtry()
 390      * already takes care of this, and in fact it is in
 391      * a better location to since this code can #if 0-ed out
 392      * but the code in regtry() is needed or otherwise tests
 393      * requiring null fields (pat.t#187 and split.t#{13,14}
 394      * (as of patchlevel 7877)  will fail.  Then again,
 395      * this code seems to be necessary or otherwise
 396      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 397      * --jhi updated by dapm */
 398     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 399         if (i > *maxopenparen_p)
 400             rex->offs[i].start = -1;
 401         rex->offs[i].end = -1;
 402         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 403             "    \\%"UVuf": %s   ..-1 undeffing\n",
 404             (UV)i,
 405             (i > *maxopenparen_p) ? "-1" : "  "
 406         ));
 407     }
 408 #endif
 409 }
 410
 411 /* restore the parens and associated vars at savestack position ix,
 412  * but without popping the stack */
 413
 414 STATIC void
 415 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 416 {
 417     I32 tmpix = PL_savestack_ix;
 418     PL_savestack_ix = ix;
 419     regcppop(rex, maxopenparen_p);
 420     PL_savestack_ix = tmpix;
 421 }
 422
 423 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 424
 425 STATIC bool
 426 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 427 {
 428     /* Returns a boolean as to whether or not 'character' is a member of the
 429      * Posix character class given by 'classnum' that should be equivalent to a
 430      * value in the typedef '_char_class_number'.
 431      *
 432      * Ideally this could be replaced by a just an array of function pointers
 433      * to the C library functions that implement the macros this calls.
 434      * However, to compile, the precise function signatures are required, and
 435      * these may vary from platform to to platform.  To avoid having to figure
 436      * out what those all are on each platform, I (khw) am using this method,
 437      * which adds an extra layer of function call overhead (unless the C
 438      * optimizer strips it away).  But we don't particularly care about
 439      * performance with locales anyway. */
 440
 441     switch ((_char_class_number) classnum) {
 442         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 443         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 444         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 445         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 446         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 447                                         || isUPPER_LC(character);
 448         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 449         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 450         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 451         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 452         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 453         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 454         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 455         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 456         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 457         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 458         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 459         default:    /* VERTSPACE should never occur in locales */
 460             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 461     }
 462
 463     assert(0); /* NOTREACHED */
 464     return FALSE;
 465 }
 466
 467 STATIC bool
 468 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 469 {
 470     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 471      * 'character' is a member of the Posix character class given by 'classnum'
 472      * that should be equivalent to a value in the typedef
 473      * '_char_class_number'.
 474      *
 475      * This just calls isFOO_lc on the code point for the character if it is in
 476      * the range 0-255.  Outside that range, all characters avoid Unicode
 477      * rules, ignoring any locale.  So use the Unicode function if this class
 478      * requires a swash, and use the Unicode macro otherwise. */
 479
 480     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 481
 482     if (UTF8_IS_INVARIANT(*character)) {
 483         return isFOO_lc(classnum, *character);
 484     }
 485     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 486         return isFOO_lc(classnum,
 487                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 488     }
 489
 490     if (classnum < _FIRST_NON_SWASH_CC) {
 491
 492         /* Initialize the swash unless done already */
 493         if (! PL_utf8_swash_ptrs[classnum]) {
 494             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 495             PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 496                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 497         }
 498
 499         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 500                                  character,
 501                                  TRUE /* is UTF */ ));
 502     }
 503
 504     switch ((_char_class_number) classnum) {
 505         case _CC_ENUM_SPACE:
 506         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 507
 508         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 509         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 510         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 511         default:                 return 0;  /* Things like CNTRL are always
 512                                                below 256 */
 513     }
 514
 515     assert(0); /* NOTREACHED */
 516     return FALSE;
 517 }
 518
 519 /*
 520  * pregexec and friends
 521  */
 522
 523 #ifndef PERL_IN_XSUB_RE
 524 /*
 525  - pregexec - match a regexp against a string
 526  */
 527 I32
 528 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 529          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 530 /* stringarg: the point in the string at which to begin matching */
 531 /* strend:    pointer to null at end of string */
 532 /* strbeg:    real beginning of string */
 533 /* minend:    end of match must be >= minend bytes after stringarg. */
 534 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 535  *            itself is accessed via the pointers above */
 536 /* nosave:    For optimizations. */
 537 {
 538     PERL_ARGS_ASSERT_PREGEXEC;
 539
 540     return
 541         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 542                       nosave ? 0 : REXEC_COPY_STR);
 543 }
 544 #endif
 545
 546 /*
 547  * Need to implement the following flags for reg_anch:
 548  *
 549  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 550  * USE_INTUIT_ML
 551  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 552  * INTUIT_AUTORITATIVE_ML
 553  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 554  * INTUIT_ONCE_ML
 555  *
 556  * Another flag for this function: SECOND_TIME (so that float substrs
 557  * with giant delta may be not rechecked).
 558  */
 559
 560 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 561    Otherwise, only SvCUR(sv) is used to get strbeg. */
 562
 563 /* XXXX Some places assume that there is a fixed substring.
 564         An update may be needed if optimizer marks as "INTUITable"
 565         RExen without fixed substrings.  Similarly, it is assumed that
 566         lengths of all the strings are no more than minlen, thus they
 567         cannot come from lookahead.
 568         (Or minlen should take into account lookahead.)
 569   NOTE: Some of this comment is not correct. minlen does now take account
 570   of lookahead/behind. Further research is required. -- demerphq
 571
 572 */
 573
 574 /* A failure to find a constant substring means that there is no need to make
 575    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 576    finding a substring too deep into the string means that fewer calls to
 577    regtry() should be needed.
 578
 579    REx compiler's optimizer found 4 possible hints:
 580         a) Anchored substring;
 581         b) Fixed substring;
 582         c) Whether we are anchored (beginning-of-line or \G);
 583         d) First node (of those at offset 0) which may distinguish positions;
 584    We use a)b)d) and multiline-part of c), and try to find a position in the
 585    string which does not contradict any of them.
 586  */
 587
 588 /* Most of decisions we do here should have been done at compile time.
 589    The nodes of the REx which we used for the search should have been
 590    deleted from the finite automaton. */
 591
 592 /* args:
 593  * rx:     the regex to match against
 594  * sv:     the SV being matched: only used for utf8 flag; the string
 595  *         itself is accessed via the pointers below. Note that on
 596  *         something like an overloaded SV, SvPOK(sv) may be false
 597  *         and the string pointers may point to something unrelated to
 598  *         the SV itself.
 599  * strbeg: real beginning of string
 600  * strpos: the point in the string at which to begin matching
 601  * strend: pointer to the byte following the last char of the string
 602  * flags   currently unused; set to 0
 603  * data:   currently unused; set to NULL
 604  */
 605
 606 char *
 607 Perl_re_intuit_start(pTHX_
 608                     REGEXP * const rx,
 609                     SV *sv,
 610                     const char * const strbeg,
 611                     char *strpos,
 612                     char *strend,
 613                     const U32 flags,
 614                     re_scream_pos_data *data)
 615 {
 616     dVAR;
 617     struct regexp *const prog = ReANY(rx);
 618     SSize_t start_shift = 0;
 619     /* Should be nonnegative! */
 620     SSize_t end_shift   = 0;
 621     char *s;
 622     SV *check;
 623     char *t;
 624     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 625     I32 ml_anch;
 626     char *other_last = NULL;    /* other substr checked before this */
 627     char *check_at = NULL;              /* check substr found at this pos */
 628     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 629     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 630     RXi_GET_DECL(prog,progi);
 631     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 632     regmatch_info *const reginfo = &reginfo_buf;
 633 #ifdef DEBUGGING
 634     const char * const i_strpos = strpos;
 635 #endif
 636     GET_RE_DEBUG_FLAGS_DECL;
 637
 638     PERL_ARGS_ASSERT_RE_INTUIT_START;
 639     PERL_UNUSED_ARG(flags);
 640     PERL_UNUSED_ARG(data);
 641
 642     /* CHR_DIST() would be more correct here but it makes things slow. */
 643     if (prog->minlen > strend - strpos) {
 644         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 645                               "String too short... [re_intuit_start]\n"));
 646         goto fail;
 647     }
 648
 649     reginfo->is_utf8_target = cBOOL(utf8_target);
 650     reginfo->info_aux = NULL;
 651     reginfo->strbeg = strbeg;
 652     reginfo->strend = strend;
 653     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 654     reginfo->intuit = 1;
 655     /* not actually used within intuit, but zero for safety anyway */
 656     reginfo->poscache_maxiter = 0;
 657
 658     if (utf8_target) {
 659         if (!prog->check_utf8 && prog->check_substr)
 660             to_utf8_substr(prog);
 661         check = prog->check_utf8;
 662     } else {
 663         if (!prog->check_substr && prog->check_utf8) {
 664             if (! to_byte_substr(prog)) {
 665                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 666             }
 667         }
 668         check = prog->check_substr;
 669     }
 670     if ((prog->extflags & RXf_ANCH)     /* Match at beg-of-str or after \n */
 671          && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
 672     {
 673         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 674                      || ( (prog->extflags & RXf_ANCH_BOL)
 675                           && !multiline ) );    /* Check after \n? */
 676
 677         if (!ml_anch) {
 678           if (    !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 679                && (strpos != strbeg)) {
 680               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 681               goto fail;
 682           }
 683           if (prog->check_offset_min == prog->check_offset_max
 684               && !(prog->extflags & RXf_CANY_SEEN)
 685               && ! multiline)   /* /m can cause \n's to match that aren't
 686                                    accounted for in the string max length.
 687                                    See [perl #115242] */
 688           {
 689             /* Substring at constant offset from beg-of-str... */
 690             SSize_t slen;
 691
 692             s = HOP3c(strpos, prog->check_offset_min, strend);
 693
 694             if (SvTAIL(check)) {
 695                 slen = SvCUR(check);    /* >= 1 */
 696
 697                 if ( strend - s > slen || strend - s < slen - 1
 698                      || (strend - s == slen && strend[-1] != '\n')) {
 699                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 700                     goto fail_finish;
 701                 }
 702                 /* Now should match s[0..slen-2] */
 703                 slen--;
 704                 if (slen && (*SvPVX_const(check) != *s
 705                              || (slen > 1
 706                                  && memNE(SvPVX_const(check), s, slen)))) {
 707                   report_neq:
 708                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 709                     goto fail_finish;
 710                 }
 711             }
 712             else if (*SvPVX_const(check) != *s
 713                      || ((slen = SvCUR(check)) > 1
 714                          && memNE(SvPVX_const(check), s, slen)))
 715                 goto report_neq;
 716             check_at = s;
 717             goto success_at_start;
 718           }
 719         }
 720         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 721         s = strpos;
 722         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 723         end_shift = prog->check_end_shift;
 724
 725         if (!ml_anch) {
 726             const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
 727                                          - (SvTAIL(check) != 0);
 728             const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 729
 730             if (end_shift < eshift)
 731                 end_shift = eshift;
 732         }
 733     }
 734     else {                              /* Can match at random position */
 735         ml_anch = 0;
 736         s = strpos;
 737         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 738         end_shift = prog->check_end_shift;
 739
 740         /* end shift should be non negative here */
 741     }
 742
 743 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 744     if (end_shift < 0)
 745         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 746                    (IV)end_shift, RX_PRECOMP(prog));
 747 #endif
 748
 749   restart:
 750     /* Find a possible match in the region s..strend by looking for
 751        the "check" substring in the region corrected by start/end_shift. */
 752
 753     {
 754         SSize_t srch_start_shift = start_shift;
 755         SSize_t srch_end_shift = end_shift;
 756         U8* start_point;
 757         U8* end_point;
 758         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 759             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 760             srch_start_shift = strbeg - s;
 761         }
 762     DEBUG_OPTIMISE_MORE_r({
 763         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 764             (IV)prog->check_offset_min,
 765             (IV)srch_start_shift,
 766             (IV)srch_end_shift,
 767             (IV)prog->check_end_shift);
 768     });
 769
 770         if (prog->extflags & RXf_CANY_SEEN) {
 771             start_point= (U8*)(s + srch_start_shift);
 772             end_point= (U8*)(strend - srch_end_shift);
 773         } else {
 774             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 775             end_point= HOP3(strend, -srch_end_shift, strbeg);
 776         }
 777         DEBUG_OPTIMISE_MORE_r({
 778             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 779                 (int)(end_point - start_point),
 780                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 781                 start_point);
 782         });
 783
 784         s = fbm_instr( start_point, end_point,
 785                       check, multiline ? FBMrf_MULTILINE : 0);
 786     }
 787     /* Update the count-of-usability, remove useless subpatterns,
 788         unshift s.  */
 789
 790     DEBUG_EXECUTE_r({
 791         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 792             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 793         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 794                           (s ? "Found" : "Did not find"),
 795             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 796                 ? "anchored" : "floating"),
 797             quoted,
 798             RE_SV_TAIL(check),
 799             (s ? " at offset " : "...\n") );
 800     });
 801
 802     if (!s)
 803         goto fail_finish;
 804     /* Finish the diagnostic message */
 805     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 806
 807     /* XXX dmq: first branch is for positive lookbehind...
 808        Our check string is offset from the beginning of the pattern.
 809        So we need to do any stclass tests offset forward from that
 810        point. I think. :-(
 811      */
 812
 813
 814
 815     check_at=s;
 816
 817
 818     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 819        Start with the other substr.
 820        XXXX no SCREAM optimization yet - and a very coarse implementation
 821        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 822                 *always* match.  Probably should be marked during compile...
 823        Probably it is right to do no SCREAM here...
 824      */
 825
 826     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 827                 : (prog->float_substr && prog->anchored_substr))
 828     {
 829         /* Take into account the "other" substring. */
 830         /* XXXX May be hopelessly wrong for UTF... */
 831         if (!other_last)
 832             other_last = strpos;
 833         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 834           do_other_anchored:
 835             {
 836                 char * const last = HOP3c(s, -start_shift, strbeg);
 837                 char *last1, *last2;
 838                 char * const saved_s = s;
 839                 SV* must;
 840
 841                 t = s - prog->check_offset_max;
 842                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 843                     && (!utf8_target
 844                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 845                             && t > strpos)))
 846                     NOOP;
 847                 else
 848                     t = strpos;
 849                 t = HOP3c(t, prog->anchored_offset, strend);
 850                 if (t < other_last)     /* These positions already checked */
 851                     t = other_last;
 852                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 853                 if (last < last1)
 854                     last1 = last;
 855                 /* XXXX It is not documented what units *_offsets are in.
 856                    We assume bytes, but this is clearly wrong.
 857                    Meaning this code needs to be carefully reviewed for errors.
 858                    dmq.
 859                   */
 860
 861                 /* On end-of-str: see comment below. */
 862                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 863                 if (must == &PL_sv_undef) {
 864                     s = (char*)NULL;
 865                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 866                 }
 867                 else
 868                     s = fbm_instr(
 869                         (unsigned char*)t,
 870                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 871                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 872                         must,
 873                         multiline ? FBMrf_MULTILINE : 0
 874                     );
 875                 DEBUG_EXECUTE_r({
 876                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 879                         (s ? "Found" : "Contradicts"),
 880                         quoted, RE_SV_TAIL(must));
 881                 });
 882
 883
 884                 if (!s) {
 885                     if (last1 >= last2) {
 886                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 887                                                 ", giving up...\n"));
 888                         goto fail_finish;
 889                     }
 890                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891                         ", trying floating at offset %ld...\n",
 892                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 893                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 894                     s = HOP3c(last, 1, strend);
 895                     goto restart;
 896                 }
 897                 else {
 898                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 899                           (long)(s - i_strpos)));
 900                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 901                     other_last = HOP3c(s, 1, strend);
 902                     s = saved_s;
 903                     if (t == strpos)
 904                         goto try_at_start;
 905                     goto try_at_offset;
 906                 }
 907             }
 908         }
 909         else {          /* Take into account the floating substring. */
 910             char *last, *last1;
 911             char * const saved_s = s;
 912             SV* must;
 913
 914             t = HOP3c(s, -start_shift, strbeg);
 915             last1 = last =
 916                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 917             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 918                 last = HOP3c(t, prog->float_max_offset, strend);
 919             s = HOP3c(t, prog->float_min_offset, strend);
 920             if (s < other_last)
 921                 s = other_last;
 922  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 923             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 924             /* fbm_instr() takes into account exact value of end-of-str
 925                if the check is SvTAIL(ed).  Since false positives are OK,
 926                and end-of-str is not later than strend we are OK. */
 927             if (must == &PL_sv_undef) {
 928                 s = (char*)NULL;
 929                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 930             }
 931             else
 932                 s = fbm_instr((unsigned char*)s,
 933                               (unsigned char*)last + SvCUR(must)
 934                                   - (SvTAIL(must)!=0),
 935                               must, multiline ? FBMrf_MULTILINE : 0);
 936             DEBUG_EXECUTE_r({
 937                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 938                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 939                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 940                     (s ? "Found" : "Contradicts"),
 941                     quoted, RE_SV_TAIL(must));
 942             });
 943             if (!s) {
 944                 if (last1 == last) {
 945                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 946                                             ", giving up...\n"));
 947                     goto fail_finish;
 948                 }
 949                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 950                     ", trying anchored starting at offset %ld...\n",
 951                     (long)(saved_s + 1 - i_strpos)));
 952                 other_last = last;
 953                 s = HOP3c(t, 1, strend);
 954                 goto restart;
 955             }
 956             else {
 957                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 958                       (long)(s - i_strpos)));
 959                 other_last = s; /* Fix this later. --Hugo */
 960                 s = saved_s;
 961                 if (t == strpos)
 962                     goto try_at_start;
 963                 goto try_at_offset;
 964             }
 965         }
 966     }
 967
 968
 969     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 970
 971     DEBUG_OPTIMISE_MORE_r(
 972         PerlIO_printf(Perl_debug_log,
 973             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 974             (IV)prog->check_offset_min,
 975             (IV)prog->check_offset_max,
 976             (IV)(s-strpos),
 977             (IV)(t-strpos),
 978             (IV)(t-s),
 979             (IV)(strend-strpos)
 980         )
 981     );
 982
 983     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 984         && (!utf8_target
 985             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 986                  && t > strpos)))
 987     {
 988         /* Fixed substring is found far enough so that the match
 989            cannot start at strpos. */
 990       try_at_offset:
 991         if (ml_anch && t[-1] != '\n') {
 992             /* Eventually fbm_*() should handle this, but often
 993                anchored_offset is not 0, so this check will not be wasted. */
 994             /* XXXX In the code below we prefer to look for "^" even in
 995                presence of anchored substrings.  And we search even
 996                beyond the found float position.  These pessimizations
 997                are historical artefacts only.  */
 998           find_anchor:
 999             while (t < strend - prog->minlen) {
1000                 if (*t == '\n') {
1001                     if (t < check_at - prog->check_offset_min) {
1002                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1003                             /* Since we moved from the found position,
1004                                we definitely contradict the found anchored
1005                                substr.  Due to the above check we do not
1006                                contradict "check" substr.
1007                                Thus we can arrive here only if check substr
1008                                is float.  Redo checking for "other"=="fixed".
1009                              */
1010                             strpos = t + 1;
1011                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1012                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1013                             goto do_other_anchored;
1014                         }
1015                         /* We don't contradict the found floating substring. */
1016                         /* XXXX Why not check for STCLASS? */
1017                         s = t + 1;
1018                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1019                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1020                         goto set_useful;
1021                     }
1022                     /* Position contradicts check-string */
1023                     /* XXXX probably better to look for check-string
1024                        than for "\n", so one should lower the limit for t? */
1025                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1026                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1027                     other_last = strpos = s = t + 1;
1028                     goto restart;
1029                 }
1030                 t++;
1031             }
1032             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1033                         PL_colors[0], PL_colors[1]));
1034             goto fail_finish;
1035         }
1036         else {
1037             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1038                         PL_colors[0], PL_colors[1]));
1039         }
1040         s = t;
1041       set_useful:
1042         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1043     }
1044     else {
1045         /* The found string does not prohibit matching at strpos,
1046            - no optimization of calling REx engine can be performed,
1047            unless it was an MBOL and we are not after MBOL,
1048            or a future STCLASS check will fail this. */
1049       try_at_start:
1050         /* Even in this situation we may use MBOL flag if strpos is offset
1051            wrt the start of the string. */
1052         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1053             /* May be due to an implicit anchor of m{.*foo}  */
1054             && !(prog->intflags & PREGf_IMPLICIT))
1055         {
1056             t = strpos;
1057             goto find_anchor;
1058         }
1059         DEBUG_EXECUTE_r( if (ml_anch)
1060             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1062         );
1063       success_at_start:
1064         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1065             && (utf8_target ? (
1066                 prog->check_utf8                /* Could be deleted already */
1067                 && --BmUSEFUL(prog->check_utf8) < 0
1068                 && (prog->check_utf8 == prog->float_utf8)
1069             ) : (
1070                 prog->check_substr              /* Could be deleted already */
1071                 && --BmUSEFUL(prog->check_substr) < 0
1072                 && (prog->check_substr == prog->float_substr)
1073             )))
1074         {
1075             /* If flags & SOMETHING - do not do it many times on the same match */
1076             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077             /* XXX Does the destruction order has to change with utf8_target? */
1078             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1081             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1082             check = NULL;                       /* abort */
1083             s = strpos;
1084             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086             if (prog->intflags & PREGf_IMPLICIT)
1087                 prog->extflags &= ~RXf_ANCH_MBOL;
1088             /* XXXX This is a remnant of the old implementation.  It
1089                     looks wasteful, since now INTUIT can use many
1090                     other heuristics. */
1091             prog->extflags &= ~RXf_USE_INTUIT;
1092             /* XXXX What other flags might need to be cleared in this branch? */
1093         }
1094         else
1095             s = strpos;
1096     }
1097
1098     /* Last resort... */
1099     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100     /* trie stclasses are too expensive to use here, we are better off to
1101        leave it to regmatch itself */
1102     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103         /* minlen == 0 is possible if regstclass is \b or \B,
1104            and the fixed substr is ''$.
1105            Since minlen is already taken into account, s+1 is before strend;
1106            accidentally, minlen >= 1 guaranties no false positives at s + 1
1107            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1108            regstclass does not come from lookahead...  */
1109         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1111         const U8* const str = (U8*)STRING(progi->regstclass);
1112         /* XXX this value could be pre-computed */
1113         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1114                     ?  (reginfo->is_utf8_pat
1115                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1116                         : STR_LEN(progi->regstclass))
1117                     : 1);
1118         char * endpos;
1119         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1120             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1121         else if (prog->float_substr || prog->float_utf8)
1122             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123         else
1124             endpos= strend;
1125
1126         if (checked_upto < s)
1127            checked_upto = s;
1128         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1129                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1130
1131         t = s;
1132         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1133                             reginfo);
1134         if (s) {
1135             checked_upto = s;
1136         } else {
1137 #ifdef DEBUGGING
1138             const char *what = NULL;
1139 #endif
1140             if (endpos == strend) {
1141                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                                 "Could not match STCLASS...\n") );
1143                 goto fail;
1144             }
1145             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146                                    "This position contradicts STCLASS...\n") );
1147             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1148                 goto fail;
1149             checked_upto = HOPBACKc(endpos, start_shift);
1150             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1151                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1152             /* Contradict one of substrings */
1153             if (prog->anchored_substr || prog->anchored_utf8) {
1154                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1155                     DEBUG_EXECUTE_r( what = "anchored" );
1156                   hop_and_restart:
1157                     s = HOP3c(t, 1, strend);
1158                     if (s + start_shift + end_shift > strend) {
1159                         /* XXXX Should be taken into account earlier? */
1160                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1161                                                "Could not match STCLASS...\n") );
1162                         goto fail;
1163                     }
1164                     if (!check)
1165                         goto giveup;
1166                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1167                                 "Looking for %s substr starting at offset %ld...\n",
1168                                  what, (long)(s + start_shift - i_strpos)) );
1169                     goto restart;
1170                 }
1171                 /* Have both, check_string is floating */
1172                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1173                     goto retry_floating_check;
1174                 /* Recheck anchored substring, but not floating... */
1175                 s = check_at;
1176                 if (!check)
1177                     goto giveup;
1178                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1179                           "Looking for anchored substr starting at offset %ld...\n",
1180                           (long)(other_last - i_strpos)) );
1181                 goto do_other_anchored;
1182             }
1183             /* Another way we could have checked stclass at the
1184                current position only: */
1185             if (ml_anch) {
1186                 s = t = t + 1;
1187                 if (!check)
1188                     goto giveup;
1189                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1190                           "Looking for /%s^%s/m starting at offset %ld...\n",
1191                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1192                 goto try_at_offset;
1193             }
1194             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1195                 goto fail;
1196             /* Check is floating substring. */
1197           retry_floating_check:
1198             t = check_at - start_shift;
1199             DEBUG_EXECUTE_r( what = "floating" );
1200             goto hop_and_restart;
1201         }
1202         if (t != s) {
1203             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204                         "By STCLASS: moving %ld --> %ld\n",
1205                                   (long)(t - i_strpos), (long)(s - i_strpos))
1206                    );
1207         }
1208         else {
1209             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1210                                   "Does not contradict STCLASS...\n");
1211                    );
1212         }
1213     }
1214   giveup:
1215     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1216                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1217                           PL_colors[5], (long)(s - i_strpos)) );
1218     return s;
1219
1220   fail_finish:                          /* Substring not found */
1221     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1222         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1223   fail:
1224     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1225                           PL_colors[4], PL_colors[5]));
1226     return NULL;
1227 }
1228
1229 #define DECL_TRIE_TYPE(scan) \
1230     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1231                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1232                     trie_type = ((scan->flags == EXACT) \
1233                               ? (utf8_target ? trie_utf8 : trie_plain) \
1234                               : (scan->flags == EXACTFA) \
1235                                 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1236                                 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1237
1238 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1239 STMT_START {                               \
1240     STRLEN skiplen;                                                                 \
1241     U8 flags = FOLD_FLAGS_FULL; \
1242     switch (trie_type) {                                                            \
1243     case trie_utf8_exactfa_fold:                                                            \
1244         flags |= FOLD_FLAGS_NOMIX_ASCII; \
1245         /* FALL THROUGH */ \
1246     case trie_utf8_fold:                                                            \
1247         if ( foldlen>0 ) {                                                          \
1248             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249             foldlen -= len;                                                         \
1250             uscan += len;                                                           \
1251             len=0;                                                                  \
1252         } else {                                                                    \
1253             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL);                \
1254             len = UTF8SKIP(uc);                                                     \
1255             skiplen = UNISKIP( uvc );                                               \
1256             foldlen -= skiplen;                                                     \
1257             uscan = foldbuf + skiplen;                                              \
1258         }                                                                           \
1259         break;                                                                      \
1260     case trie_latin_utf8_exactfa_fold: \
1261         flags |= FOLD_FLAGS_NOMIX_ASCII; \
1262         /* FALL THROUGH */ \
1263     case trie_latin_utf8_fold:                                                      \
1264         if ( foldlen>0 ) {                                                          \
1265             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1266             foldlen -= len;                                                         \
1267             uscan += len;                                                           \
1268             len=0;                                                                  \
1269         } else {                                                                    \
1270             len = 1;                                                                \
1271             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);   \
1272             skiplen = UNISKIP( uvc );                                               \
1273             foldlen -= skiplen;                                                     \
1274             uscan = foldbuf + skiplen;                                              \
1275         }                                                                           \
1276         break;                                                                      \
1277     case trie_utf8:                                                                 \
1278         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1279         break;                                                                      \
1280     case trie_plain:                                                                \
1281         uvc = (UV)*uc;                                                              \
1282         len = 1;                                                                    \
1283     }                                                                               \
1284     if (uvc < 256) {                                                                \
1285         charid = trie->charmap[ uvc ];                                              \
1286     }                                                                               \
1287     else {                                                                          \
1288         charid = 0;                                                                 \
1289         if (widecharmap) {                                                          \
1290             SV** const svpp = hv_fetch(widecharmap,                                 \
1291                         (char*)&uvc, sizeof(UV), 0);                                \
1292             if (svpp)                                                               \
1293                 charid = (U16)SvIV(*svpp);                                          \
1294         }                                                                           \
1295     }                                                                               \
1296 } STMT_END
1297
1298 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1299 STMT_START {                                              \
1300     while (s <= e) {                                      \
1301         if ( (CoNd)                                       \
1302              && (ln == 1 || folder(s, pat_string, ln))    \
1303              && (reginfo->intuit || regtry(reginfo, &s)) )\
1304             goto got_it;                                  \
1305         s++;                                              \
1306     }                                                     \
1307 } STMT_END
1308
1309 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1310 STMT_START {                                          \
1311     while (s < strend) {                              \
1312         CoDe                                          \
1313         s += UTF8SKIP(s);                             \
1314     }                                                 \
1315 } STMT_END
1316
1317 #define REXEC_FBC_SCAN(CoDe)                          \
1318 STMT_START {                                          \
1319     while (s < strend) {                              \
1320         CoDe                                          \
1321         s++;                                          \
1322     }                                                 \
1323 } STMT_END
1324
1325 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1326 REXEC_FBC_UTF8_SCAN(                                  \
1327     if (CoNd) {                                       \
1328         if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1329             goto got_it;                              \
1330         else                                          \
1331             tmp = doevery;                            \
1332     }                                                 \
1333     else                                              \
1334         tmp = 1;                                      \
1335 )
1336
1337 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1338 REXEC_FBC_SCAN(                                       \
1339     if (CoNd) {                                       \
1340         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1341             goto got_it;                              \
1342         else                                          \
1343             tmp = doevery;                            \
1344     }                                                 \
1345     else                                              \
1346         tmp = 1;                                      \
1347 )
1348
1349 #define REXEC_FBC_TRYIT               \
1350 if ((reginfo->intuit || regtry(reginfo, &s))) \
1351     goto got_it
1352
1353 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1354     if (utf8_target) {                                             \
1355         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1356     }                                                          \
1357     else {                                                     \
1358         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1359     }
1360
1361 #define DUMP_EXEC_POS(li,s,doutf8) \
1362     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1363                 startpos, doutf8)
1364
1365
1366 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1367         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1368         tmp = TEST_NON_UTF8(tmp);                                              \
1369         REXEC_FBC_UTF8_SCAN(                                                   \
1370             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1371                 tmp = !tmp;                                                    \
1372                 IF_SUCCESS;                                                    \
1373             }                                                                  \
1374             else {                                                             \
1375                 IF_FAIL;                                                       \
1376             }                                                                  \
1377         );                                                                     \
1378
1379 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1380         if (s == reginfo->strbeg) {                                            \
1381             tmp = '\n';                                                        \
1382         }                                                                      \
1383         else {                                                                 \
1384             U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1385             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1386         }                                                                      \
1387         tmp = TeSt1_UtF8;                                                      \
1388         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1389         REXEC_FBC_UTF8_SCAN(                                                   \
1390             if (tmp == ! (TeSt2_UtF8)) { \
1391                 tmp = !tmp;                                                    \
1392                 IF_SUCCESS;                                                    \
1393             }                                                                  \
1394             else {                                                             \
1395                 IF_FAIL;                                                       \
1396             }                                                                  \
1397         );                                                                     \
1398
1399 /* The only difference between the BOUND and NBOUND cases is that
1400  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1401  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1402  * with the other one being empty */
1403 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1405
1406 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1407     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1408
1409 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1410     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1411
1412 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1413     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1414
1415
1416 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1417  * be passed in completely with the variable name being tested, which isn't
1418  * such a clean interface, but this is easier to read than it was before.  We
1419  * are looking for the boundary (or non-boundary between a word and non-word
1420  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1421  * must be different.  Find the "wordness" of the character just prior to this
1422  * one, and compare it with the wordness of this one.  If they differ, we have
1423  * a boundary.  At the beginning of the string, pretend that the previous
1424  * character was a new-line */
1425 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1426     if (utf8_target) {                                                         \
1427                 UTF8_CODE \
1428     }                                                                          \
1429     else {  /* Not utf8 */                                                     \
1430         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1431         tmp = TEST_NON_UTF8(tmp);                                              \
1432         REXEC_FBC_SCAN(                                                        \
1433             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1434                 tmp = !tmp;                                                    \
1435                 IF_SUCCESS;                                                    \
1436             }                                                                  \
1437             else {                                                             \
1438                 IF_FAIL;                                                       \
1439             }                                                                  \
1440         );                                                                     \
1441     }                                                                          \
1442     if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))           \
1443         goto got_it;
1444
1445 /* We know what class REx starts with.  Try to find this position... */
1446 /* if reginfo->intuit, its a dryrun */
1447 /* annoyingly all the vars in this routine have different names from their counterparts
1448    in regmatch. /grrr */
1449
1450 STATIC char *
1451 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1452     const char *strend, regmatch_info *reginfo)
1453 {
1454     dVAR;
1455     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1456     char *pat_string;   /* The pattern's exactish string */
1457     char *pat_end;          /* ptr to end char of pat_string */
1458     re_fold_t folder;   /* Function for computing non-utf8 folds */
1459     const U8 *fold_array;   /* array for folding ords < 256 */
1460     STRLEN ln;
1461     STRLEN lnc;
1462     U8 c1;
1463     U8 c2;
1464     char *e;
1465     I32 tmp = 1;        /* Scratch variable? */
1466     const bool utf8_target = reginfo->is_utf8_target;
1467     UV utf8_fold_flags = 0;
1468     const bool is_utf8_pat = reginfo->is_utf8_pat;
1469     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1470                                    with a result inverts that result, as 0^1 =
1471                                    1 and 1^1 = 0 */
1472     _char_class_number classnum;
1473
1474     RXi_GET_DECL(prog,progi);
1475
1476     PERL_ARGS_ASSERT_FIND_BYCLASS;
1477
1478     /* We know what class it must start with. */
1479     switch (OP(c)) {
1480     case ANYOF:
1481     case ANYOF_SYNTHETIC:
1482     case ANYOF_WARN_SUPER:
1483         if (utf8_target) {
1484             REXEC_FBC_UTF8_CLASS_SCAN(
1485                       reginclass(prog, c, (U8*)s, utf8_target));
1486         }
1487         else {
1488             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1489         }
1490         break;
1491     case CANY:
1492         REXEC_FBC_SCAN(
1493             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1494                 goto got_it;
1495             else
1496                 tmp = doevery;
1497         );
1498         break;
1499
1500     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1501         assert(! is_utf8_pat);
1502         /* FALL THROUGH */
1503     case EXACTFA:
1504         if (is_utf8_pat || utf8_target) {
1505             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1506             goto do_exactf_utf8;
1507         }
1508         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1509         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1510         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1511
1512     case EXACTF:   /* This node only generated for non-utf8 patterns */
1513         assert(! is_utf8_pat);
1514         if (utf8_target) {
1515             utf8_fold_flags = 0;
1516             goto do_exactf_utf8;
1517         }
1518         fold_array = PL_fold;
1519         folder = foldEQ;
1520         goto do_exactf_non_utf8;
1521
1522     case EXACTFL:
1523         if (is_utf8_pat || utf8_target) {
1524             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1525             goto do_exactf_utf8;
1526         }
1527         fold_array = PL_fold_locale;
1528         folder = foldEQ_locale;
1529         goto do_exactf_non_utf8;
1530
1531     case EXACTFU_SS:
1532         if (is_utf8_pat) {
1533             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1534         }
1535         goto do_exactf_utf8;
1536
1537     case EXACTFU:
1538         if (is_utf8_pat || utf8_target) {
1539             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1540             goto do_exactf_utf8;
1541         }
1542
1543         /* Any 'ss' in the pattern should have been replaced by regcomp,
1544          * so we don't have to worry here about this single special case
1545          * in the Latin1 range */
1546         fold_array = PL_fold_latin1;
1547         folder = foldEQ_latin1;
1548
1549         /* FALL THROUGH */
1550
1551     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1552                            are no glitches with fold-length differences
1553                            between the target string and pattern */
1554
1555         /* The idea in the non-utf8 EXACTF* cases is to first find the
1556          * first character of the EXACTF* node and then, if necessary,
1557          * case-insensitively compare the full text of the node.  c1 is the
1558          * first character.  c2 is its fold.  This logic will not work for
1559          * Unicode semantics and the german sharp ss, which hence should
1560          * not be compiled into a node that gets here. */
1561         pat_string = STRING(c);
1562         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1563
1564         /* We know that we have to match at least 'ln' bytes (which is the
1565          * same as characters, since not utf8).  If we have to match 3
1566          * characters, and there are only 2 availabe, we know without
1567          * trying that it will fail; so don't start a match past the
1568          * required minimum number from the far end */
1569         e = HOP3c(strend, -((SSize_t)ln), s);
1570
1571         if (reginfo->intuit && e < s) {
1572             e = s;                      /* Due to minlen logic of intuit() */
1573         }
1574
1575         c1 = *pat_string;
1576         c2 = fold_array[c1];
1577         if (c1 == c2) { /* If char and fold are the same */
1578             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1579         }
1580         else {
1581             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1582         }
1583         break;
1584
1585     do_exactf_utf8:
1586     {
1587         unsigned expansion;
1588
1589         /* If one of the operands is in utf8, we can't use the simpler folding
1590          * above, due to the fact that many different characters can have the
1591          * same fold, or portion of a fold, or different- length fold */
1592         pat_string = STRING(c);
1593         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1594         pat_end = pat_string + ln;
1595         lnc = is_utf8_pat       /* length to match in characters */
1596                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1597                 : ln;
1598
1599         /* We have 'lnc' characters to match in the pattern, but because of
1600          * multi-character folding, each character in the target can match
1601          * up to 3 characters (Unicode guarantees it will never exceed
1602          * this) if it is utf8-encoded; and up to 2 if not (based on the
1603          * fact that the Latin 1 folds are already determined, and the
1604          * only multi-char fold in that range is the sharp-s folding to
1605          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1606          * string character.  Adjust lnc accordingly, rounding up, so that
1607          * if we need to match at least 4+1/3 chars, that really is 5. */
1608         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1609         lnc = (lnc + expansion - 1) / expansion;
1610
1611         /* As in the non-UTF8 case, if we have to match 3 characters, and
1612          * only 2 are left, it's guaranteed to fail, so don't start a
1613          * match that would require us to go beyond the end of the string
1614          */
1615         e = HOP3c(strend, -((SSize_t)lnc), s);
1616
1617         if (reginfo->intuit && e < s) {
1618             e = s;                      /* Due to minlen logic of intuit() */
1619         }
1620
1621         /* XXX Note that we could recalculate e to stop the loop earlier,
1622          * as the worst case expansion above will rarely be met, and as we
1623          * go along we would usually find that e moves further to the left.
1624          * This would happen only after we reached the point in the loop
1625          * where if there were no expansion we should fail.  Unclear if
1626          * worth the expense */
1627
1628         while (s <= e) {
1629             char *my_strend= (char *)strend;
1630             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1631                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1632                 && (reginfo->intuit || regtry(reginfo, &s)) )
1633             {
1634                 goto got_it;
1635             }
1636             s += (utf8_target) ? UTF8SKIP(s) : 1;
1637         }
1638         break;
1639     }
1640     case BOUNDL:
1641         RXp_MATCH_TAINTED_on(prog);
1642         FBC_BOUND(isWORDCHAR_LC,
1643                   isWORDCHAR_LC_uvchr(tmp),
1644                   isWORDCHAR_LC_utf8((U8*)s));
1645         break;
1646     case NBOUNDL:
1647         RXp_MATCH_TAINTED_on(prog);
1648         FBC_NBOUND(isWORDCHAR_LC,
1649                    isWORDCHAR_LC_uvchr(tmp),
1650                    isWORDCHAR_LC_utf8((U8*)s));
1651         break;
1652     case BOUND:
1653         FBC_BOUND(isWORDCHAR,
1654                   isWORDCHAR_uni(tmp),
1655                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656         break;
1657     case BOUNDA:
1658         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1659                          isWORDCHAR_A(tmp),
1660                          isWORDCHAR_A((U8*)s));
1661         break;
1662     case NBOUND:
1663         FBC_NBOUND(isWORDCHAR,
1664                    isWORDCHAR_uni(tmp),
1665                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666         break;
1667     case NBOUNDA:
1668         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1669                           isWORDCHAR_A(tmp),
1670                           isWORDCHAR_A((U8*)s));
1671         break;
1672     case BOUNDU:
1673         FBC_BOUND(isWORDCHAR_L1,
1674                   isWORDCHAR_uni(tmp),
1675                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1676         break;
1677     case NBOUNDU:
1678         FBC_NBOUND(isWORDCHAR_L1,
1679                    isWORDCHAR_uni(tmp),
1680                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1681         break;
1682     case LNBREAK:
1683         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1684                         is_LNBREAK_latin1_safe(s, strend)
1685         );
1686         break;
1687
1688     /* The argument to all the POSIX node types is the class number to pass to
1689      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1690
1691     case NPOSIXL:
1692         to_complement = 1;
1693         /* FALLTHROUGH */
1694
1695     case POSIXL:
1696         RXp_MATCH_TAINTED_on(prog);
1697         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1698                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1699         break;
1700
1701     case NPOSIXD:
1702         to_complement = 1;
1703         /* FALLTHROUGH */
1704
1705     case POSIXD:
1706         if (utf8_target) {
1707             goto posix_utf8;
1708         }
1709         goto posixa;
1710
1711     case NPOSIXA:
1712         if (utf8_target) {
1713             /* The complement of something that matches only ASCII matches all
1714              * UTF-8 variant code points, plus everything in ASCII that isn't
1715              * in the class */
1716             REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1717                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1718             break;
1719         }
1720
1721         to_complement = 1;
1722         /* FALLTHROUGH */
1723
1724     case POSIXA:
1725       posixa:
1726         /* Don't need to worry about utf8, as it can match only a single
1727          * byte invariant character. */
1728         REXEC_FBC_CLASS_SCAN(
1729                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1730         break;
1731
1732     case NPOSIXU:
1733         to_complement = 1;
1734         /* FALLTHROUGH */
1735
1736     case POSIXU:
1737         if (! utf8_target) {
1738             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1739                                                                     FLAGS(c))));
1740         }
1741         else {
1742
1743       posix_utf8:
1744             classnum = (_char_class_number) FLAGS(c);
1745             if (classnum < _FIRST_NON_SWASH_CC) {
1746                 while (s < strend) {
1747
1748                     /* We avoid loading in the swash as long as possible, but
1749                      * should we have to, we jump to a separate loop.  This
1750                      * extra 'if' statement is what keeps this code from being
1751                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1752                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1753                         goto found_above_latin1;
1754                     }
1755                     if ((UTF8_IS_INVARIANT(*s)
1756                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1757                                                                 classnum)))
1758                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1759                             && to_complement ^ cBOOL(
1760                                 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1761                                                                       *(s + 1)),
1762                                               classnum))))
1763                     {
1764                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1765                             goto got_it;
1766                         else {
1767                             tmp = doevery;
1768                         }
1769                     }
1770                     else {
1771                         tmp = 1;
1772                     }
1773                     s += UTF8SKIP(s);
1774                 }
1775             }
1776             else switch (classnum) {    /* These classes are implemented as
1777                                            macros */
1778                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1779                                         revert the change of \v matching this */
1780                     /* FALL THROUGH */
1781
1782                 case _CC_ENUM_PSXSPC:
1783                     REXEC_FBC_UTF8_CLASS_SCAN(
1784                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
1785                     break;
1786
1787                 case _CC_ENUM_BLANK:
1788                     REXEC_FBC_UTF8_CLASS_SCAN(
1789                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
1790                     break;
1791
1792                 case _CC_ENUM_XDIGIT:
1793                     REXEC_FBC_UTF8_CLASS_SCAN(
1794                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1795                     break;
1796
1797                 case _CC_ENUM_VERTSPACE:
1798                     REXEC_FBC_UTF8_CLASS_SCAN(
1799                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
1800                     break;
1801
1802                 case _CC_ENUM_CNTRL:
1803                     REXEC_FBC_UTF8_CLASS_SCAN(
1804                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
1805                     break;
1806
1807                 default:
1808                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1809                     assert(0); /* NOTREACHED */
1810             }
1811         }
1812         break;
1813
1814       found_above_latin1:   /* Here we have to load a swash to get the result
1815                                for the current code point */
1816         if (! PL_utf8_swash_ptrs[classnum]) {
1817             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1818             PL_utf8_swash_ptrs[classnum] =
1819                     _core_swash_init("utf8", swash_property_names[classnum],
1820                                      &PL_sv_undef, 1, 0, NULL, &flags);
1821         }
1822
1823         /* This is a copy of the loop above for swash classes, though using the
1824          * FBC macro instead of being expanded out.  Since we've loaded the
1825          * swash, we don't have to check for that each time through the loop */
1826         REXEC_FBC_UTF8_CLASS_SCAN(
1827                 to_complement ^ cBOOL(_generic_utf8(
1828                                       classnum,
1829                                       s,
1830                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
1831                                                   (U8 *) s, TRUE))));
1832         break;
1833
1834     case AHOCORASICKC:
1835     case AHOCORASICK:
1836         {
1837             DECL_TRIE_TYPE(c);
1838             /* what trie are we using right now */
1839             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1840             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1841             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1842
1843             const char *last_start = strend - trie->minlen;
1844 #ifdef DEBUGGING
1845             const char *real_start = s;
1846 #endif
1847             STRLEN maxlen = trie->maxlen;
1848             SV *sv_points;
1849             U8 **points; /* map of where we were in the input string
1850                             when reading a given char. For ASCII this
1851                             is unnecessary overhead as the relationship
1852                             is always 1:1, but for Unicode, especially
1853                             case folded Unicode this is not true. */
1854             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1855             U8 *bitmap=NULL;
1856
1857
1858             GET_RE_DEBUG_FLAGS_DECL;
1859
1860             /* We can't just allocate points here. We need to wrap it in
1861              * an SV so it gets freed properly if there is a croak while
1862              * running the match */
1863             ENTER;
1864             SAVETMPS;
1865             sv_points=newSV(maxlen * sizeof(U8 *));
1866             SvCUR_set(sv_points,
1867                 maxlen * sizeof(U8 *));
1868             SvPOK_on(sv_points);
1869             sv_2mortal(sv_points);
1870             points=(U8**)SvPV_nolen(sv_points );
1871             if ( trie_type != trie_utf8_fold
1872                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1873             {
1874                 if (trie->bitmap)
1875                     bitmap=(U8*)trie->bitmap;
1876                 else
1877                     bitmap=(U8*)ANYOF_BITMAP(c);
1878             }
1879             /* this is the Aho-Corasick algorithm modified a touch
1880                to include special handling for long "unknown char" sequences.
1881                The basic idea being that we use AC as long as we are dealing
1882                with a possible matching char, when we encounter an unknown char
1883                (and we have not encountered an accepting state) we scan forward
1884                until we find a legal starting char.
1885                AC matching is basically that of trie matching, except that when
1886                we encounter a failing transition, we fall back to the current
1887                states "fail state", and try the current char again, a process
1888                we repeat until we reach the root state, state 1, or a legal
1889                transition. If we fail on the root state then we can either
1890                terminate if we have reached an accepting state previously, or
1891                restart the entire process from the beginning if we have not.
1892
1893              */
1894             while (s <= last_start) {
1895                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1896                 U8 *uc = (U8*)s;
1897                 U16 charid = 0;
1898                 U32 base = 1;
1899                 U32 state = 1;
1900                 UV uvc = 0;
1901                 STRLEN len = 0;
1902                 STRLEN foldlen = 0;
1903                 U8 *uscan = (U8*)NULL;
1904                 U8 *leftmost = NULL;
1905 #ifdef DEBUGGING
1906                 U32 accepted_word= 0;
1907 #endif
1908                 U32 pointpos = 0;
1909
1910                 while ( state && uc <= (U8*)strend ) {
1911                     int failed=0;
1912                     U32 word = aho->states[ state ].wordnum;
1913
1914                     if( state==1 ) {
1915                         if ( bitmap ) {
1916                             DEBUG_TRIE_EXECUTE_r(
1917                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1919                                         (char *)uc, utf8_target );
1920                                     PerlIO_printf( Perl_debug_log,
1921                                         " Scanning for legal start char...\n");
1922                                 }
1923                             );
1924                             if (utf8_target) {
1925                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926                                     uc += UTF8SKIP(uc);
1927                                 }
1928                             } else {
1929                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1930                                     uc++;
1931                                 }
1932                             }
1933                             s= (char *)uc;
1934                         }
1935                         if (uc >(U8*)last_start) break;
1936                     }
1937
1938                     if ( word ) {
1939                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1940                         if (!leftmost || lpos < leftmost) {
1941                             DEBUG_r(accepted_word=word);
1942                             leftmost= lpos;
1943                         }
1944                         if (base==0) break;
1945
1946                     }
1947                     points[pointpos++ % maxlen]= uc;
1948                     if (foldlen || uc < (U8*)strend) {
1949                         REXEC_TRIE_READ_CHAR(trie_type, trie,
1950                                          widecharmap, uc,
1951                                          uscan, len, uvc, charid, foldlen,
1952                                          foldbuf, uniflags);
1953                         DEBUG_TRIE_EXECUTE_r({
1954                             dump_exec_pos( (char *)uc, c, strend,
1955                                         real_start, s, utf8_target);
1956                             PerlIO_printf(Perl_debug_log,
1957                                 " Charid:%3u CP:%4"UVxf" ",
1958                                  charid, uvc);
1959                         });
1960                     }
1961                     else {
1962                         len = 0;
1963                         charid = 0;
1964                     }
1965
1966
1967                     do {
1968 #ifdef DEBUGGING
1969                         word = aho->states[ state ].wordnum;
1970 #endif
1971                         base = aho->states[ state ].trans.base;
1972
1973                         DEBUG_TRIE_EXECUTE_r({
1974                             if (failed)
1975                                 dump_exec_pos( (char *)uc, c, strend, real_start,
1976                                     s,   utf8_target );
1977                             PerlIO_printf( Perl_debug_log,
1978                                 "%sState: %4"UVxf", word=%"UVxf,
1979                                 failed ? " Fail transition to " : "",
1980                                 (UV)state, (UV)word);
1981                         });
1982                         if ( base ) {
1983                             U32 tmp;
1984                             I32 offset;
1985                             if (charid &&
1986                                  ( ((offset = base + charid
1987                                     - 1 - trie->uniquecharcount)) >= 0)
1988                                  && ((U32)offset < trie->lasttrans)
1989                                  && trie->trans[offset].check == state
1990                                  && (tmp=trie->trans[offset].next))
1991                             {
1992                                 DEBUG_TRIE_EXECUTE_r(
1993                                     PerlIO_printf( Perl_debug_log," - legal\n"));
1994                                 state = tmp;
1995                                 break;
1996                             }
1997                             else {
1998                                 DEBUG_TRIE_EXECUTE_r(
1999                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2000                                 failed = 1;
2001                                 state = aho->fail[state];
2002                             }
2003                         }
2004                         else {
2005                             /* we must be accepting here */
2006                             DEBUG_TRIE_EXECUTE_r(
2007                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2008                             failed = 1;
2009                             break;
2010                         }
2011                     } while(state);
2012                     uc += len;
2013                     if (failed) {
2014                         if (leftmost)
2015                             break;
2016                         if (!state) state = 1;
2017                     }
2018                 }
2019                 if ( aho->states[ state ].wordnum ) {
2020                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2021                     if (!leftmost || lpos < leftmost) {
2022                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2023                         leftmost = lpos;
2024                     }
2025                 }
2026                 if (leftmost) {
2027                     s = (char*)leftmost;
2028                     DEBUG_TRIE_EXECUTE_r({
2029                         PerlIO_printf(
2030                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2031                             (UV)accepted_word, (IV)(s - real_start)
2032                         );
2033                     });
2034                     if (reginfo->intuit || regtry(reginfo, &s)) {
2035                         FREETMPS;
2036                         LEAVE;
2037                         goto got_it;
2038                     }
2039                     s = HOPc(s,1);
2040                     DEBUG_TRIE_EXECUTE_r({
2041                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2042                     });
2043                 } else {
2044                     DEBUG_TRIE_EXECUTE_r(
2045                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2046                     break;
2047                 }
2048             }
2049             FREETMPS;
2050             LEAVE;
2051         }
2052         break;
2053     default:
2054         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2055         break;
2056     }
2057     return 0;
2058   got_it:
2059     return s;
2060 }
2061
2062 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2063  * flags have same meanings as with regexec_flags() */
2064
2065 static void
2066 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2067                             char *strbeg,
2068                             char *strend,
2069                             SV *sv,
2070                             U32 flags,
2071                             bool utf8_target)
2072 {
2073     struct regexp *const prog = ReANY(rx);
2074
2075     if (flags & REXEC_COPY_STR) {
2076 #ifdef PERL_ANY_COW
2077         if (SvCANCOW(sv)) {
2078             if (DEBUG_C_TEST) {
2079                 PerlIO_printf(Perl_debug_log,
2080                               "Copy on write: regexp capture, type %d\n",
2081                               (int) SvTYPE(sv));
2082             }
2083             /* Create a new COW SV to share the match string and store
2084              * in saved_copy, unless the current COW SV in saved_copy
2085              * is valid and suitable for our purpose */
2086             if ((   prog->saved_copy
2087                  && SvIsCOW(prog->saved_copy)
2088                  && SvPOKp(prog->saved_copy)
2089                  && SvIsCOW(sv)
2090                  && SvPOKp(sv)
2091                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2092             {
2093                 /* just reuse saved_copy SV */
2094                 if (RXp_MATCH_COPIED(prog)) {
2095                     Safefree(prog->subbeg);
2096                     RXp_MATCH_COPIED_off(prog);
2097                 }
2098             }
2099             else {
2100                 /* create new COW SV to share string */
2101                 RX_MATCH_COPY_FREE(rx);
2102                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2103             }
2104             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2105             assert (SvPOKp(prog->saved_copy));
2106             prog->sublen  = strend - strbeg;
2107             prog->suboffset = 0;
2108             prog->subcoffset = 0;
2109         } else
2110 #endif
2111         {
2112             SSize_t min = 0;
2113             SSize_t max = strend - strbeg;
2114             SSize_t sublen;
2115
2116             if (    (flags & REXEC_COPY_SKIP_POST)
2117                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2118                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2119             ) { /* don't copy $' part of string */
2120                 U32 n = 0;
2121                 max = -1;
2122                 /* calculate the right-most part of the string covered
2123                  * by a capture. Due to look-ahead, this may be to
2124                  * the right of $&, so we have to scan all captures */
2125                 while (n <= prog->lastparen) {
2126                     if (prog->offs[n].end > max)
2127                         max = prog->offs[n].end;
2128                     n++;
2129                 }
2130                 if (max == -1)
2131                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2132                             ? prog->offs[0].start
2133                             : 0;
2134                 assert(max >= 0 && max <= strend - strbeg);
2135             }
2136
2137             if (    (flags & REXEC_COPY_SKIP_PRE)
2138                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2139                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2140             ) { /* don't copy $` part of string */
2141                 U32 n = 0;
2142                 min = max;
2143                 /* calculate the left-most part of the string covered
2144                  * by a capture. Due to look-behind, this may be to
2145                  * the left of $&, so we have to scan all captures */
2146                 while (min && n <= prog->lastparen) {
2147                     if (   prog->offs[n].start != -1
2148                         && prog->offs[n].start < min)
2149                     {
2150                         min = prog->offs[n].start;
2151                     }
2152                     n++;
2153                 }
2154                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2155                     && min >  prog->offs[0].end
2156                 )
2157                     min = prog->offs[0].end;
2158
2159             }
2160
2161             assert(min >= 0 && min <= max && min <= strend - strbeg);
2162             sublen = max - min;
2163
2164             if (RX_MATCH_COPIED(rx)) {
2165                 if (sublen > prog->sublen)
2166                     prog->subbeg =
2167                             (char*)saferealloc(prog->subbeg, sublen+1);
2168             }
2169             else
2170                 prog->subbeg = (char*)safemalloc(sublen+1);
2171             Copy(strbeg + min, prog->subbeg, sublen, char);
2172             prog->subbeg[sublen] = '\0';
2173             prog->suboffset = min;
2174             prog->sublen = sublen;
2175             RX_MATCH_COPIED_on(rx);
2176         }
2177         prog->subcoffset = prog->suboffset;
2178         if (prog->suboffset && utf8_target) {
2179             /* Convert byte offset to chars.
2180              * XXX ideally should only compute this if @-/@+
2181              * has been seen, a la PL_sawampersand ??? */
2182
2183             /* If there's a direct correspondence between the
2184              * string which we're matching and the original SV,
2185              * then we can use the utf8 len cache associated with
2186              * the SV. In particular, it means that under //g,
2187              * sv_pos_b2u() will use the previously cached
2188              * position to speed up working out the new length of
2189              * subcoffset, rather than counting from the start of
2190              * the string each time. This stops
2191              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2192              * from going quadratic */
2193             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2194                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2195                                                 SV_GMAGIC|SV_CONST_RETURN);
2196             else
2197                 prog->subcoffset = utf8_length((U8*)strbeg,
2198                                     (U8*)(strbeg+prog->suboffset));
2199         }
2200     }
2201     else {
2202         RX_MATCH_COPY_FREE(rx);
2203         prog->subbeg = strbeg;
2204         prog->suboffset = 0;
2205         prog->subcoffset = 0;
2206         prog->sublen = strend - strbeg;
2207     }
2208 }
2209
2210
2211
2212
2213 /*
2214  - regexec_flags - match a regexp against a string
2215  */
2216 I32
2217 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2218               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2219 /* stringarg: the point in the string at which to begin matching */
2220 /* strend:    pointer to null at end of string */
2221 /* strbeg:    real beginning of string */
2222 /* minend:    end of match must be >= minend bytes after stringarg. */
2223 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2224  *            itself is accessed via the pointers above */
2225 /* data:      May be used for some additional optimizations.
2226               Currently unused. */
2227 /* flags:     For optimizations. See REXEC_* in regexp.h */
2228
2229 {
2230     dVAR;
2231     struct regexp *const prog = ReANY(rx);
2232     char *s;
2233     regnode *c;
2234     char *startpos;
2235     SSize_t minlen;             /* must match at least this many chars */
2236     SSize_t dontbother = 0;     /* how many characters not to try at end */
2237     const bool utf8_target = cBOOL(DO_UTF8(sv));
2238     I32 multiline;
2239     RXi_GET_DECL(prog,progi);
2240     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2241     regmatch_info *const reginfo = &reginfo_buf;
2242     regexp_paren_pair *swap = NULL;
2243     I32 oldsave;
2244     GET_RE_DEBUG_FLAGS_DECL;
2245
2246     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2247     PERL_UNUSED_ARG(data);
2248
2249     /* Be paranoid... */
2250     if (prog == NULL || stringarg == NULL) {
2251         Perl_croak(aTHX_ "NULL regexp parameter");
2252         return 0;
2253     }
2254
2255     DEBUG_EXECUTE_r(
2256         debug_start_match(rx, utf8_target, stringarg, strend,
2257         "Matching");
2258     );
2259
2260     startpos = stringarg;
2261
2262     if (prog->extflags & RXf_GPOS_SEEN) {
2263         MAGIC *mg;
2264
2265         /* set reginfo->ganch, the position where \G can match */
2266
2267         reginfo->ganch =
2268             (flags & REXEC_IGNOREPOS)
2269             ? stringarg /* use start pos rather than pos() */
2270             : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2271               /* Defined pos(): */
2272             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2273             : strbeg; /* pos() not defined; use start of string */
2274
2275         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2276             "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
2277
2278         /* in the presence of \G, we may need to start looking earlier in
2279          * the string than the suggested start point of stringarg:
2280          * if gofs->prog is set, then that's a known, fixed minimum
2281          * offset, such as
2282          * /..\G/:   gofs = 2
2283          * /ab|c\G/: gofs = 1
2284          * or if the minimum offset isn't known, then we have to go back
2285          * to the start of the string, e.g. /w+\G/
2286          */
2287
2288         if (prog->extflags & RXf_ANCH_GPOS) {
2289             startpos  = reginfo->ganch - prog->gofs;
2290             if (startpos <
2291                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2292             {
2293                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2294                         "fail: ganch-gofs before earliest possible start\n"));
2295                 return 0;
2296             }
2297         }
2298         else if (prog->gofs) {
2299             if (startpos - prog->gofs < strbeg)
2300                 startpos = strbeg;
2301             else
2302                 startpos -= prog->gofs;
2303         }
2304         else if (prog->extflags & RXf_GPOS_FLOAT)
2305             startpos = strbeg;
2306     }
2307
2308     minlen = prog->minlen;
2309     if ((startpos + minlen) > strend || startpos < strbeg) {
2310         DEBUG_r(PerlIO_printf(Perl_debug_log,
2311                     "Regex match can't succeed, so not even tried\n"));
2312         return 0;
2313     }
2314
2315     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2316      * which will call destuctors to reset PL_regmatch_state, free higher
2317      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2318      * regmatch_info_aux_eval */
2319
2320     oldsave = PL_savestack_ix;
2321
2322     s = startpos;
2323
2324     if ((prog->extflags & RXf_USE_INTUIT)
2325         && !(flags & REXEC_CHECKED))
2326     {
2327         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2328                                     flags, NULL);
2329         if (!s)
2330             return 0;
2331
2332         if (prog->extflags & RXf_CHECK_ALL) {
2333             /* we can match based purely on the result of INTUIT.
2334              * Set up captures etc just for $& and $-[0]
2335              * (an intuit-only match wont have $1,$2,..) */
2336             assert(!prog->nparens);
2337
2338             /* s/// doesn't like it if $& is earlier than where we asked it to
2339              * start searching (which can happen on something like /.\G/) */
2340             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2341                     && (s < stringarg))
2342             {
2343                 /* this should only be possible under \G */
2344                 assert(prog->extflags & RXf_GPOS_SEEN);
2345                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2347                 goto phooey;
2348             }
2349
2350             /* match via INTUIT shouldn't have any captures.
2351              * Let @-, @+, $^N know */
2352             prog->lastparen = prog->lastcloseparen = 0;
2353             RX_MATCH_UTF8_set(rx, utf8_target);
2354             prog->offs[0].start = s - strbeg;
2355             prog->offs[0].end = utf8_target
2356                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2357                 : s - strbeg + prog->minlenret;
2358             if ( !(flags & REXEC_NOT_FIRST) )
2359                 S_reg_set_capture_string(aTHX_ rx,
2360                                         strbeg, strend,
2361                                         sv, flags, utf8_target);
2362
2363             return 1;
2364         }
2365     }
2366
2367     multiline = prog->extflags & RXf_PMf_MULTILINE;
2368
2369     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2370         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2371                               "String too short [regexec_flags]...\n"));
2372         goto phooey;
2373     }
2374
2375     /* Check validity of program. */
2376     if (UCHARAT(progi->program) != REG_MAGIC) {
2377         Perl_croak(aTHX_ "corrupted regexp program");
2378     }
2379
2380     RX_MATCH_TAINTED_off(rx);
2381
2382     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2383     reginfo->intuit = 0;
2384     reginfo->is_utf8_target = cBOOL(utf8_target);
2385     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2386     reginfo->warned = FALSE;
2387     reginfo->strbeg  = strbeg;
2388     reginfo->sv = sv;
2389     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2390     reginfo->strend = strend;
2391     /* see how far we have to get to not match where we matched before */
2392     reginfo->till = stringarg + minend;
2393
2394     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2395         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2396            S_cleanup_regmatch_info_aux has executed (registered by
2397            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2398            magic belonging to this SV.
2399            Not newSVsv, either, as it does not COW.
2400         */
2401         reginfo->sv = newSV(0);
2402         sv_setsv(reginfo->sv, sv);
2403         SAVEFREESV(reginfo->sv);
2404     }
2405
2406     /* reserve next 2 or 3 slots in PL_regmatch_state:
2407      * slot N+0: may currently be in use: skip it
2408      * slot N+1: use for regmatch_info_aux struct
2409      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2410      * slot N+3: ready for use by regmatch()
2411      */
2412
2413     {
2414         regmatch_state *old_regmatch_state;
2415         regmatch_slab  *old_regmatch_slab;
2416         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2417
2418         /* on first ever match, allocate first slab */
2419         if (!PL_regmatch_slab) {
2420             Newx(PL_regmatch_slab, 1, regmatch_slab);
2421             PL_regmatch_slab->prev = NULL;
2422             PL_regmatch_slab->next = NULL;
2423             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2424         }
2425
2426         old_regmatch_state = PL_regmatch_state;
2427         old_regmatch_slab  = PL_regmatch_slab;
2428
2429         for (i=0; i <= max; i++) {
2430             if (i == 1)
2431                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2432             else if (i ==2)
2433                 reginfo->info_aux_eval =
2434                 reginfo->info_aux->info_aux_eval =
2435                             &(PL_regmatch_state->u.info_aux_eval);
2436
2437             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2438                 PL_regmatch_state = S_push_slab(aTHX);
2439         }
2440
2441         /* note initial PL_regmatch_state position; at end of match we'll
2442          * pop back to there and free any higher slabs */
2443
2444         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2445         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2446         reginfo->info_aux->poscache = NULL;
2447
2448         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2449
2450         if ((prog->extflags & RXf_EVAL_SEEN))
2451             S_setup_eval_state(aTHX_ reginfo);
2452         else
2453             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2454     }
2455
2456     /* If there is a "must appear" string, look for it. */
2457
2458     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2459         /* We have to be careful. If the previous successful match
2460            was from this regex we don't want a subsequent partially
2461            successful match to clobber the old results.
2462            So when we detect this possibility we add a swap buffer
2463            to the re, and switch the buffer each match. If we fail,
2464            we switch it back; otherwise we leave it swapped.
2465         */
2466         swap = prog->offs;
2467         /* do we need a save destructor here for eval dies? */
2468         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2469         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2470             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2471             PTR2UV(prog),
2472             PTR2UV(swap),
2473             PTR2UV(prog->offs)
2474         ));
2475     }
2476
2477     /* Simplest case:  anchored match need be tried only once. */
2478     /*  [unless only anchor is BOL and multiline is set] */
2479     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2480         if (s == startpos && regtry(reginfo, &s))
2481             goto got_it;
2482         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2483                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2484         {
2485             char *end;
2486
2487             if (minlen)
2488                 dontbother = minlen - 1;
2489             end = HOP3c(strend, -dontbother, strbeg) - 1;
2490             /* for multiline we only have to try after newlines */
2491             if (prog->check_substr || prog->check_utf8) {
2492                 /* because of the goto we can not easily reuse the macros for bifurcating the
2493                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2494                 if (utf8_target) {
2495                     if (s == startpos)
2496                         goto after_try_utf8;
2497                     while (1) {
2498                         if (regtry(reginfo, &s)) {
2499                             goto got_it;
2500                         }
2501                       after_try_utf8:
2502                         if (s > end) {
2503                             goto phooey;
2504                         }
2505                         if (prog->extflags & RXf_USE_INTUIT) {
2506                             s = re_intuit_start(rx, sv, strbeg,
2507                                     s + UTF8SKIP(s), strend, flags, NULL);
2508                             if (!s) {
2509                                 goto phooey;
2510                             }
2511                         }
2512                         else {
2513                             s += UTF8SKIP(s);
2514                         }
2515                     }
2516                 } /* end search for check string in unicode */
2517                 else {
2518                     if (s == startpos) {
2519                         goto after_try_latin;
2520                     }
2521                     while (1) {
2522                         if (regtry(reginfo, &s)) {
2523                             goto got_it;
2524                         }
2525                       after_try_latin:
2526                         if (s > end) {
2527                             goto phooey;
2528                         }
2529                         if (prog->extflags & RXf_USE_INTUIT) {
2530                             s = re_intuit_start(rx, sv, strbeg,
2531                                         s + 1, strend, flags, NULL);
2532                             if (!s) {
2533                                 goto phooey;
2534                             }
2535                         }
2536                         else {
2537                             s++;
2538                         }
2539                     }
2540                 } /* end search for check string in latin*/
2541             } /* end search for check string */
2542             else { /* search for newline */
2543                 if (s > startpos) {
2544                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2545                     s--;
2546                 }
2547                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2548                 while (s <= end) { /* note it could be possible to match at the end of the string */
2549                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2550                         if (regtry(reginfo, &s))
2551                             goto got_it;
2552                     }
2553                 }
2554             } /* end search for newline */
2555         } /* end anchored/multiline check string search */
2556         goto phooey;
2557     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2558     {
2559         /* For anchored \G, the only position it can match from is
2560          * (ganch-gofs); we already set startpos to this above; if intuit
2561          * moved us on from there, we can't possibly succeed */
2562         assert(startpos == reginfo->ganch - prog->gofs);
2563         if (s == startpos && regtry(reginfo, &s))
2564             goto got_it;
2565         goto phooey;
2566     }
2567
2568     /* Messy cases:  unanchored match. */
2569     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2570         /* we have /x+whatever/ */
2571         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2572         char ch;
2573 #ifdef DEBUGGING
2574         int did_match = 0;
2575 #endif
2576         if (utf8_target) {
2577             if (! prog->anchored_utf8) {
2578                 to_utf8_substr(prog);
2579             }
2580             ch = SvPVX_const(prog->anchored_utf8)[0];
2581             REXEC_FBC_SCAN(
2582                 if (*s == ch) {
2583                     DEBUG_EXECUTE_r( did_match = 1 );
2584                     if (regtry(reginfo, &s)) goto got_it;
2585                     s += UTF8SKIP(s);
2586                     while (s < strend && *s == ch)
2587                         s += UTF8SKIP(s);
2588                 }
2589             );
2590
2591         }
2592         else {
2593             if (! prog->anchored_substr) {
2594                 if (! to_byte_substr(prog)) {
2595                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2596                 }
2597             }
2598             ch = SvPVX_const(prog->anchored_substr)[0];
2599             REXEC_FBC_SCAN(
2600                 if (*s == ch) {
2601                     DEBUG_EXECUTE_r( did_match = 1 );
2602                     if (regtry(reginfo, &s)) goto got_it;
2603                     s++;
2604                     while (s < strend && *s == ch)
2605                         s++;
2606                 }
2607             );
2608         }
2609         DEBUG_EXECUTE_r(if (!did_match)
2610                 PerlIO_printf(Perl_debug_log,
2611                                   "Did not find anchored character...\n")
2612                );
2613     }
2614     else if (prog->anchored_substr != NULL
2615               || prog->anchored_utf8 != NULL
2616               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2617                   && prog->float_max_offset < strend - s)) {
2618         SV *must;
2619         SSize_t back_max;
2620         SSize_t back_min;
2621         char *last;
2622         char *last1;            /* Last position checked before */
2623 #ifdef DEBUGGING
2624         int did_match = 0;
2625 #endif
2626         if (prog->anchored_substr || prog->anchored_utf8) {
2627             if (utf8_target) {
2628                 if (! prog->anchored_utf8) {
2629                     to_utf8_substr(prog);
2630                 }
2631                 must = prog->anchored_utf8;
2632             }
2633             else {
2634                 if (! prog->anchored_substr) {
2635                     if (! to_byte_substr(prog)) {
2636                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2637                     }
2638                 }
2639                 must = prog->anchored_substr;
2640             }
2641             back_max = back_min = prog->anchored_offset;
2642         } else {
2643             if (utf8_target) {
2644                 if (! prog->float_utf8) {
2645                     to_utf8_substr(prog);
2646                 }
2647                 must = prog->float_utf8;
2648             }
2649             else {
2650                 if (! prog->float_substr) {
2651                     if (! to_byte_substr(prog)) {
2652                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2653                     }
2654                 }
2655                 must = prog->float_substr;
2656             }
2657             back_max = prog->float_max_offset;
2658             back_min = prog->float_min_offset;
2659         }
2660
2661         if (back_min<0) {
2662             last = strend;
2663         } else {
2664             last = HOP3c(strend,        /* Cannot start after this */
2665                   -(SSize_t)(CHR_SVLEN(must)
2666                          - (SvTAIL(must) != 0) + back_min), strbeg);
2667         }
2668         if (s > reginfo->strbeg)
2669             last1 = HOPc(s, -1);
2670         else
2671             last1 = s - 1;      /* bogus */
2672
2673         /* XXXX check_substr already used to find "s", can optimize if
2674            check_substr==must. */
2675         dontbother = 0;
2676         strend = HOPc(strend, -dontbother);
2677         while ( (s <= last) &&
2678                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2679                                   (unsigned char*)strend, must,
2680                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2681             DEBUG_EXECUTE_r( did_match = 1 );
2682             if (HOPc(s, -back_max) > last1) {
2683                 last1 = HOPc(s, -back_min);
2684                 s = HOPc(s, -back_max);
2685             }
2686             else {
2687                 char * const t = (last1 >= reginfo->strbeg)
2688                                     ? HOPc(last1, 1) : last1 + 1;
2689
2690                 last1 = HOPc(s, -back_min);
2691                 s = t;
2692             }
2693             if (utf8_target) {
2694                 while (s <= last1) {
2695                     if (regtry(reginfo, &s))
2696                         goto got_it;
2697                     if (s >= last1) {
2698                         s++; /* to break out of outer loop */
2699                         break;
2700                     }
2701                     s += UTF8SKIP(s);
2702                 }
2703             }
2704             else {
2705                 while (s <= last1) {
2706                     if (regtry(reginfo, &s))
2707                         goto got_it;
2708                     s++;
2709                 }
2710             }
2711         }
2712         DEBUG_EXECUTE_r(if (!did_match) {
2713             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2714                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2715             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2716                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2717                                ? "anchored" : "floating"),
2718                 quoted, RE_SV_TAIL(must));
2719         });
2720         goto phooey;
2721     }
2722     else if ( (c = progi->regstclass) ) {
2723         if (minlen) {
2724             const OPCODE op = OP(progi->regstclass);
2725             /* don't bother with what can't match */
2726             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2727                 strend = HOPc(strend, -(minlen - 1));
2728         }
2729         DEBUG_EXECUTE_r({
2730             SV * const prop = sv_newmortal();
2731             regprop(prog, prop, c);
2732             {
2733                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2734                     s,strend-s,60);
2735                 PerlIO_printf(Perl_debug_log,
2736                     "Matching stclass %.*s against %s (%d bytes)\n",
2737                     (int)SvCUR(prop), SvPVX_const(prop),
2738                      quoted, (int)(strend - s));
2739             }
2740         });
2741         if (find_byclass(prog, c, s, strend, reginfo))
2742             goto got_it;
2743         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2744     }
2745     else {
2746         dontbother = 0;
2747         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2748             /* Trim the end. */
2749             char *last= NULL;
2750             SV* float_real;
2751             STRLEN len;
2752             const char *little;
2753
2754             if (utf8_target) {
2755                 if (! prog->float_utf8) {
2756                     to_utf8_substr(prog);
2757                 }
2758                 float_real = prog->float_utf8;
2759             }
2760             else {
2761                 if (! prog->float_substr) {
2762                     if (! to_byte_substr(prog)) {
2763                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2764                     }
2765                 }
2766                 float_real = prog->float_substr;
2767             }
2768
2769             little = SvPV_const(float_real, len);
2770             if (SvTAIL(float_real)) {
2771                     /* This means that float_real contains an artificial \n on
2772                      * the end due to the presence of something like this:
2773                      * /foo$/ where we can match both "foo" and "foo\n" at the
2774                      * end of the string.  So we have to compare the end of the
2775                      * string first against the float_real without the \n and
2776                      * then against the full float_real with the string.  We
2777                      * have to watch out for cases where the string might be
2778                      * smaller than the float_real or the float_real without
2779                      * the \n. */
2780                     char *checkpos= strend - len;
2781                     DEBUG_OPTIMISE_r(
2782                         PerlIO_printf(Perl_debug_log,
2783                             "%sChecking for float_real.%s\n",
2784                             PL_colors[4], PL_colors[5]));
2785                     if (checkpos + 1 < strbeg) {
2786                         /* can't match, even if we remove the trailing \n
2787                          * string is too short to match */
2788                         DEBUG_EXECUTE_r(
2789                             PerlIO_printf(Perl_debug_log,
2790                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2791                                 PL_colors[4], PL_colors[5]));
2792                         goto phooey;
2793                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2794                         /* can match, the end of the string matches without the
2795                          * "\n" */
2796                         last = checkpos + 1;
2797                     } else if (checkpos < strbeg) {
2798                         /* cant match, string is too short when the "\n" is
2799                          * included */
2800                         DEBUG_EXECUTE_r(
2801                             PerlIO_printf(Perl_debug_log,
2802                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2803                                 PL_colors[4], PL_colors[5]));
2804                         goto phooey;
2805                     } else if (!multiline) {
2806                         /* non multiline match, so compare with the "\n" at the
2807                          * end of the string */
2808                         if (memEQ(checkpos, little, len)) {
2809                             last= checkpos;
2810                         } else {
2811                             DEBUG_EXECUTE_r(
2812                                 PerlIO_printf(Perl_debug_log,
2813                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2814                                     PL_colors[4], PL_colors[5]));
2815                             goto phooey;
2816                         }
2817                     } else {
2818                         /* multiline match, so we have to search for a place
2819                          * where the full string is located */
2820                         goto find_last;
2821                     }
2822             } else {
2823                   find_last:
2824                     if (len)
2825                         last = rninstr(s, strend, little, little + len);
2826                     else
2827                         last = strend;  /* matching "$" */
2828             }
2829             if (!last) {
2830                 /* at one point this block contained a comment which was
2831                  * probably incorrect, which said that this was a "should not
2832                  * happen" case.  Even if it was true when it was written I am
2833                  * pretty sure it is not anymore, so I have removed the comment
2834                  * and replaced it with this one. Yves */
2835                 DEBUG_EXECUTE_r(
2836                     PerlIO_printf(Perl_debug_log,
2837                         "String does not contain required substring, cannot match.\n"
2838                     ));
2839                 goto phooey;
2840             }
2841             dontbother = strend - last + prog->float_min_offset;
2842         }
2843         if (minlen && (dontbother < minlen))
2844             dontbother = minlen - 1;
2845         strend -= dontbother;              /* this one's always in bytes! */
2846         /* We don't know much -- general case. */
2847         if (utf8_target) {
2848             for (;;) {
2849                 if (regtry(reginfo, &s))
2850                     goto got_it;
2851                 if (s >= strend)
2852                     break;
2853                 s += UTF8SKIP(s);
2854             };
2855         }
2856         else {
2857             do {
2858                 if (regtry(reginfo, &s))
2859                     goto got_it;
2860             } while (s++ < strend);
2861         }
2862     }
2863
2864     /* Failure. */
2865     goto phooey;
2866
2867 got_it:
2868     /* s/// doesn't like it if $& is earlier than where we asked it to
2869      * start searching (which can happen on something like /.\G/) */
2870     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2871             && (prog->offs[0].start < stringarg - strbeg))
2872     {
2873         /* this should only be possible under \G */
2874         assert(prog->extflags & RXf_GPOS_SEEN);
2875         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2876             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2877         goto phooey;
2878     }
2879
2880     DEBUG_BUFFERS_r(
2881         if (swap)
2882             PerlIO_printf(Perl_debug_log,
2883                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2884                 PTR2UV(prog),
2885                 PTR2UV(swap)
2886             );
2887     );
2888     Safefree(swap);
2889
2890     /* clean up; this will trigger destructors that will free all slabs
2891      * above the current one, and cleanup the regmatch_info_aux
2892      * and regmatch_info_aux_eval sructs */
2893
2894     LEAVE_SCOPE(oldsave);
2895
2896     if (RXp_PAREN_NAMES(prog))
2897         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2898
2899     RX_MATCH_UTF8_set(rx, utf8_target);
2900
2901     /* make sure $`, $&, $', and $digit will work later */
2902     if ( !(flags & REXEC_NOT_FIRST) )
2903         S_reg_set_capture_string(aTHX_ rx,
2904                                     strbeg, reginfo->strend,
2905                                     sv, flags, utf8_target);
2906
2907     return 1;
2908
2909 phooey:
2910     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2911                           PL_colors[4], PL_colors[5]));
2912
2913     /* clean up; this will trigger destructors that will free all slabs
2914      * above the current one, and cleanup the regmatch_info_aux
2915      * and regmatch_info_aux_eval sructs */
2916
2917     LEAVE_SCOPE(oldsave);
2918
2919     if (swap) {
2920         /* we failed :-( roll it back */
2921         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2922             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2923             PTR2UV(prog),
2924             PTR2UV(prog->offs),
2925             PTR2UV(swap)
2926         ));
2927         Safefree(prog->offs);
2928         prog->offs = swap;
2929     }
2930     return 0;
2931 }
2932
2933
2934 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2935  * Do inc before dec, in case old and new rex are the same */
2936 #define SET_reg_curpm(Re2) \
2937     if (reginfo->info_aux_eval) {                   \
2938         (void)ReREFCNT_inc(Re2);                    \
2939         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2940         PM_SETRE((PL_reg_curpm), (Re2));            \
2941     }
2942
2943
2944 /*
2945  - regtry - try match at specific point
2946  */
2947 STATIC I32                      /* 0 failure, 1 success */
2948 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2949 {
2950     dVAR;
2951     CHECKPOINT lastcp;
2952     REGEXP *const rx = reginfo->prog;
2953     regexp *const prog = ReANY(rx);
2954     SSize_t result;
2955     RXi_GET_DECL(prog,progi);
2956     GET_RE_DEBUG_FLAGS_DECL;
2957
2958     PERL_ARGS_ASSERT_REGTRY;
2959
2960     reginfo->cutpoint=NULL;
2961
2962     prog->offs[0].start = *startposp - reginfo->strbeg;
2963     prog->lastparen = 0;
2964     prog->lastcloseparen = 0;
2965
2966     /* XXXX What this code is doing here?!!!  There should be no need
2967        to do this again and again, prog->lastparen should take care of
2968        this!  --ilya*/
2969
2970     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2971      * Actually, the code in regcppop() (which Ilya may be meaning by
2972      * prog->lastparen), is not needed at all by the test suite
2973      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2974      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2975      * Meanwhile, this code *is* needed for the
2976      * above-mentioned test suite tests to succeed.  The common theme
2977      * on those tests seems to be returning null fields from matches.
2978      * --jhi updated by dapm */
2979 #if 1
2980     if (prog->nparens) {
2981         regexp_paren_pair *pp = prog->offs;
2982         I32 i;
2983         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2984             ++pp;
2985             pp->start = -1;
2986             pp->end = -1;
2987         }
2988     }
2989 #endif
2990     REGCP_SET(lastcp);
2991     result = regmatch(reginfo, *startposp, progi->program + 1);
2992     if (result != -1) {
2993         prog->offs[0].end = result;
2994         return 1;
2995     }
2996     if (reginfo->cutpoint)
2997         *startposp= reginfo->cutpoint;
2998     REGCP_UNWIND(lastcp);
2999     return 0;
3000 }
3001
3002
3003 #define sayYES goto yes
3004 #define sayNO goto no
3005 #define sayNO_SILENT goto no_silent
3006
3007 /* we dont use STMT_START/END here because it leads to
3008    "unreachable code" warnings, which are bogus, but distracting. */
3009 #define CACHEsayNO \
3010     if (ST.cache_mask) \
3011        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3012     sayNO
3013
3014 /* this is used to determine how far from the left messages like
3015    'failed...' are printed. It should be set such that messages
3016    are inline with the regop output that created them.
3017 */
3018 #define REPORT_CODE_OFF 32
3019
3020
3021 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3022 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3023 #define CHRTEST_NOT_A_CP_1 -999
3024 #define CHRTEST_NOT_A_CP_2 -998
3025
3026 /* grab a new slab and return the first slot in it */
3027
3028 STATIC regmatch_state *
3029 S_push_slab(pTHX)
3030 {
3031 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3032     dMY_CXT;
3033 #endif
3034     regmatch_slab *s = PL_regmatch_slab->next;
3035     if (!s) {
3036         Newx(s, 1, regmatch_slab);
3037         s->prev = PL_regmatch_slab;
3038         s->next = NULL;
3039         PL_regmatch_slab->next = s;
3040     }
3041     PL_regmatch_slab = s;
3042     return SLAB_FIRST(s);
3043 }
3044
3045
3046 /* push a new state then goto it */
3047
3048 #define PUSH_STATE_GOTO(state, node, input) \
3049     pushinput = input; \
3050     scan = node; \
3051     st->resume_state = state; \
3052     goto push_state;
3053
3054 /* push a new state with success backtracking, then goto it */
3055
3056 #define PUSH_YES_STATE_GOTO(state, node, input) \
3057     pushinput = input; \
3058     scan = node; \
3059     st->resume_state = state; \
3060     goto push_yes_state;
3061
3062
3063
3064
3065 /*
3066
3067 regmatch() - main matching routine
3068
3069 This is basically one big switch statement in a loop. We execute an op,
3070 set 'next' to point the next op, and continue. If we come to a point which
3071 we may need to backtrack to on failure such as (A|B|C), we push a
3072 backtrack state onto the backtrack stack. On failure, we pop the top
3073 state, and re-enter the loop at the state indicated. If there are no more
3074 states to pop, we return failure.
3075
3076 Sometimes we also need to backtrack on success; for example /A+/, where
3077 after successfully matching one A, we need to go back and try to
3078 match another one; similarly for lookahead assertions: if the assertion
3079 completes successfully, we backtrack to the state just before the assertion
3080 and then carry on.  In these cases, the pushed state is marked as
3081 'backtrack on success too'. This marking is in fact done by a chain of
3082 pointers, each pointing to the previous 'yes' state. On success, we pop to
3083 the nearest yes state, discarding any intermediate failure-only states.
3084 Sometimes a yes state is pushed just to force some cleanup code to be
3085 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3086 it to free the inner regex.
3087
3088 Note that failure backtracking rewinds the cursor position, while
3089 success backtracking leaves it alone.
3090
3091 A pattern is complete when the END op is executed, while a subpattern
3092 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3093 ops trigger the "pop to last yes state if any, otherwise return true"
3094 behaviour.
3095
3096 A common convention in this function is to use A and B to refer to the two
3097 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3098 the subpattern to be matched possibly multiple times, while B is the entire
3099 rest of the pattern. Variable and state names reflect this convention.
3100
3101 The states in the main switch are the union of ops and failure/success of
3102 substates associated with with that op.  For example, IFMATCH is the op
3103 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3104 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3105 successfully matched A and IFMATCH_A_fail is a state saying that we have
3106 just failed to match A. Resume states always come in pairs. The backtrack
3107 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3108 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3109 on success or failure.
3110
3111 The struct that holds a backtracking state is actually a big union, with
3112 one variant for each major type of op. The variable st points to the
3113 top-most backtrack struct. To make the code clearer, within each
3114 block of code we #define ST to alias the relevant union.
3115
3116 Here's a concrete example of a (vastly oversimplified) IFMATCH
3117 implementation:
3118
3119     switch (state) {
3120     ....
3121
3122 #define ST st->u.ifmatch
3123
3124     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3125         ST.foo = ...; // some state we wish to save
3126         ...
3127         // push a yes backtrack state with a resume value of
3128         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3129         // first node of A:
3130         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3131         // NOTREACHED
3132
3133     case IFMATCH_A: // we have successfully executed A; now continue with B
3134         next = B;
3135         bar = ST.foo; // do something with the preserved value
3136         break;
3137
3138     case IFMATCH_A_fail: // A failed, so the assertion failed
3139         ...;   // do some housekeeping, then ...
3140         sayNO; // propagate the failure
3141
3142 #undef ST
3143
3144     ...
3145     }
3146
3147 For any old-timers reading this who are familiar with the old recursive
3148 approach, the code above is equivalent to:
3149
3150     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3151     {
3152         int foo = ...
3153         ...
3154         if (regmatch(A)) {
3155             next = B;
3156             bar = foo;
3157             break;
3158         }
3159         ...;   // do some housekeeping, then ...
3160         sayNO; // propagate the failure
3161     }
3162
3163 The topmost backtrack state, pointed to by st, is usually free. If you
3164 want to claim it, populate any ST.foo fields in it with values you wish to
3165 save, then do one of
3166
3167         PUSH_STATE_GOTO(resume_state, node, newinput);
3168         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3169
3170 which sets that backtrack state's resume value to 'resume_state', pushes a
3171 new free entry to the top of the backtrack stack, then goes to 'node'.
3172 On backtracking, the free slot is popped, and the saved state becomes the
3173 new free state. An ST.foo field in this new top state can be temporarily
3174 accessed to retrieve values, but once the main loop is re-entered, it
3175 becomes available for reuse.
3176
3177 Note that the depth of the backtrack stack constantly increases during the
3178 left-to-right execution of the pattern, rather than going up and down with
3179 the pattern nesting. For example the stack is at its maximum at Z at the
3180 end of the pattern, rather than at X in the following:
3181
3182     /(((X)+)+)+....(Y)+....Z/
3183
3184 The only exceptions to this are lookahead/behind assertions and the cut,
3185 (?>A), which pop all the backtrack states associated with A before
3186 continuing.
3187
3188 Backtrack state structs are allocated in slabs of about 4K in size.
3189 PL_regmatch_state and st always point to the currently active state,
3190 and PL_regmatch_slab points to the slab currently containing
3191 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3192 allocated, and is never freed until interpreter destruction. When the slab
3193 is full, a new one is allocated and chained to the end. At exit from
3194 regmatch(), slabs allocated since entry are freed.
3195
3196 */
3197
3198
3199 #define DEBUG_STATE_pp(pp)                                  \
3200     DEBUG_STATE_r({                                         \
3201         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3202         PerlIO_printf(Perl_debug_log,                       \
3203             "    %*s"pp" %s%s%s%s%s\n",                     \
3204             depth*2, "",                                    \
3205             PL_reg_name[st->resume_state],                     \
3206             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3207             ((st==yes_state) ? "Y" : ""),                   \
3208             ((st==mark_state) ? "M" : ""),                  \
3209             ((st==yes_state||st==mark_state) ? "]" : "")    \
3210         );                                                  \
3211     });
3212
3213
3214 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3215
3216 #ifdef DEBUGGING
3217
3218 STATIC void
3219 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3220     const char *start, const char *end, const char *blurb)
3221 {
3222     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3223
3224     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3225
3226     if (!PL_colorset)
3227             reginitcolors();
3228     {
3229         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3230             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3231
3232         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3233             start, end - start, 60);
3234
3235         PerlIO_printf(Perl_debug_log,
3236             "%s%s REx%s %s against %s\n",
3237                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3238
3239         if (utf8_target||utf8_pat)
3240             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3241                 utf8_pat ? "pattern" : "",
3242                 utf8_pat && utf8_target ? " and " : "",
3243                 utf8_target ? "string" : ""
3244             );
3245     }
3246 }
3247
3248 STATIC void
3249 S_dump_exec_pos(pTHX_ const char *locinput,
3250                       const regnode *scan,
3251                       const char *loc_regeol,
3252                       const char *loc_bostr,
3253                       const char *loc_reg_starttry,
3254                       const bool utf8_target)
3255 {
3256     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3257     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3258     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3259     /* The part of the string before starttry has one color
3260        (pref0_len chars), between starttry and current
3261        position another one (pref_len - pref0_len chars),
3262        after the current position the third one.
3263        We assume that pref0_len <= pref_len, otherwise we
3264        decrease pref0_len.  */
3265     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3266         ? (5 + taill) - l : locinput - loc_bostr;
3267     int pref0_len;
3268
3269     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3270
3271     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3272         pref_len++;
3273     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3274     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3275         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3276               ? (5 + taill) - pref_len : loc_regeol - locinput);
3277     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3278         l--;
3279     if (pref0_len < 0)
3280         pref0_len = 0;
3281     if (pref0_len > pref_len)
3282         pref0_len = pref_len;
3283     {
3284         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3285
3286         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3287             (locinput - pref_len),pref0_len, 60, 4, 5);
3288
3289         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3290                     (locinput - pref_len + pref0_len),
3291                     pref_len - pref0_len, 60, 2, 3);
3292
3293         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3294                     locinput, loc_regeol - locinput, 10, 0, 1);
3295
3296         const STRLEN tlen=len0+len1+len2;
3297         PerlIO_printf(Perl_debug_log,
3298                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3299                     (IV)(locinput - loc_bostr),
3300                     len0, s0,
3301                     len1, s1,
3302                     (docolor ? "" : "> <"),
3303                     len2, s2,
3304                     (int)(tlen > 19 ? 0 :  19 - tlen),
3305                     "");
3306     }
3307 }
3308
3309 #endif
3310
3311 /* reg_check_named_buff_matched()
3312  * Checks to see if a named buffer has matched. The data array of
3313  * buffer numbers corresponding to the buffer is expected to reside
3314  * in the regexp->data->data array in the slot stored in the ARG() of
3315  * node involved. Note that this routine doesn't actually care about the
3316  * name, that information is not preserved from compilation to execution.
3317  * Returns the index of the leftmost defined buffer with the given name
3318  * or 0 if non of the buffers matched.
3319  */
3320 STATIC I32
3321 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3322 {
3323     I32 n;
3324     RXi_GET_DECL(rex,rexi);
3325     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3326     I32 *nums=(I32*)SvPVX(sv_dat);
3327
3328     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3329
3330     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3331         if ((I32)rex->lastparen >= nums[n] &&
3332             rex->offs[nums[n]].end != -1)
3333         {
3334             return nums[n];
3335         }
3336     }
3337     return 0;
3338 }
3339
3340
3341 static bool
3342 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3343         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3344 {
3345     /* This function determines if there are one or two characters that match
3346      * the first character of the passed-in EXACTish node <text_node>, and if
3347      * so, returns them in the passed-in pointers.
3348      *
3349      * If it determines that no possible character in the target string can
3350      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3351      * the first character in <text_node> requires UTF-8 to represent, and the
3352      * target string isn't in UTF-8.)
3353      *
3354      * If there are more than two characters that could match the beginning of
3355      * <text_node>, or if more context is required to determine a match or not,
3356      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3357      *
3358      * The motiviation behind this function is to allow the caller to set up
3359      * tight loops for matching.  If <text_node> is of type EXACT, there is
3360      * only one possible character that can match its first character, and so
3361      * the situation is quite simple.  But things get much more complicated if
3362      * folding is involved.  It may be that the first character of an EXACTFish
3363      * node doesn't participate in any possible fold, e.g., punctuation, so it
3364      * can be matched only by itself.  The vast majority of characters that are
3365      * in folds match just two things, their lower and upper-case equivalents.
3366      * But not all are like that; some have multiple possible matches, or match
3367      * sequences of more than one character.  This function sorts all that out.
3368      *
3369      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3370      * loop of trying to match A*, we know we can't exit where the thing
3371      * following it isn't a B.  And something can't be a B unless it is the
3372      * beginning of B.  By putting a quick test for that beginning in a tight
3373      * loop, we can rule out things that can't possibly be B without having to
3374      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3375      * character, we can make a tight loop matching A*, using the outputs of
3376      * this function.
3377      *
3378      * If the target string to match isn't in UTF-8, and there aren't
3379      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3380      * the one or two possible octets (which are characters in this situation)
3381      * that can match.  In all cases, if there is only one character that can
3382      * match, *<c1p> and *<c2p> will be identical.
3383      *
3384      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3385      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3386      * can match the beginning of <text_node>.  They should be declared with at
3387      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3388      * undefined what these contain.)  If one or both of the buffers are
3389      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3390      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3391      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3392      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3393      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3394
3395     const bool utf8_target = reginfo->is_utf8_target;
3396
3397     UV c1 = CHRTEST_NOT_A_CP_1;
3398     UV c2 = CHRTEST_NOT_A_CP_2;
3399     bool use_chrtest_void = FALSE;
3400     const bool is_utf8_pat = reginfo->is_utf8_pat;
3401
3402     /* Used when we have both utf8 input and utf8 output, to avoid converting
3403      * to/from code points */
3404     bool utf8_has_been_setup = FALSE;
3405
3406     dVAR;
3407
3408     U8 *pat = (U8*)STRING(text_node);
3409
3410     if (OP(text_node) == EXACT) {
3411
3412         /* In an exact node, only one thing can be matched, that first
3413          * character.  If both the pat and the target are UTF-8, we can just
3414          * copy the input to the output, avoiding finding the code point of
3415          * that character */
3416         if (!is_utf8_pat) {
3417             c2 = c1 = *pat;
3418         }
3419         else if (utf8_target) {
3420             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3421             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3422             utf8_has_been_setup = TRUE;
3423         }
3424         else {
3425             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3426         }
3427     }
3428     else /* an EXACTFish node */
3429          if ((is_utf8_pat
3430                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3431                                                     pat + STR_LEN(text_node)))
3432              || (!is_utf8_pat
3433                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3434                                                     pat + STR_LEN(text_node))))
3435     {
3436         /* Multi-character folds require more context to sort out.  Also
3437          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3438          * handled outside this routine */
3439         use_chrtest_void = TRUE;
3440     }
3441     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3442         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3443         if (c1 > 256) {
3444             /* Load the folds hash, if not already done */
3445             SV** listp;
3446             if (! PL_utf8_foldclosures) {
3447                 if (! PL_utf8_tofold) {
3448                     U8 dummy[UTF8_MAXBYTES_CASE+1];
3449
3450                     /* Force loading this by folding an above-Latin1 char */
3451                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3452                     assert(PL_utf8_tofold); /* Verify that worked */
3453                 }
3454                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3455             }
3456
3457             /* The fold closures data structure is a hash with the keys being
3458              * the UTF-8 of every character that is folded to, like 'k', and
3459              * the values each an array of all code points that fold to its
3460              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3461              * not included */
3462             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3463                                      (char *) pat,
3464                                      UTF8SKIP(pat),
3465                                      FALSE))))
3466             {
3467                 /* Not found in the hash, therefore there are no folds
3468                  * containing it, so there is only a single character that
3469                  * could match */
3470                 c2 = c1;
3471             }
3472             else {  /* Does participate in folds */
3473                 AV* list = (AV*) *listp;
3474                 if (av_len(list) != 1) {
3475
3476                     /* If there aren't exactly two folds to this, it is outside
3477                      * the scope of this function */
3478                     use_chrtest_void = TRUE;
3479                 }
3480                 else {  /* There are two.  Get them */
3481                     SV** c_p = av_fetch(list, 0, FALSE);
3482                     if (c_p == NULL) {
3483                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3484                     }
3485                     c1 = SvUV(*c_p);
3486
3487                     c_p = av_fetch(list, 1, FALSE);
3488                     if (c_p == NULL) {
3489                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3490                     }
3491                     c2 = SvUV(*c_p);
3492
3493                     /* Folds that cross the 255/256 boundary are forbidden if
3494                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3495                      * pattern character is above 256, and its only other match
3496                      * is below 256, the only legal match will be to itself.
3497                      * We have thrown away the original, so have to compute
3498                      * which is the one above 255 */
3499                     if ((c1 < 256) != (c2 < 256)) {
3500                         if (OP(text_node) == EXACTFL
3501                             || ((OP(text_node) == EXACTFA
3502                                  || OP(text_node) == EXACTFA_NO_TRIE)
3503                                 && (isASCII(c1) || isASCII(c2))))
3504                         {
3505                             if (c1 < 256) {
3506                                 c1 = c2;
3507                             }
3508                             else {
3509                                 c2 = c1;
3510                             }
3511                         }
3512                     }
3513                 }
3514             }
3515         }
3516         else /* Here, c1 is < 255 */
3517              if (utf8_target
3518                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3519                  && OP(text_node) != EXACTFL
3520                  && ((OP(text_node) != EXACTFA
3521                       && OP(text_node) != EXACTFA_NO_TRIE)
3522                      || ! isASCII(c1)))
3523         {
3524             /* Here, there could be something above Latin1 in the target which
3525              * folds to this character in the pattern.  All such cases except
3526              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3527              * involved in their folds, so are outside the scope of this
3528              * function */
3529             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3530                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3531             }
3532             else {
3533                 use_chrtest_void = TRUE;
3534             }
3535         }
3536         else { /* Here nothing above Latin1 can fold to the pattern character */
3537             switch (OP(text_node)) {
3538
3539                 case EXACTFL:   /* /l rules */
3540                     c2 = PL_fold_locale[c1];
3541                     break;
3542
3543                 case EXACTF:   /* This node only generated for non-utf8
3544                                   patterns */
3545                     assert(! is_utf8_pat);
3546                     if (! utf8_target) {    /* /d rules */
3547                         c2 = PL_fold[c1];
3548                         break;
3549                     }
3550                     /* FALLTHROUGH */
3551                     /* /u rules for all these.  This happens to work for
3552                      * EXACTFA as nothing in Latin1 folds to ASCII */
3553                 case EXACTFA_NO_TRIE:   /* This node only generated for
3554                                            non-utf8 patterns */
3555                     assert(! is_utf8_pat);
3556                     /* FALL THROUGH */
3557                 case EXACTFA:
3558                 case EXACTFU_SS:
3559                 case EXACTFU:
3560                     c2 = PL_fold_latin1[c1];
3561                     break;
3562
3563                 default:
3564                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3565                     assert(0); /* NOTREACHED */
3566             }
3567         }
3568     }
3569
3570     /* Here have figured things out.  Set up the returns */
3571     if (use_chrtest_void) {
3572         *c2p = *c1p = CHRTEST_VOID;
3573     }
3574     else if (utf8_target) {
3575         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3576             uvchr_to_utf8(c1_utf8, c1);
3577             uvchr_to_utf8(c2_utf8, c2);
3578         }
3579
3580         /* Invariants are stored in both the utf8 and byte outputs; Use
3581          * negative numbers otherwise for the byte ones.  Make sure that the
3582          * byte ones are the same iff the utf8 ones are the same */
3583         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3584         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3585                 ? *c2_utf8
3586                 : (c1 == c2)
3587                   ? CHRTEST_NOT_A_CP_1
3588                   : CHRTEST_NOT_A_CP_2;
3589     }
3590     else if (c1 > 255) {
3591        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3592                            can represent */
3593            return FALSE;
3594        }
3595
3596        *c1p = *c2p = c2;    /* c2 is the only representable value */
3597     }
3598     else {  /* c1 is representable; see about c2 */
3599        *c1p = c1;
3600        *c2p = (c2 < 256) ? c2 : c1;
3601     }
3602
3603     return TRUE;
3604 }
3605
3606 /* returns -1 on failure, $+[0] on success */
3607 STATIC SSize_t
3608 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3609 {
3610 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3611     dMY_CXT;
3612 #endif
3613     dVAR;
3614     const bool utf8_target = reginfo->is_utf8_target;
3615     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3616     REGEXP *rex_sv = reginfo->prog;
3617     regexp *rex = ReANY(rex_sv);
3618     RXi_GET_DECL(rex,rexi);
3619     /* the current state. This is a cached copy of PL_regmatch_state */
3620     regmatch_state *st;
3621     /* cache heavy used fields of st in registers */
3622     regnode *scan;
3623     regnode *next;
3624     U32 n = 0;  /* general value; init to avoid compiler warning */
3625     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3626     char *locinput = startpos;
3627     char *pushinput; /* where to continue after a PUSH */
3628     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3629
3630     bool result = 0;        /* return value of S_regmatch */
3631     int depth = 0;          /* depth of backtrack stack */
3632     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3633     const U32 max_nochange_depth =
3634         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3635         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3636     regmatch_state *yes_state = NULL; /* state to pop to on success of
3637                                                             subpattern */
3638     /* mark_state piggy backs on the yes_state logic so that when we unwind
3639        the stack on success we can update the mark_state as we go */
3640     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3641     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3642     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3643     U32 state_num;
3644     bool no_final = 0;      /* prevent failure from backtracking? */
3645     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3646     char *startpoint = locinput;
3647     SV *popmark = NULL;     /* are we looking for a mark? */
3648     SV *sv_commit = NULL;   /* last mark name seen in failure */
3649     SV *sv_yes_mark = NULL; /* last mark name we have seen
3650                                during a successful match */
3651     U32 lastopen = 0;       /* last open we saw */
3652     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3653     SV* const oreplsv = GvSV(PL_replgv);
3654     /* these three flags are set by various ops to signal information to
3655      * the very next op. They have a useful lifetime of exactly one loop
3656      * iteration, and are not preserved or restored by state pushes/pops
3657      */
3658     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3659     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3660     int logical = 0;        /* the following EVAL is:
3661                                 0: (?{...})
3662                                 1: (?(?{...})X|Y)
3663                                 2: (??{...})
3664                                or the following IFMATCH/UNLESSM is:
3665                                 false: plain (?=foo)
3666                                 true:  used as a condition: (?(?=foo))
3667                             */
3668     PAD* last_pad = NULL;
3669     dMULTICALL;
3670     I32 gimme = G_SCALAR;
3671     CV *caller_cv = NULL;       /* who called us */
3672     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3673     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3674     U32 maxopenparen = 0;       /* max '(' index seen so far */
3675     int to_complement;  /* Invert the result? */
3676     _char_class_number classnum;
3677     bool is_utf8_pat = reginfo->is_utf8_pat;
3678
3679 #ifdef DEBUGGING
3680     GET_RE_DEBUG_FLAGS_DECL;
3681 #endif
3682
3683     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3684     multicall_oldcatch = 0;
3685     multicall_cv = NULL;
3686     cx = NULL;
3687     PERL_UNUSED_VAR(multicall_cop);
3688     PERL_UNUSED_VAR(newsp);
3689
3690
3691     PERL_ARGS_ASSERT_REGMATCH;
3692
3693     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3694             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3695     }));
3696
3697     st = PL_regmatch_state;
3698
3699     /* Note that nextchr is a byte even in UTF */
3700     SET_nextchr;
3701     scan = prog;
3702     while (scan != NULL) {
3703
3704         DEBUG_EXECUTE_r( {
3705             SV * const prop = sv_newmortal();
3706             regnode *rnext=regnext(scan);
3707             DUMP_EXEC_POS( locinput, scan, utf8_target );
3708             regprop(rex, prop, scan);
3709
3710             PerlIO_printf(Perl_debug_log,
3711                     "%3"IVdf":%*s%s(%"IVdf")\n",
3712                     (IV)(scan - rexi->program), depth*2, "",
3713                     SvPVX_const(prop),
3714                     (PL_regkind[OP(scan)] == END || !rnext) ?
3715                         0 : (IV)(rnext - rexi->program));
3716         });
3717
3718         next = scan + NEXT_OFF(scan);
3719         if (next == scan)
3720             next = NULL;
3721         state_num = OP(scan);
3722
3723       reenter_switch:
3724         to_complement = 0;
3725
3726         SET_nextchr;
3727         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3728
3729         switch (state_num) {
3730         case BOL: /*  /^../  */
3731             if (locinput == reginfo->strbeg)
3732                 break;
3733             sayNO;
3734
3735         case MBOL: /*  /^../m  */
3736             if (locinput == reginfo->strbeg ||
3737                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3738             {
3739                 break;
3740             }
3741             sayNO;
3742
3743         case SBOL: /*  /^../s  */
3744             if (locinput == reginfo->strbeg)
3745                 break;
3746             sayNO;
3747
3748         case GPOS: /*  \G  */
3749             if (locinput == reginfo->ganch)
3750                 break;
3751             sayNO;
3752
3753         case KEEPS: /*   \K  */
3754             /* update the startpoint */
3755             st->u.keeper.val = rex->offs[0].start;
3756             rex->offs[0].start = locinput - reginfo->strbeg;
3757             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3758             assert(0); /*NOTREACHED*/
3759         case KEEPS_next_fail:
3760             /* rollback the start point change */
3761             rex->offs[0].start = st->u.keeper.val;
3762             sayNO_SILENT;
3763             assert(0); /*NOTREACHED*/
3764
3765         case EOL: /* /..$/  */
3766                 goto seol;
3767
3768         case MEOL: /* /..$/m  */
3769             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3770                 sayNO;
3771             break;
3772
3773         case SEOL: /* /..$/s  */
3774           seol:
3775             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3776                 sayNO;
3777             if (reginfo->strend - locinput > 1)
3778                 sayNO;
3779             break;
3780
3781         case EOS: /*  \z  */
3782             if (!NEXTCHR_IS_EOS)
3783                 sayNO;
3784             break;
3785
3786         case SANY: /*  /./s  */
3787             if (NEXTCHR_IS_EOS)
3788                 sayNO;
3789             goto increment_locinput;
3790
3791         case CANY: /*  \C  */
3792             if (NEXTCHR_IS_EOS)
3793                 sayNO;
3794             locinput++;
3795             break;
3796
3797         case REG_ANY: /*  /./  */
3798             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3799                 sayNO;
3800             goto increment_locinput;
3801
3802
3803 #undef  ST
3804 #define ST st->u.trie
3805         case TRIEC: /* (ab|cd) with known charclass */
3806             /* In this case the charclass data is available inline so
3807                we can fail fast without a lot of extra overhead.
3808              */
3809             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3810                 DEBUG_EXECUTE_r(
3811                     PerlIO_printf(Perl_debug_log,
3812                               "%*s  %sfailed to match trie start class...%s\n",
3813                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3814                 );
3815                 sayNO_SILENT;
3816                 assert(0); /* NOTREACHED */
3817             }
3818             /* FALL THROUGH */
3819         case TRIE:  /* (ab|cd)  */
3820             /* the basic plan of execution of the trie is:
3821              * At the beginning, run though all the states, and
3822              * find the longest-matching word. Also remember the position
3823              * of the shortest matching word. For example, this pattern:
3824              *    1  2 3 4    5
3825              *    ab|a|x|abcd|abc
3826              * when matched against the string "abcde", will generate
3827              * accept states for all words except 3, with the longest
3828              * matching word being 4, and the shortest being 2 (with
3829              * the position being after char 1 of the string).
3830              *
3831              * Then for each matching word, in word order (i.e. 1,2,4,5),
3832              * we run the remainder of the pattern; on each try setting
3833              * the current position to the character following the word,
3834              * returning to try the next word on failure.
3835              *
3836              * We avoid having to build a list of words at runtime by
3837              * using a compile-time structure, wordinfo[].prev, which
3838              * gives, for each word, the previous accepting word (if any).
3839              * In the case above it would contain the mappings 1->2, 2->0,
3840              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3841              * the longest word (4 above), a list of all words, by
3842              * following the list of prev pointers; this gives us the
3843              * unordered list 4,5,1,2. Then given the current word we have
3844              * just tried, we can go through the list and find the
3845              * next-biggest word to try (so if we just failed on word 2,
3846              * the next in the list is 4).
3847              *
3848              * Since at runtime we don't record the matching position in
3849              * the string for each word, we have to work that out for
3850              * each word we're about to process. The wordinfo table holds
3851              * the character length of each word; given that we recorded
3852              * at the start: the position of the shortest word and its
3853              * length in chars, we just need to move the pointer the
3854              * difference between the two char lengths. Depending on
3855              * Unicode status and folding, that's cheap or expensive.
3856              *
3857              * This algorithm is optimised for the case where are only a
3858              * small number of accept states, i.e. 0,1, or maybe 2.
3859              * With lots of accepts states, and having to try all of them,
3860              * it becomes quadratic on number of accept states to find all
3861              * the next words.
3862              */
3863
3864             {
3865                 /* what type of TRIE am I? (utf8 makes this contextual) */
3866                 DECL_TRIE_TYPE(scan);
3867
3868                 /* what trie are we using right now */
3869                 reg_trie_data * const trie
3870                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3871                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3872                 U32 state = trie->startstate;
3873
3874                 if (   trie->bitmap
3875                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3876                 {
3877                     if (trie->states[ state ].wordnum) {
3878                          DEBUG_EXECUTE_r(
3879                             PerlIO_printf(Perl_debug_log,
3880                                           "%*s  %smatched empty string...%s\n",
3881                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3882                         );
3883                         if (!trie->jump)
3884                             break;
3885                     } else {
3886                         DEBUG_EXECUTE_r(
3887                             PerlIO_printf(Perl_debug_log,
3888                                           "%*s  %sfailed to match trie start class...%s\n",
3889                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3890                         );
3891                         sayNO_SILENT;
3892                    }
3893                 }
3894
3895             {
3896                 U8 *uc = ( U8* )locinput;
3897
3898                 STRLEN len = 0;
3899                 STRLEN foldlen = 0;
3900                 U8 *uscan = (U8*)NULL;
3901                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3902                 U32 charcount = 0; /* how many input chars we have matched */
3903                 U32 accepted = 0; /* have we seen any accepting states? */
3904
3905                 ST.jump = trie->jump;
3906                 ST.me = scan;
3907                 ST.firstpos = NULL;
3908                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3909                 ST.nextword = 0;
3910
3911                 /* fully traverse the TRIE; note the position of the
3912                    shortest accept state and the wordnum of the longest
3913                    accept state */
3914
3915                 while ( state && uc <= (U8*)(reginfo->strend) ) {
3916                     U32 base = trie->states[ state ].trans.base;
3917                     UV uvc = 0;
3918                     U16 charid = 0;
3919                     U16 wordnum;
3920                     wordnum = trie->states[ state ].wordnum;
3921
3922                     if (wordnum) { /* it's an accept state */
3923                         if (!accepted) {
3924                             accepted = 1;
3925                             /* record first match position */
3926                             if (ST.longfold) {
3927                                 ST.firstpos = (U8*)locinput;
3928                                 ST.firstchars = 0;
3929                             }
3930                             else {
3931                                 ST.firstpos = uc;
3932                                 ST.firstchars = charcount;
3933                             }
3934                         }
3935                         if (!ST.nextword || wordnum < ST.nextword)
3936                             ST.nextword = wordnum;
3937                         ST.topword = wordnum;
3938                     }
3939
3940                     DEBUG_TRIE_EXECUTE_r({
3941                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3942                                 PerlIO_printf( Perl_debug_log,
3943                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3944                                     2+depth * 2, "", PL_colors[4],
3945                                     (UV)state, (accepted ? 'Y' : 'N'));
3946                     });
3947
3948                     /* read a char and goto next state */
3949                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3950                         I32 offset;
3951                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3952                                              uscan, len, uvc, charid, foldlen,
3953                                              foldbuf, uniflags);
3954                         charcount++;
3955                         if (foldlen>0)
3956                             ST.longfold = TRUE;
3957                         if (charid &&
3958                              ( ((offset =
3959                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3960
3961                              && ((U32)offset < trie->lasttrans)
3962                              && trie->trans[offset].check == state)
3963                         {
3964                             state = trie->trans[offset].next;
3965                         }
3966                         else {
3967                             state = 0;
3968                         }
3969                         uc += len;
3970
3971                     }
3972                     else {
3973                         state = 0;
3974                     }
3975                     DEBUG_TRIE_EXECUTE_r(
3976                         PerlIO_printf( Perl_debug_log,
3977                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3978                             charid, uvc, (UV)state, PL_colors[5] );
3979                     );
3980                 }
3981                 if (!accepted)
3982                    sayNO;
3983
3984                 /* calculate total number of accept states */
3985                 {
3986                     U16 w = ST.topword;
3987                     accepted = 0;
3988                     while (w) {
3989                         w = trie->wordinfo[w].prev;
3990                         accepted++;
3991                     }
3992                     ST.accepted = accepted;
3993                 }
3994
3995                 DEBUG_EXECUTE_r(
3996                     PerlIO_printf( Perl_debug_log,
3997                         "%*s  %sgot %"IVdf" possible matches%s\n",
3998                         REPORT_CODE_OFF + depth * 2, "",
3999                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4000                 );
4001                 goto trie_first_try; /* jump into the fail handler */
4002             }}
4003             assert(0); /* NOTREACHED */
4004
4005         case TRIE_next_fail: /* we failed - try next alternative */
4006         {
4007             U8 *uc;
4008             if ( ST.jump) {
4009                 REGCP_UNWIND(ST.cp);
4010                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4011             }
4012             if (!--ST.accepted) {
4013                 DEBUG_EXECUTE_r({
4014                     PerlIO_printf( Perl_debug_log,
4015                         "%*s  %sTRIE failed...%s\n",
4016                         REPORT_CODE_OFF+depth*2, "",
4017                         PL_colors[4],
4018                         PL_colors[5] );
4019                 });
4020                 sayNO_SILENT;
4021             }
4022             {
4023                 /* Find next-highest word to process.  Note that this code
4024                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4025                 U16 min = 0;
4026                 U16 word;
4027                 U16 const nextword = ST.nextword;
4028                 reg_trie_wordinfo * const wordinfo
4029                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4030                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4031                     if (word > nextword && (!min || word < min))
4032                         min = word;
4033                 }
4034                 ST.nextword = min;
4035             }
4036
4037           trie_first_try:
4038             if (do_cutgroup) {
4039                 do_cutgroup = 0;
4040                 no_final = 0;
4041             }
4042
4043             if ( ST.jump) {
4044                 ST.lastparen = rex->lastparen;
4045                 ST.lastcloseparen = rex->lastcloseparen;
4046                 REGCP_SET(ST.cp);
4047             }
4048
4049             /* find start char of end of current word */
4050             {
4051                 U32 chars; /* how many chars to skip */
4052                 reg_trie_data * const trie
4053                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4054
4055                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4056                             >=  ST.firstchars);
4057                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4058                             - ST.firstchars;
4059                 uc = ST.firstpos;
4060
4061                 if (ST.longfold) {
4062                     /* the hard option - fold each char in turn and find
4063                      * its folded length (which may be different */
4064                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4065                     STRLEN foldlen;
4066                     STRLEN len;
4067                     UV uvc;
4068                     U8 *uscan;
4069
4070                     while (chars) {
4071                         if (utf8_target) {
4072                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4073                                                     uniflags);
4074                             uc += len;
4075                         }
4076                         else {
4077                             uvc = *uc;
4078                             uc++;
4079                         }
4080                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4081                         uscan = foldbuf;
4082                         while (foldlen) {
4083                             if (!--chars)
4084                                 break;
4085                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4086                                             uniflags);
4087                             uscan += len;
4088                             foldlen -= len;
4089                         }
4090                     }
4091                 }
4092                 else {
4093                     if (utf8_target)
4094                         while (chars--)
4095                             uc += UTF8SKIP(uc);
4096                     else
4097                         uc += chars;
4098                 }
4099             }
4100
4101             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4102                             ? ST.jump[ST.nextword]
4103                             : NEXT_OFF(ST.me));
4104
4105             DEBUG_EXECUTE_r({
4106                 PerlIO_printf( Perl_debug_log,
4107                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4108                     REPORT_CODE_OFF+depth*2, "",
4109                     PL_colors[4],
4110                     ST.nextword,
4111                     PL_colors[5]
4112                     );
4113             });
4114
4115             if (ST.accepted > 1 || has_cutgroup) {
4116                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4117                 assert(0); /* NOTREACHED */
4118             }
4119             /* only one choice left - just continue */
4120             DEBUG_EXECUTE_r({
4121                 AV *const trie_words
4122                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4123                 SV ** const tmp = av_fetch( trie_words,
4124                     ST.nextword-1, 0 );
4125                 SV *sv= tmp ? sv_newmortal() : NULL;
4126
4127                 PerlIO_printf( Perl_debug_log,
4128                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4129                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4130                     ST.nextword,
4131                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4132                             PL_colors[0], PL_colors[1],
4133                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4134                         )
4135                     : "not compiled under -Dr",
4136                     PL_colors[5] );
4137             });
4138
4139             locinput = (char*)uc;
4140             continue; /* execute rest of RE */
4141             assert(0); /* NOTREACHED */
4142         }
4143 #undef  ST
4144
4145         case EXACT: {            /*  /abc/        */
4146             char *s = STRING(scan);
4147             ln = STR_LEN(scan);
4148             if (utf8_target != is_utf8_pat) {
4149                 /* The target and the pattern have differing utf8ness. */
4150                 char *l = locinput;
4151                 const char * const e = s + ln;
4152
4153                 if (utf8_target) {
4154                     /* The target is utf8, the pattern is not utf8.
4155                      * Above-Latin1 code points can't match the pattern;
4156                      * invariants match exactly, and the other Latin1 ones need
4157                      * to be downgraded to a single byte in order to do the
4158                      * comparison.  (If we could be confident that the target
4159                      * is not malformed, this could be refactored to have fewer
4160                      * tests by just assuming that if the first bytes match, it
4161                      * is an invariant, but there are tests in the test suite
4162                      * dealing with (??{...}) which violate this) */
4163                     while (s < e) {
4164                         if (l >= reginfo->strend
4165                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4166                         {
4167                             sayNO;
4168                         }
4169                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4170                             if (*l != *s) {
4171                                 sayNO;
4172                             }
4173                             l++;
4174                         }
4175                         else {
4176                             if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4177                             {
4178                                 sayNO;
4179                             }
4180                             l += 2;
4181                         }
4182                         s++;
4183                     }
4184                 }
4185                 else {
4186                     /* The target is not utf8, the pattern is utf8. */
4187                     while (s < e) {
4188                         if (l >= reginfo->strend
4189                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4190                         {
4191                             sayNO;
4192                         }
4193                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4194                             if (*s != *l) {
4195                                 sayNO;
4196                             }
4197                             s++;
4198                         }
4199                         else {
4200                             if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4201                             {
4202                                 sayNO;
4203                             }
4204                             s += 2;
4205                         }
4206                         l++;
4207                     }
4208                 }
4209                 locinput = l;
4210             }
4211             else {
4212                 /* The target and the pattern have the same utf8ness. */
4213                 /* Inline the first character, for speed. */
4214                 if (reginfo->strend - locinput < ln
4215                     || UCHARAT(s) != nextchr
4216                     || (ln > 1 && memNE(s, locinput, ln)))
4217                 {
4218                     sayNO;
4219                 }
4220                 locinput += ln;
4221             }
4222             break;
4223             }
4224
4225         case EXACTFL: {          /*  /abc/il      */
4226             re_fold_t folder;
4227             const U8 * fold_array;
4228             const char * s;
4229             U32 fold_utf8_flags;
4230
4231             RX_MATCH_TAINTED_on(reginfo->prog);
4232             folder = foldEQ_locale;
4233             fold_array = PL_fold_locale;
4234             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4235             goto do_exactf;
4236
4237         case EXACTFU_SS:         /*  /\x{df}/iu   */
4238         case EXACTFU:            /*  /abc/iu      */
4239             folder = foldEQ_latin1;
4240             fold_array = PL_fold_latin1;
4241             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4242             goto do_exactf;
4243
4244         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4245                                    patterns */
4246             assert(! is_utf8_pat);
4247             /* FALL THROUGH */
4248         case EXACTFA:            /*  /abc/iaa     */
4249             folder = foldEQ_latin1;
4250             fold_array = PL_fold_latin1;
4251             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4252             goto do_exactf;
4253
4254         case EXACTF:             /*  /abc/i    This node only generated for
4255                                                non-utf8 patterns */
4256             assert(! is_utf8_pat);
4257             folder = foldEQ;
4258             fold_array = PL_fold;
4259             fold_utf8_flags = 0;
4260
4261           do_exactf:
4262             s = STRING(scan);
4263             ln = STR_LEN(scan);
4264
4265             if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4266               /* Either target or the pattern are utf8, or has the issue where
4267                * the fold lengths may differ. */
4268                 const char * const l = locinput;
4269                 char *e = reginfo->strend;
4270
4271                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4272                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4273                 {
4274                     sayNO;
4275                 }
4276                 locinput = e;
4277                 break;
4278             }
4279
4280             /* Neither the target nor the pattern are utf8 */
4281             if (UCHARAT(s) != nextchr
4282                 && !NEXTCHR_IS_EOS
4283                 && UCHARAT(s) != fold_array[nextchr])
4284             {
4285                 sayNO;
4286             }
4287             if (reginfo->strend - locinput < ln)
4288                 sayNO;
4289             if (ln > 1 && ! folder(s, locinput, ln))
4290                 sayNO;
4291             locinput += ln;
4292             break;
4293         }
4294
4295         /* XXX Could improve efficiency by separating these all out using a
4296          * macro or in-line function.  At that point regcomp.c would no longer
4297          * have to set the FLAGS fields of these */
4298         case BOUNDL:  /*  /\b/l  */
4299         case NBOUNDL: /*  /\B/l  */
4300             RX_MATCH_TAINTED_on(reginfo->prog);
4301             /* FALL THROUGH */
4302         case BOUND:   /*  /\b/   */
4303         case BOUNDU:  /*  /\b/u  */
4304         case BOUNDA:  /*  /\b/a  */
4305         case NBOUND:  /*  /\B/   */
4306         case NBOUNDU: /*  /\B/u  */
4307         case NBOUNDA: /*  /\B/a  */
4308             /* was last char in word? */
4309             if (utf8_target
4310                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4311                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4312             {
4313                 if (locinput == reginfo->strbeg)
4314                     ln = '\n';
4315                 else {
4316                     const U8 * const r =
4317                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4318
4319                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4320                 }
4321                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4322                     ln = isWORDCHAR_uni(ln);
4323                     if (NEXTCHR_IS_EOS)
4324                         n = 0;
4325                     else {
4326                         LOAD_UTF8_CHARCLASS_ALNUM();
4327                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4328                                                                 utf8_target);
4329                     }
4330                 }
4331                 else {
4332                     ln = isWORDCHAR_LC_uvchr(ln);
4333                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4334                 }
4335             }
4336             else {
4337
4338                 /* Here the string isn't utf8, or is utf8 and only ascii
4339                  * characters are to match \w.  In the latter case looking at
4340                  * the byte just prior to the current one may be just the final
4341                  * byte of a multi-byte character.  This is ok.  There are two
4342                  * cases:
4343                  * 1) it is a single byte character, and then the test is doing
4344                  *      just what it's supposed to.
4345                  * 2) it is a multi-byte character, in which case the final
4346                  *      byte is never mistakable for ASCII, and so the test
4347                  *      will say it is not a word character, which is the
4348                  *      correct answer. */
4349                 ln = (locinput != reginfo->strbeg) ?
4350                     UCHARAT(locinput - 1) : '\n';
4351                 switch (FLAGS(scan)) {
4352                     case REGEX_UNICODE_CHARSET:
4353                         ln = isWORDCHAR_L1(ln);
4354                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4355                         break;
4356                     case REGEX_LOCALE_CHARSET:
4357                         ln = isWORDCHAR_LC(ln);
4358                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4359                         break;
4360                     case REGEX_DEPENDS_CHARSET:
4361                         ln = isWORDCHAR(ln);
4362                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4363                         break;
4364                     case REGEX_ASCII_RESTRICTED_CHARSET:
4365                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4366                         ln = isWORDCHAR_A(ln);
4367                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4368                         break;
4369                     default:
4370                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4371                         break;
4372                 }
4373             }
4374             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4375              * regcomp.sym */
4376             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4377                     sayNO;
4378             break;
4379
4380         case ANYOF:  /*  /[abc]/       */
4381         case ANYOF_WARN_SUPER:
4382             if (NEXTCHR_IS_EOS)
4383                 sayNO;
4384             if (utf8_target) {
4385                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4386                     sayNO;
4387                 locinput += UTF8SKIP(locinput);
4388             }
4389             else {
4390                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4391                     sayNO;
4392                 locinput++;
4393             }
4394             break;
4395
4396         /* The argument (FLAGS) to all the POSIX node types is the class number
4397          * */
4398
4399         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4400             to_complement = 1;
4401             /* FALLTHROUGH */
4402
4403         case POSIXL:    /* \w or [:punct:] etc. under /l */
4404             if (NEXTCHR_IS_EOS)
4405                 sayNO;
4406
4407             /* The locale hasn't influenced the outcome before this, so defer
4408              * tainting until now */
4409             RX_MATCH_TAINTED_on(reginfo->prog);
4410
4411             /* Use isFOO_lc() for characters within Latin1.  (Note that
4412              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4413              * wouldn't be invariant) */
4414             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4415                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4416                     sayNO;
4417                 }
4418             }
4419             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4420                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4421                                            (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4422                                                             *(locinput + 1))))))
4423                 {
4424                     sayNO;
4425                 }
4426             }
4427             else { /* Here, must be an above Latin-1 code point */
4428                 goto utf8_posix_not_eos;
4429             }
4430
4431             /* Here, must be utf8 */
4432             locinput += UTF8SKIP(locinput);
4433             break;
4434
4435         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4436             to_complement = 1;
4437             /* FALLTHROUGH */
4438
4439         case POSIXD:    /* \w or [:punct:] etc. under /d */
4440             if (utf8_target) {
4441                 goto utf8_posix;
4442             }
4443             goto posixa;
4444
4445         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4446
4447             if (NEXTCHR_IS_EOS) {
4448                 sayNO;
4449             }
4450
4451             /* All UTF-8 variants match */
4452             if (! UTF8_IS_INVARIANT(nextchr)) {
4453                 goto increment_locinput;
4454             }
4455
4456             to_complement = 1;
4457             /* FALLTHROUGH */
4458
4459         case POSIXA:    /* \w or [:punct:] etc. under /a */
4460
4461           posixa:
4462             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4463              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4464              * character is a single byte */
4465
4466             if (NEXTCHR_IS_EOS
4467                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4468                                                             FLAGS(scan)))))
4469             {
4470                 sayNO;
4471             }
4472
4473             /* Here we are either not in utf8, or we matched a utf8-invariant,
4474              * so the next char is the next byte */
4475             locinput++;
4476             break;
4477
4478         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4479             to_complement = 1;
4480             /* FALLTHROUGH */
4481
4482         case POSIXU:    /* \w or [:punct:] etc. under /u */
4483           utf8_posix:
4484             if (NEXTCHR_IS_EOS) {
4485                 sayNO;
4486             }
4487           utf8_posix_not_eos:
4488
4489             /* Use _generic_isCC() for characters within Latin1.  (Note that
4490              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4491              * wouldn't be invariant) */
4492             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4493                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4494                                                            FLAGS(scan)))))
4495                 {
4496                     sayNO;
4497                 }
4498                 locinput++;
4499             }
4500             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4501                 if (! (to_complement
4502                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4503                                                                *(locinput + 1)),
4504                                              FLAGS(scan)))))
4505                 {
4506                     sayNO;
4507                 }
4508                 locinput += 2;
4509             }
4510             else {  /* Handle above Latin-1 code points */
4511                 classnum = (_char_class_number) FLAGS(scan);
4512                 if (classnum < _FIRST_NON_SWASH_CC) {
4513
4514                     /* Here, uses a swash to find such code points.  Load if if
4515                      * not done already */
4516                     if (! PL_utf8_swash_ptrs[classnum]) {
4517                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4518                         PL_utf8_swash_ptrs[classnum]
4519                                 = _core_swash_init("utf8",
4520                                         swash_property_names[classnum],
4521                                         &PL_sv_undef, 1, 0, NULL, &flags);
4522                     }
4523                     if (! (to_complement
4524                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4525                                                (U8 *) locinput, TRUE))))
4526                     {
4527                         sayNO;
4528                     }
4529                 }
4530                 else {  /* Here, uses macros to find above Latin-1 code points */
4531                     switch (classnum) {
4532                         case _CC_ENUM_SPACE:    /* XXX would require separate
4533                                                    code if we revert the change
4534                                                    of \v matching this */
4535                         case _CC_ENUM_PSXSPC:
4536                             if (! (to_complement
4537                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4538                             {
4539                                 sayNO;
4540                             }
4541                             break;
4542                         case _CC_ENUM_BLANK:
4543                             if (! (to_complement
4544                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4545                             {
4546                                 sayNO;
4547                             }
4548                             break;
4549                         case _CC_ENUM_XDIGIT:
4550                             if (! (to_complement
4551                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4552                             {
4553                                 sayNO;
4554                             }
4555                             break;
4556                         case _CC_ENUM_VERTSPACE:
4557                             if (! (to_complement
4558                                             ^ cBOOL(is_VERTWS_high(locinput))))
4559                             {
4560                                 sayNO;
4561                             }
4562                             break;
4563                         default:    /* The rest, e.g. [:cntrl:], can't match
4564                                        above Latin1 */
4565                             if (! to_complement) {
4566                                 sayNO;
4567                             }
4568                             break;
4569                     }
4570                 }
4571                 locinput += UTF8SKIP(locinput);
4572             }
4573             break;
4574
4575         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4576                        a Unicode extended Grapheme Cluster */
4577             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4578               extended Grapheme Cluster is:
4579
4580             CR LF
4581             | Prepend* Begin Extend*
4582             | .
4583
4584             Begin is:           ( Special_Begin | ! Control )
4585             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4586             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4587             Control is:         [ GCB_Control | CR | LF ]
4588             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4589
4590                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4591                we can rewrite
4592
4593                    Begin is ( Regular_Begin + Special Begin )
4594
4595                It turns out that 98.4% of all Unicode code points match
4596                Regular_Begin.  Doing it this way eliminates a table match in
4597                the previous implementation for almost all Unicode code points.
4598
4599                There is a subtlety with Prepend* which showed up in testing.
4600                Note that the Begin, and only the Begin is required in:
4601                 | Prepend* Begin Extend*
4602                Also, Begin contains '! Control'.  A Prepend must be a
4603                '!  Control', which means it must also be a Begin.  What it
4604                comes down to is that if we match Prepend* and then find no
4605                suitable Begin afterwards, that if we backtrack the last
4606                Prepend, that one will be a suitable Begin.
4607             */
4608
4609             if (NEXTCHR_IS_EOS)
4610                 sayNO;
4611             if  (! utf8_target) {
4612
4613                 /* Match either CR LF  or '.', as all the other possibilities
4614                  * require utf8 */
4615                 locinput++;         /* Match the . or CR */
4616                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4617                                        match the LF */
4618                     && locinput < reginfo->strend
4619                     && UCHARAT(locinput) == '\n')
4620                 {
4621                     locinput++;
4622                 }
4623             }
4624             else {
4625
4626                 /* Utf8: See if is ( CR LF ); already know that locinput <
4627                  * reginfo->strend, so locinput+1 is in bounds */
4628                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4629                      && UCHARAT(locinput + 1) == '\n')
4630                 {
4631                     locinput += 2;
4632                 }
4633                 else {
4634                     STRLEN len;
4635
4636                     /* In case have to backtrack to beginning, then match '.' */
4637                     char *starting = locinput;
4638
4639                     /* In case have to backtrack the last prepend */
4640                     char *previous_prepend = NULL;
4641
4642                     LOAD_UTF8_CHARCLASS_GCB();
4643
4644                     /* Match (prepend)*   */
4645                     while (locinput < reginfo->strend
4646                            && (len = is_GCB_Prepend_utf8(locinput)))
4647                     {
4648                         previous_prepend = locinput;
4649                         locinput += len;
4650                     }
4651
4652                     /* As noted above, if we matched a prepend character, but
4653                      * the next thing won't match, back off the last prepend we
4654                      * matched, as it is guaranteed to match the begin */
4655                     if (previous_prepend
4656                         && (locinput >=  reginfo->strend
4657                             || (! swash_fetch(PL_utf8_X_regular_begin,
4658                                              (U8*)locinput, utf8_target)
4659                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4660                         )
4661                     {
4662                         locinput = previous_prepend;
4663                     }
4664
4665                     /* Note that here we know reginfo->strend > locinput, as we
4666                      * tested that upon input to this switch case, and if we
4667                      * moved locinput forward, we tested the result just above
4668                      * and it either passed, or we backed off so that it will
4669                      * now pass */
4670                     if (swash_fetch(PL_utf8_X_regular_begin,
4671                                     (U8*)locinput, utf8_target)) {
4672                         locinput += UTF8SKIP(locinput);
4673                     }
4674                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4675
4676                         /* Here did not match the required 'Begin' in the
4677                          * second term.  So just match the very first
4678                          * character, the '.' of the final term of the regex */
4679                         locinput = starting + UTF8SKIP(starting);
4680                         goto exit_utf8;
4681                     } else {
4682
4683                         /* Here is a special begin.  It can be composed of
4684                          * several individual characters.  One possibility is
4685                          * RI+ */
4686                         if ((len = is_GCB_RI_utf8(locinput))) {
4687                             locinput += len;
4688                             while (locinput < reginfo->strend
4689                                    && (len = is_GCB_RI_utf8(locinput)))
4690                             {
4691                                 locinput += len;
4692                             }
4693                         } else if ((len = is_GCB_T_utf8(locinput))) {
4694                             /* Another possibility is T+ */
4695                             locinput += len;
4696                             while (locinput < reginfo->strend
4697                                 && (len = is_GCB_T_utf8(locinput)))
4698                             {
4699                                 locinput += len;
4700                             }
4701                         } else {
4702
4703                             /* Here, neither RI+ nor T+; must be some other
4704                              * Hangul.  That means it is one of the others: L,
4705                              * LV, LVT or V, and matches:
4706                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4707
4708                             /* Match L*           */
4709                             while (locinput < reginfo->strend
4710                                    && (len = is_GCB_L_utf8(locinput)))
4711                             {
4712                                 locinput += len;
4713                             }
4714
4715                             /* Here, have exhausted L*.  If the next character
4716                              * is not an LV, LVT nor V, it means we had to have
4717                              * at least one L, so matches L+ in the original
4718                              * equation, we have a complete hangul syllable.
4719                              * Are done. */
4720
4721                             if (locinput < reginfo->strend
4722                                 && is_GCB_LV_LVT_V_utf8(locinput))
4723                             {
4724                                 /* Otherwise keep going.  Must be LV, LVT or V.
4725                                  * See if LVT, by first ruling out V, then LV */
4726                                 if (! is_GCB_V_utf8(locinput)
4727                                         /* All but every TCount one is LV */
4728                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4729                                                                          NULL)
4730                                                                         - SBASE)
4731                                         % TCount != 0)
4732                                 {
4733                                     locinput += UTF8SKIP(locinput);
4734                                 } else {
4735
4736                                     /* Must be  V or LV.  Take it, then match
4737                                      * V*     */
4738                                     locinput += UTF8SKIP(locinput);
4739                                     while (locinput < reginfo->strend
4740                                            && (len = is_GCB_V_utf8(locinput)))
4741                                     {
4742                                         locinput += len;
4743                                     }
4744                                 }
4745
4746                                 /* And any of LV, LVT, or V can be followed
4747                                  * by T*            */
4748                                 while (locinput < reginfo->strend
4749                                        && (len = is_GCB_T_utf8(locinput)))
4750                                 {
4751                                     locinput += len;
4752                                 }
4753                             }
4754                         }
4755                     }
4756
4757                     /* Match any extender */
4758                     while (locinput < reginfo->strend
4759                             && swash_fetch(PL_utf8_X_extend,
4760                                             (U8*)locinput, utf8_target))
4761                     {
4762                         locinput += UTF8SKIP(locinput);
4763                     }
4764                 }
4765             exit_utf8:
4766                 if (locinput > reginfo->strend) sayNO;
4767             }
4768             break;
4769
4770         case NREFFL:  /*  /\g{name}/il  */
4771         {   /* The capture buffer cases.  The ones beginning with N for the
4772                named buffers just convert to the equivalent numbered and
4773                pretend they were called as the corresponding numbered buffer
4774                op.  */
4775             /* don't initialize these in the declaration, it makes C++
4776                unhappy */
4777             const char *s;
4778             char type;
4779             re_fold_t folder;
4780             const U8 *fold_array;
4781             UV utf8_fold_flags;
4782
4783             RX_MATCH_TAINTED_on(reginfo->prog);
4784             folder = foldEQ_locale;
4785             fold_array = PL_fold_locale;
4786             type = REFFL;
4787             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4788             goto do_nref;
4789
4790         case NREFFA:  /*  /\g{name}/iaa  */
4791             folder = foldEQ_latin1;
4792             fold_array = PL_fold_latin1;
4793             type = REFFA;
4794             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4795             goto do_nref;
4796
4797         case NREFFU:  /*  /\g{name}/iu  */
4798             folder = foldEQ_latin1;
4799             fold_array = PL_fold_latin1;
4800             type = REFFU;
4801             utf8_fold_flags = 0;
4802             goto do_nref;
4803
4804         case NREFF:  /*  /\g{name}/i  */
4805             folder = foldEQ;
4806             fold_array = PL_fold;
4807             type = REFF;
4808             utf8_fold_flags = 0;
4809             goto do_nref;
4810
4811         case NREF:  /*  /\g{name}/   */
4812             type = REF;
4813             folder = NULL;
4814             fold_array = NULL;
4815             utf8_fold_flags = 0;
4816           do_nref:
4817
4818             /* For the named back references, find the corresponding buffer
4819              * number */
4820             n = reg_check_named_buff_matched(rex,scan);
4821
4822             if ( ! n ) {
4823                 sayNO;
4824             }
4825             goto do_nref_ref_common;
4826
4827         case REFFL:  /*  /\1/il  */
4828             RX_MATCH_TAINTED_on(reginfo->prog);
4829             folder = foldEQ_locale;
4830             fold_array = PL_fold_locale;
4831             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4832             goto do_ref;
4833
4834         case REFFA:  /*  /\1/iaa  */
4835             folder = foldEQ_latin1;
4836             fold_array = PL_fold_latin1;
4837             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4838             goto do_ref;
4839
4840         case REFFU:  /*  /\1/iu  */
4841             folder = foldEQ_latin1;
4842             fold_array = PL_fold_latin1;
4843             utf8_fold_flags = 0;
4844             goto do_ref;
4845
4846         case REFF:  /*  /\1/i  */
4847             folder = foldEQ;
4848             fold_array = PL_fold;
4849             utf8_fold_flags = 0;
4850             goto do_ref;
4851
4852         case REF:  /*  /\1/    */
4853             folder = NULL;
4854             fold_array = NULL;
4855             utf8_fold_flags = 0;
4856
4857           do_ref:
4858             type = OP(scan);
4859             n = ARG(scan);  /* which paren pair */
4860
4861           do_nref_ref_common:
4862             ln = rex->offs[n].start;
4863             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4864             if (rex->lastparen < n || ln == -1)
4865                 sayNO;                  /* Do not match unless seen CLOSEn. */
4866             if (ln == rex->offs[n].end)
4867                 break;
4868
4869             s = reginfo->strbeg + ln;
4870             if (type != REF     /* REF can do byte comparison */
4871                 && (utf8_target || type == REFFU))
4872             { /* XXX handle REFFL better */
4873                 char * limit = reginfo->strend;
4874
4875                 /* This call case insensitively compares the entire buffer
4876                     * at s, with the current input starting at locinput, but
4877                     * not going off the end given by reginfo->strend, and
4878                     * returns in <limit> upon success, how much of the
4879                     * current input was matched */
4880                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4881                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4882                 {
4883                     sayNO;
4884                 }
4885                 locinput = limit;
4886                 break;
4887             }
4888
4889             /* Not utf8:  Inline the first character, for speed. */
4890             if (!NEXTCHR_IS_EOS &&
4891                 UCHARAT(s) != nextchr &&
4892                 (type == REF ||
4893                  UCHARAT(s) != fold_array[nextchr]))
4894                 sayNO;
4895             ln = rex->offs[n].end - ln;
4896             if (locinput + ln > reginfo->strend)
4897                 sayNO;
4898             if (ln > 1 && (type == REF
4899                            ? memNE(s, locinput, ln)
4900                            : ! folder(s, locinput, ln)))
4901                 sayNO;
4902             locinput += ln;
4903             break;
4904         }
4905
4906         case NOTHING: /* null op; e.g. the 'nothing' following
4907                        * the '*' in m{(a+|b)*}' */
4908             break;
4909         case TAIL: /* placeholder while compiling (A|B|C) */
4910             break;
4911
4912         case BACK: /* ??? doesn't appear to be used ??? */
4913             break;
4914
4915 #undef  ST
4916 #define ST st->u.eval
4917         {
4918             SV *ret;
4919             REGEXP *re_sv;
4920             regexp *re;
4921             regexp_internal *rei;
4922             regnode *startpoint;
4923
4924         case GOSTART: /*  (?R)  */
4925         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4926             if (cur_eval && cur_eval->locinput==locinput) {
4927                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4928                     Perl_croak(aTHX_ "Infinite recursion in regex");
4929                 if ( ++nochange_depth > max_nochange_depth )
4930                     Perl_croak(aTHX_
4931                         "Pattern subroutine nesting without pos change"
4932                         " exceeded limit in regex");
4933             } else {
4934                 nochange_depth = 0;
4935             }
4936             re_sv = rex_sv;
4937             re = rex;
4938             rei = rexi;
4939             if (OP(scan)==GOSUB) {
4940                 startpoint = scan + ARG2L(scan);
4941                 ST.close_paren = ARG(scan);
4942             } else {
4943                 startpoint = rei->program+1;
4944                 ST.close_paren = 0;
4945             }
4946             goto eval_recurse_doit;
4947             assert(0); /* NOTREACHED */
4948
4949         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4950             if (cur_eval && cur_eval->locinput==locinput) {
4951                 if ( ++nochange_depth > max_nochange_depth )
4952                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4953             } else {
4954                 nochange_depth = 0;
4955             }
4956             {
4957                 /* execute the code in the {...} */
4958
4959                 dSP;
4960                 IV before;
4961                 OP * const oop = PL_op;
4962                 COP * const ocurcop = PL_curcop;
4963                 OP *nop;
4964                 CV *newcv;
4965
4966                 /* save *all* paren positions */
4967                 regcppush(rex, 0, maxopenparen);
4968                 REGCP_SET(runops_cp);
4969
4970                 if (!caller_cv)
4971                     caller_cv = find_runcv(NULL);
4972
4973                 n = ARG(scan);
4974
4975                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4976                     newcv = (ReANY(
4977                                                 (REGEXP*)(rexi->data->data[n])
4978                                             ))->qr_anoncv
4979                                         ;
4980                     nop = (OP*)rexi->data->data[n+1];
4981                 }
4982                 else if (rexi->data->what[n] == 'l') { /* literal code */
4983                     newcv = caller_cv;
4984                     nop = (OP*)rexi->data->data[n];
4985                     assert(CvDEPTH(newcv));
4986                 }
4987                 else {
4988                     /* literal with own CV */
4989                     assert(rexi->data->what[n] == 'L');
4990                     newcv = rex->qr_anoncv;
4991                     nop = (OP*)rexi->data->data[n];
4992                 }
4993
4994                 /* normally if we're about to execute code from the same
4995                  * CV that we used previously, we just use the existing
4996                  * CX stack entry. However, its possible that in the
4997                  * meantime we may have backtracked, popped from the save
4998                  * stack, and undone the SAVECOMPPAD(s) associated with
4999                  * PUSH_MULTICALL; in which case PL_comppad no longer
5000                  * points to newcv's pad. */
5001                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5002                 {
5003                     U8 flags = (CXp_SUB_RE |
5004                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5005                     if (last_pushed_cv) {
5006                         CHANGE_MULTICALL_FLAGS(newcv, flags);
5007                     }
5008                     else {
5009                         PUSH_MULTICALL_FLAGS(newcv, flags);
5010                     }
5011                     last_pushed_cv = newcv;
5012                 }
5013                 else {
5014                     /* these assignments are just to silence compiler
5015                      * warnings */
5016                     multicall_cop = NULL;
5017                     newsp = NULL;
5018                 }
5019                 last_pad = PL_comppad;
5020
5021                 /* the initial nextstate you would normally execute
5022                  * at the start of an eval (which would cause error
5023                  * messages to come from the eval), may be optimised
5024                  * away from the execution path in the regex code blocks;
5025                  * so manually set PL_curcop to it initially */
5026                 {
5027                     OP *o = cUNOPx(nop)->op_first;
5028                     assert(o->op_type == OP_NULL);
5029                     if (o->op_targ == OP_SCOPE) {
5030                         o = cUNOPo->op_first;
5031                     }
5032                     else {
5033                         assert(o->op_targ == OP_LEAVE);
5034                         o = cUNOPo->op_first;
5035                         assert(o->op_type == OP_ENTER);
5036                         o = o->op_sibling;
5037                     }
5038
5039                     if (o->op_type != OP_STUB) {
5040                         assert(    o->op_type == OP_NEXTSTATE
5041                                 || o->op_type == OP_DBSTATE
5042                                 || (o->op_type == OP_NULL
5043                                     &&  (  o->op_targ == OP_NEXTSTATE
5044                                         || o->op_targ == OP_DBSTATE
5045                                         )
5046                                     )
5047                         );
5048                         PL_curcop = (COP*)o;
5049                     }
5050                 }
5051                 nop = nop->op_next;
5052
5053                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5054                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5055
5056                 rex->offs[0].end = locinput - reginfo->strbeg;
5057                 if (reginfo->info_aux_eval->pos_magic)
5058                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5059                                   reginfo->sv, reginfo->strbeg,
5060                                   locinput - reginfo->strbeg);
5061
5062                 if (sv_yes_mark) {
5063                     SV *sv_mrk = get_sv("REGMARK", 1);
5064                     sv_setsv(sv_mrk, sv_yes_mark);
5065                 }
5066
5067                 /* we don't use MULTICALL here as we want to call the
5068                  * first op of the block of interest, rather than the
5069                  * first op of the sub */
5070                 before = (IV)(SP-PL_stack_base);
5071                 PL_op = nop;
5072                 CALLRUNOPS(aTHX);                       /* Scalar context. */
5073                 SPAGAIN;
5074                 if ((IV)(SP-PL_stack_base) == before)
5075                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5076                 else {
5077                     ret = POPs;
5078                     PUTBACK;
5079                 }
5080
5081                 /* before restoring everything, evaluate the returned
5082                  * value, so that 'uninit' warnings don't use the wrong
5083                  * PL_op or pad. Also need to process any magic vars
5084                  * (e.g. $1) *before* parentheses are restored */
5085
5086                 PL_op = NULL;
5087
5088                 re_sv = NULL;
5089                 if (logical == 0)        /*   (?{})/   */
5090                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5091                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5092                     sw = cBOOL(SvTRUE(ret));
5093                     logical = 0;
5094                 }
5095                 else {                   /*  /(??{})  */
5096                     /*  if its overloaded, let the regex compiler handle
5097                      *  it; otherwise extract regex, or stringify  */
5098                     if (!SvAMAGIC(ret)) {
5099                         SV *sv = ret;
5100                         if (SvROK(sv))
5101                             sv = SvRV(sv);
5102                         if (SvTYPE(sv) == SVt_REGEXP)
5103                             re_sv = (REGEXP*) sv;
5104                         else if (SvSMAGICAL(sv)) {
5105                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5106                             if (mg)
5107                                 re_sv = (REGEXP *) mg->mg_obj;
5108                         }
5109
5110                         /* force any magic, undef warnings here */
5111                         if (!re_sv) {
5112                             ret = sv_mortalcopy(ret);
5113                             (void) SvPV_force_nolen(ret);
5114                         }
5115                     }
5116
5117                 }
5118
5119                 /* *** Note that at this point we don't restore
5120                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5121                  * be used again soon. This is safe as long as nothing
5122                  * in the regexp code uses the pad ! */
5123                 PL_op = oop;
5124                 PL_curcop = ocurcop;
5125                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5126                 PL_curpm = PL_reg_curpm;
5127
5128                 if (logical != 2)
5129                     break;
5130             }
5131
5132                 /* only /(??{})/  from now on */
5133                 logical = 0;
5134                 {
5135                     /* extract RE object from returned value; compiling if
5136                      * necessary */
5137
5138                     if (re_sv) {
5139                         re_sv = reg_temp_copy(NULL, re_sv);
5140                     }
5141                     else {
5142                         U32 pm_flags = 0;
5143
5144                         if (SvUTF8(ret) && IN_BYTES) {
5145                             /* In use 'bytes': make a copy of the octet
5146                              * sequence, but without the flag on */
5147                             STRLEN len;
5148                             const char *const p = SvPV(ret, len);
5149                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5150                         }
5151                         if (rex->intflags & PREGf_USE_RE_EVAL)
5152                             pm_flags |= PMf_USE_RE_EVAL;
5153
5154                         /* if we got here, it should be an engine which
5155                          * supports compiling code blocks and stuff */
5156                         assert(rex->engine && rex->engine->op_comp);
5157                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5158                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5159                                     rex->engine, NULL, NULL,
5160                                     /* copy /msix etc to inner pattern */
5161                                     scan->flags,
5162                                     pm_flags);
5163
5164                         if (!(SvFLAGS(ret)
5165                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5166                                  | SVs_GMG))) {
5167                             /* This isn't a first class regexp. Instead, it's
5168                                caching a regexp onto an existing, Perl visible
5169                                scalar.  */
5170                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5171                         }
5172                         /* safe to do now that any $1 etc has been
5173                          * interpolated into the new pattern string and
5174                          * compiled */
5175                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5176                     }
5177                     SAVEFREESV(re_sv);
5178                     re = ReANY(re_sv);
5179                 }
5180                 RXp_MATCH_COPIED_off(re);
5181                 re->subbeg = rex->subbeg;
5182                 re->sublen = rex->sublen;
5183                 re->suboffset = rex->suboffset;
5184                 re->subcoffset = rex->subcoffset;
5185                 rei = RXi_GET(re);
5186                 DEBUG_EXECUTE_r(
5187                     debug_start_match(re_sv, utf8_target, locinput,
5188                                     reginfo->strend, "Matching embedded");
5189                 );
5190                 startpoint = rei->program + 1;
5191                 ST.close_paren = 0; /* only used for GOSUB */
5192
5193         eval_recurse_doit: /* Share code with GOSUB below this line */
5194                 /* run the pattern returned from (??{...}) */
5195
5196                 /* Save *all* the positions. */
5197                 ST.cp = regcppush(rex, 0, maxopenparen);
5198                 REGCP_SET(ST.lastcp);
5199
5200                 re->lastparen = 0;
5201                 re->lastcloseparen = 0;
5202
5203                 maxopenparen = 0;
5204
5205                 /* invalidate the S-L poscache. We're now executing a
5206                  * different set of WHILEM ops (and their associated
5207                  * indexes) against the same string, so the bits in the
5208                  * cache are meaningless. Setting maxiter to zero forces
5209                  * the cache to be invalidated and zeroed before reuse.
5210                  * XXX This is too dramatic a measure. Ideally we should
5211                  * save the old cache and restore when running the outer
5212                  * pattern again */
5213                 reginfo->poscache_maxiter = 0;
5214
5215                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5216
5217                 ST.prev_rex = rex_sv;
5218                 ST.prev_curlyx = cur_curlyx;
5219                 rex_sv = re_sv;
5220                 SET_reg_curpm(rex_sv);
5221                 rex = re;
5222                 rexi = rei;
5223                 cur_curlyx = NULL;
5224                 ST.B = next;
5225                 ST.prev_eval = cur_eval;
5226                 cur_eval = st;
5227                 /* now continue from first node in postoned RE */
5228                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5229                 assert(0); /* NOTREACHED */
5230         }
5231
5232         case EVAL_AB: /* cleanup after a successful (??{A})B */
5233             /* note: this is called twice; first after popping B, then A */
5234             rex_sv = ST.prev_rex;
5235             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5236             SET_reg_curpm(rex_sv);
5237             rex = ReANY(rex_sv);
5238             rexi = RXi_GET(rex);
5239             regcpblow(ST.cp);
5240             cur_eval = ST.prev_eval;
5241             cur_curlyx = ST.prev_curlyx;
5242
5243             /* Invalidate cache. See "invalidate" comment above. */
5244             reginfo->poscache_maxiter = 0;
5245             if ( nochange_depth )
5246                 nochange_depth--;
5247             sayYES;
5248
5249
5250         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5251             /* note: this is called twice; first after popping B, then A */
5252             rex_sv = ST.prev_rex;
5253             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5254             SET_reg_curpm(rex_sv);
5255             rex = ReANY(rex_sv);
5256             rexi = RXi_GET(rex);
5257
5258             REGCP_UNWIND(ST.lastcp);
5259             regcppop(rex, &maxopenparen);
5260             cur_eval = ST.prev_eval;
5261             cur_curlyx = ST.prev_curlyx;
5262             /* Invalidate cache. See "invalidate" comment above. */
5263             reginfo->poscache_maxiter = 0;
5264             if ( nochange_depth )
5265                 nochange_depth--;
5266             sayNO_SILENT;
5267 #undef ST
5268
5269         case OPEN: /*  (  */
5270             n = ARG(scan);  /* which paren pair */
5271             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5272             if (n > maxopenparen)
5273                 maxopenparen = n;
5274             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5275                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5276                 PTR2UV(rex),
5277                 PTR2UV(rex->offs),
5278                 (UV)n,
5279                 (IV)rex->offs[n].start_tmp,
5280                 (UV)maxopenparen
5281             ));
5282             lastopen = n;
5283             break;
5284
5285 /* XXX really need to log other places start/end are set too */
5286 #define CLOSE_CAPTURE \
5287     rex->offs[n].start = rex->offs[n].start_tmp; \
5288     rex->offs[n].end = locinput - reginfo->strbeg; \
5289     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5290         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5291         PTR2UV(rex), \
5292         PTR2UV(rex->offs), \
5293         (UV)n, \
5294         (IV)rex->offs[n].start, \
5295         (IV)rex->offs[n].end \
5296     ))
5297
5298         case CLOSE:  /*  )  */
5299             n = ARG(scan);  /* which paren pair */
5300             CLOSE_CAPTURE;
5301             if (n > rex->lastparen)
5302                 rex->lastparen = n;
5303             rex->lastcloseparen = n;
5304             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5305                 goto fake_end;
5306             }
5307             break;
5308
5309         case ACCEPT:  /*  (*ACCEPT)  */
5310             if (ARG(scan)){
5311                 regnode *cursor;
5312                 for (cursor=scan;
5313                      cursor && OP(cursor)!=END;
5314                      cursor=regnext(cursor))
5315                 {
5316                     if ( OP(cursor)==CLOSE ){
5317                         n = ARG(cursor);
5318                         if ( n <= lastopen ) {
5319                             CLOSE_CAPTURE;
5320                             if (n > rex->lastparen)
5321                                 rex->lastparen = n;
5322                             rex->lastcloseparen = n;
5323                             if ( n == ARG(scan) || (cur_eval &&
5324                                 cur_eval->u.eval.close_paren == n))
5325                                 break;
5326                         }
5327                     }
5328                 }
5329             }
5330             goto fake_end;
5331             /*NOTREACHED*/
5332
5333         case GROUPP:  /*  (?(1))  */
5334             n = ARG(scan);  /* which paren pair */
5335             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5336             break;
5337
5338         case NGROUPP:  /*  (?(<name>))  */
5339             /* reg_check_named_buff_matched returns 0 for no match */
5340             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5341             break;
5342
5343         case INSUBP:   /*  (?(R))  */
5344             n = ARG(scan);
5345             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5346             break;
5347
5348         case DEFINEP:  /*  (?(DEFINE))  */
5349             sw = 0;
5350             break;
5351
5352         case IFTHEN:   /*  (?(cond)A|B)  */
5353             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5354             if (sw)
5355                 next = NEXTOPER(NEXTOPER(scan));
5356             else {
5357                 next = scan + ARG(scan);
5358                 if (OP(next) == IFTHEN) /* Fake one. */
5359                     next = NEXTOPER(NEXTOPER(next));
5360             }
5361             break;
5362
5363         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5364             logical = scan->flags;
5365             break;
5366
5367 /*******************************************************************
5368
5369 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5370 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5371 STAR/PLUS/CURLY/CURLYN are used instead.)
5372
5373 A*B is compiled as <CURLYX><A><WHILEM><B>
5374
5375 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5376 state, which contains the current count, initialised to -1. It also sets
5377 cur_curlyx to point to this state, with any previous value saved in the
5378 state block.
5379
5380 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5381 since the pattern may possibly match zero times (i.e. it's a while {} loop
5382 rather than a do {} while loop).
5383
5384 Each entry to WHILEM represents a successful match of A. The count in the
5385 CURLYX block is incremented, another WHILEM state is pushed, and execution
5386 passes to A or B depending on greediness and the current count.
5387
5388 For example, if matching against the string a1a2a3b (where the aN are
5389 substrings that match /A/), then the match progresses as follows: (the
5390 pushed states are interspersed with the bits of strings matched so far):
5391
5392     <CURLYX cnt=-1>
5393     <CURLYX cnt=0><WHILEM>
5394     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5395     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5396     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5397     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5398
5399 (Contrast this with something like CURLYM, which maintains only a single
5400 backtrack state:
5401
5402     <CURLYM cnt=0> a1
5403     a1 <CURLYM cnt=1> a2
5404     a1 a2 <CURLYM cnt=2> a3
5405     a1 a2 a3 <CURLYM cnt=3> b
5406 )
5407
5408 Each WHILEM state block marks a point to backtrack to upon partial failure
5409 of A or B, and also contains some minor state data related to that
5410 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5411 overall state, such as the count, and pointers to the A and B ops.
5412
5413 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5414 must always point to the *current* CURLYX block, the rules are:
5415
5416 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5417 and set cur_curlyx to point the new block.
5418
5419 When popping the CURLYX block after a successful or unsuccessful match,
5420 restore the previous cur_curlyx.
5421
5422 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5423 to the outer one saved in the CURLYX block.
5424
5425 When popping the WHILEM block after a successful or unsuccessful B match,
5426 restore the previous cur_curlyx.
5427
5428 Here's an example for the pattern (AI* BI)*BO
5429 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5430
5431 cur_
5432 curlyx backtrack stack
5433 ------ ---------------
5434 NULL
5435 CO     <CO prev=NULL> <WO>
5436 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5437 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5438 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5439
5440 At this point the pattern succeeds, and we work back down the stack to
5441 clean up, restoring as we go:
5442
5443 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5444 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5445 CO     <CO prev=NULL> <WO>
5446 NULL
5447
5448 *******************************************************************/
5449
5450 #define ST st->u.curlyx
5451
5452         case CURLYX:    /* start of /A*B/  (for complex A) */
5453         {
5454             /* No need to save/restore up to this paren */
5455             I32 parenfloor = scan->flags;
5456
5457             assert(next); /* keep Coverity happy */
5458             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5459                 next += ARG(next);
5460
5461             /* XXXX Probably it is better to teach regpush to support
5462                parenfloor > maxopenparen ... */
5463             if (parenfloor > (I32)rex->lastparen)
5464                 parenfloor = rex->lastparen; /* Pessimization... */
5465
5466             ST.prev_curlyx= cur_curlyx;
5467             cur_curlyx = st;
5468             ST.cp = PL_savestack_ix;
5469
5470             /* these fields contain the state of the current curly.
5471              * they are accessed by subsequent WHILEMs */
5472             ST.parenfloor = parenfloor;
5473             ST.me = scan;
5474             ST.B = next;
5475             ST.minmod = minmod;
5476             minmod = 0;
5477             ST.count = -1;      /* this will be updated by WHILEM */
5478             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5479
5480             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5481             assert(0); /* NOTREACHED */
5482         }
5483
5484         case CURLYX_end: /* just finished matching all of A*B */
5485             cur_curlyx = ST.prev_curlyx;
5486             sayYES;
5487             assert(0); /* NOTREACHED */
5488
5489         case CURLYX_end_fail: /* just failed to match all of A*B */
5490             regcpblow(ST.cp);
5491             cur_curlyx = ST.prev_curlyx;
5492             sayNO;
5493             assert(0); /* NOTREACHED */
5494
5495
5496 #undef ST
5497 #define ST st->u.whilem
5498
5499         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5500         {
5501             /* see the discussion above about CURLYX/WHILEM */
5502             I32 n;
5503             int min = ARG1(cur_curlyx->u.curlyx.me);
5504             int max = ARG2(cur_curlyx->u.curlyx.me);
5505             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5506
5507             assert(cur_curlyx); /* keep Coverity happy */
5508             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5509             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5510             ST.cache_offset = 0;
5511             ST.cache_mask = 0;
5512
5513
5514             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5515                   "%*s  whilem: matched %ld out of %d..%d\n",
5516                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5517             );
5518
5519             /* First just match a string of min A's. */
5520
5521             if (n < min) {
5522                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5523                                     maxopenparen);
5524                 cur_curlyx->u.curlyx.lastloc = locinput;
5525                 REGCP_SET(ST.lastcp);
5526
5527                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5528                 assert(0); /* NOTREACHED */
5529             }
5530
5531             /* If degenerate A matches "", assume A done. */
5532
5533             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5534                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5535                    "%*s  whilem: empty match detected, trying continuation...\n",
5536                    REPORT_CODE_OFF+depth*2, "")
5537                 );
5538                 goto do_whilem_B_max;
5539             }
5540
5541             /* super-linear cache processing.
5542              *
5543              * The idea here is that for certain types of CURLYX/WHILEM -
5544              * principally those whose upper bound is infinity (and
5545              * excluding regexes that have things like \1 and other very
5546              * non-regular expresssiony things), then if a pattern like
5547              * /....A*.../ fails and we backtrack to the WHILEM, then we
5548              * make a note that this particular WHILEM op was at string
5549              * position 47 (say) when the rest of pattern failed. Then, if
5550              * we ever find ourselves back at that WHILEM, and at string
5551              * position 47 again, we can just fail immediately rather than
5552              * running the rest of the pattern again.
5553              *
5554              * This is very handy when patterns start to go
5555              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5556              * with a combinatorial explosion of backtracking.
5557              *
5558              * The cache is implemented as a bit array, with one bit per
5559              * string byte position per WHILEM op (up to 16) - so its
5560              * between 0.25 and 2x the string size.
5561              *
5562              * To avoid allocating a poscache buffer every time, we do an
5563              * initially countdown; only after we have  executed a WHILEM
5564              * op (string-length x #WHILEMs) times do we allocate the
5565              * cache.
5566              *
5567              * The top 4 bits of scan->flags byte say how many different
5568              * relevant CURLLYX/WHILEM op pairs there are, while the
5569              * bottom 4-bits is the identifying index number of this
5570              * WHILEM.
5571              */
5572
5573             if (scan->flags) {
5574
5575                 if (!reginfo->poscache_maxiter) {
5576                     /* start the countdown: Postpone detection until we
5577                      * know the match is not *that* much linear. */
5578                     reginfo->poscache_maxiter
5579                         =    (reginfo->strend - reginfo->strbeg + 1)
5580                            * (scan->flags>>4);
5581                     /* possible overflow for long strings and many CURLYX's */
5582                     if (reginfo->poscache_maxiter < 0)
5583                         reginfo->poscache_maxiter = I32_MAX;
5584                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5585                 }
5586
5587                 if (reginfo->poscache_iter-- == 0) {
5588                     /* initialise cache */
5589                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5590                     regmatch_info_aux *const aux = reginfo->info_aux;
5591                     if (aux->poscache) {
5592                         if ((SSize_t)reginfo->poscache_size < size) {
5593                             Renew(aux->poscache, size, char);
5594                             reginfo->poscache_size = size;
5595                         }
5596                         Zero(aux->poscache, size, char);
5597                     }
5598                     else {
5599                         reginfo->poscache_size = size;
5600                         Newxz(aux->poscache, size, char);
5601                     }
5602                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5603       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5604                               PL_colors[4], PL_colors[5])
5605                     );
5606                 }
5607
5608                 if (reginfo->poscache_iter < 0) {
5609                     /* have we already failed at this position? */
5610                     SSize_t offset, mask;
5611
5612                     reginfo->poscache_iter = -1; /* stop eventual underflow */
5613                     offset  = (scan->flags & 0xf) - 1
5614                                 +   (locinput - reginfo->strbeg)
5615                                   * (scan->flags>>4);
5616                     mask    = 1 << (offset % 8);
5617                     offset /= 8;
5618                     if (reginfo->info_aux->poscache[offset] & mask) {
5619                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5620                             "%*s  whilem: (cache) already tried at this position...\n",
5621                             REPORT_CODE_OFF+depth*2, "")
5622                         );
5623                         sayNO; /* cache records failure */
5624                     }
5625                     ST.cache_offset = offset;
5626                     ST.cache_mask   = mask;
5627                 }
5628             }
5629
5630             /* Prefer B over A for minimal matching. */
5631
5632             if (cur_curlyx->u.curlyx.minmod) {
5633                 ST.save_curlyx = cur_curlyx;
5634                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5635                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5636                             maxopenparen);
5637                 REGCP_SET(ST.lastcp);
5638                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5639                                     locinput);
5640                 assert(0); /* NOTREACHED */
5641             }
5642
5643             /* Prefer A over B for maximal matching. */
5644
5645             if (n < max) { /* More greed allowed? */
5646                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5647                             maxopenparen);
5648                 cur_curlyx->u.curlyx.lastloc = locinput;
5649                 REGCP_SET(ST.lastcp);
5650                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5651                 assert(0); /* NOTREACHED */
5652             }
5653             goto do_whilem_B_max;
5654         }
5655         assert(0); /* NOTREACHED */
5656
5657         case WHILEM_B_min: /* just matched B in a minimal match */
5658         case WHILEM_B_max: /* just matched B in a maximal match */
5659             cur_curlyx = ST.save_curlyx;
5660             sayYES;
5661             assert(0); /* NOTREACHED */
5662
5663         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5664             cur_curlyx = ST.save_curlyx;
5665             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5666             cur_curlyx->u.curlyx.count--;
5667             CACHEsayNO;
5668             assert(0); /* NOTREACHED */
5669
5670         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5671             /* FALL THROUGH */
5672         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5673             REGCP_UNWIND(ST.lastcp);
5674             regcppop(rex, &maxopenparen);
5675             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5676             cur_curlyx->u.curlyx.count--;
5677             CACHEsayNO;
5678             assert(0); /* NOTREACHED */
5679
5680         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5681             REGCP_UNWIND(ST.lastcp);
5682             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5683             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5684                 "%*s  whilem: failed, trying continuation...\n",
5685                 REPORT_CODE_OFF+depth*2, "")
5686             );
5687           do_whilem_B_max:
5688             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5689                 && ckWARN(WARN_REGEXP)
5690                 && !reginfo->warned)
5691             {
5692                 reginfo->warned = TRUE;
5693                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5694                      "Complex regular subexpression recursion limit (%d) "
5695                      "exceeded",
5696                      REG_INFTY - 1);
5697             }
5698
5699             /* now try B */
5700             ST.save_curlyx = cur_curlyx;
5701             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5702             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5703                                 locinput);
5704             assert(0); /* NOTREACHED */
5705
5706         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5707             cur_curlyx = ST.save_curlyx;
5708             REGCP_UNWIND(ST.lastcp);
5709             regcppop(rex, &maxopenparen);
5710
5711             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5712                 /* Maximum greed exceeded */
5713                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5714                     && ckWARN(WARN_REGEXP)
5715                     && !reginfo->warned)
5716                 {
5717                     reginfo->warned     = TRUE;
5718                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5719                         "Complex regular subexpression recursion "
5720                         "limit (%d) exceeded",
5721                         REG_INFTY - 1);
5722                 }
5723                 cur_curlyx->u.curlyx.count--;
5724                 CACHEsayNO;
5725             }
5726
5727             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5728                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5729             );
5730             /* Try grabbing another A and see if it helps. */
5731             cur_curlyx->u.curlyx.lastloc = locinput;
5732             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5733                             maxopenparen);
5734             REGCP_SET(ST.lastcp);
5735             PUSH_STATE_GOTO(WHILEM_A_min,
5736                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5737                 locinput);
5738             assert(0); /* NOTREACHED */
5739
5740 #undef  ST
5741 #define ST st->u.branch
5742
5743         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5744             next = scan + ARG(scan);
5745             if (next == scan)
5746                 next = NULL;
5747             scan = NEXTOPER(scan);
5748             /* FALL THROUGH */
5749
5750         case BRANCH:        /*  /(...|A|...)/ */
5751             scan = NEXTOPER(scan); /* scan now points to inner node */
5752             ST.lastparen = rex->lastparen;
5753             ST.lastcloseparen = rex->lastcloseparen;
5754             ST.next_branch = next;
5755             REGCP_SET(ST.cp);
5756
5757             /* Now go into the branch */
5758             if (has_cutgroup) {
5759                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5760             } else {
5761                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5762             }
5763             assert(0); /* NOTREACHED */
5764
5765         case CUTGROUP:  /*  /(*THEN)/  */
5766             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5767                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5768             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5769             assert(0); /* NOTREACHED */
5770
5771         case CUTGROUP_next_fail:
5772             do_cutgroup = 1;
5773             no_final = 1;
5774             if (st->u.mark.mark_name)
5775                 sv_commit = st->u.mark.mark_name;
5776             sayNO;
5777             assert(0); /* NOTREACHED */
5778
5779         case BRANCH_next:
5780             sayYES;
5781             assert(0); /* NOTREACHED */
5782
5783         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5784             if (do_cutgroup) {
5785                 do_cutgroup = 0;
5786                 no_final = 0;
5787             }
5788             REGCP_UNWIND(ST.cp);
5789             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5790             scan = ST.next_branch;
5791             /* no more branches? */
5792             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5793                 DEBUG_EXECUTE_r({
5794                     PerlIO_printf( Perl_debug_log,
5795                         "%*s  %sBRANCH failed...%s\n",
5796                         REPORT_CODE_OFF+depth*2, "",
5797                         PL_colors[4],
5798                         PL_colors[5] );
5799                 });
5800                 sayNO_SILENT;
5801             }
5802             continue; /* execute next BRANCH[J] op */
5803             assert(0); /* NOTREACHED */
5804
5805         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5806             minmod = 1;
5807             break;
5808
5809 #undef  ST
5810 #define ST st->u.curlym
5811
5812         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5813
5814             /* This is an optimisation of CURLYX that enables us to push
5815              * only a single backtracking state, no matter how many matches
5816              * there are in {m,n}. It relies on the pattern being constant
5817              * length, with no parens to influence future backrefs
5818              */
5819
5820             ST.me = scan;
5821             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5822
5823             ST.lastparen      = rex->lastparen;
5824             ST.lastcloseparen = rex->lastcloseparen;
5825
5826             /* if paren positive, emulate an OPEN/CLOSE around A */
5827             if (ST.me->flags) {
5828                 U32 paren = ST.me->flags;
5829                 if (paren > maxopenparen)
5830                     maxopenparen = paren;
5831                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5832             }
5833             ST.A = scan;
5834             ST.B = next;
5835             ST.alen = 0;
5836             ST.count = 0;
5837             ST.minmod = minmod;
5838             minmod = 0;
5839             ST.c1 = CHRTEST_UNINIT;
5840             REGCP_SET(ST.cp);
5841
5842             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5843                 goto curlym_do_B;
5844
5845           curlym_do_A: /* execute the A in /A{m,n}B/  */
5846             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5847             assert(0); /* NOTREACHED */
5848
5849         case CURLYM_A: /* we've just matched an A */
5850             ST.count++;
5851             /* after first match, determine A's length: u.curlym.alen */
5852             if (ST.count == 1) {
5853                 if (reginfo->is_utf8_target) {
5854                     char *s = st->locinput;
5855                     while (s < locinput) {
5856                         ST.alen++;
5857                         s += UTF8SKIP(s);
5858                     }
5859                 }
5860                 else {
5861                     ST.alen = locinput - st->locinput;
5862                 }
5863                 if (ST.alen == 0)
5864                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5865             }
5866             DEBUG_EXECUTE_r(
5867                 PerlIO_printf(Perl_debug_log,
5868                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5869                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5870                           (IV) ST.count, (IV)ST.alen)
5871             );
5872
5873             if (cur_eval && cur_eval->u.eval.close_paren &&
5874                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5875                 goto fake_end;
5876
5877             {
5878                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5879                 if ( max == REG_INFTY || ST.count < max )
5880                     goto curlym_do_A; /* try to match another A */
5881             }
5882             goto curlym_do_B; /* try to match B */
5883
5884         case CURLYM_A_fail: /* just failed to match an A */
5885             REGCP_UNWIND(ST.cp);
5886
5887             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5888                 || (cur_eval && cur_eval->u.eval.close_paren &&
5889                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5890                 sayNO;
5891
5892           curlym_do_B: /* execute the B in /A{m,n}B/  */
5893             if (ST.c1 == CHRTEST_UNINIT) {
5894                 /* calculate c1 and c2 for possible match of 1st char
5895                  * following curly */
5896                 ST.c1 = ST.c2 = CHRTEST_VOID;
5897                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5898                     regnode *text_node = ST.B;
5899                     if (! HAS_TEXT(text_node))
5900                         FIND_NEXT_IMPT(text_node);
5901                     /* this used to be
5902
5903                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5904
5905                         But the former is redundant in light of the latter.
5906
5907                         if this changes back then the macro for
5908                         IS_TEXT and friends need to change.
5909                      */
5910                     if (PL_regkind[OP(text_node)] == EXACT) {
5911                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5912                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5913                            reginfo))
5914                         {
5915                             sayNO;
5916                         }
5917                     }
5918                 }
5919             }
5920
5921             DEBUG_EXECUTE_r(
5922                 PerlIO_printf(Perl_debug_log,
5923                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5924                     (int)(REPORT_CODE_OFF+(depth*2)),
5925                     "", (IV)ST.count)
5926                 );
5927             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5928                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5929                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5930                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5931                     {
5932                         /* simulate B failing */
5933                         DEBUG_OPTIMISE_r(
5934                             PerlIO_printf(Perl_debug_log,
5935                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
5936                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5937                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5938                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5939                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5940                         );
5941                         state_num = CURLYM_B_fail;
5942                         goto reenter_switch;
5943                     }
5944                 }
5945                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5946                     /* simulate B failing */
5947                     DEBUG_OPTIMISE_r(
5948                         PerlIO_printf(Perl_debug_log,
5949                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5950                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5951                             (int) nextchr, ST.c1, ST.c2)
5952                     );
5953                     state_num = CURLYM_B_fail;
5954                     goto reenter_switch;
5955                 }
5956             }
5957
5958             if (ST.me->flags) {
5959                 /* emulate CLOSE: mark current A as captured */
5960                 I32 paren = ST.me->flags;
5961                 if (ST.count) {
5962                     rex->offs[paren].start
5963                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5964                     rex->offs[paren].end = locinput - reginfo->strbeg;
5965                     if ((U32)paren > rex->lastparen)
5966                         rex->lastparen = paren;
5967                     rex->lastcloseparen = paren;
5968                 }
5969                 else
5970                     rex->offs[paren].end = -1;
5971                 if (cur_eval && cur_eval->u.eval.close_paren &&
5972                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5973                 {
5974                     if (ST.count)
5975                         goto fake_end;
5976                     else
5977                         sayNO;
5978                 }
5979             }
5980
5981             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5982             assert(0); /* NOTREACHED */
5983
5984         case CURLYM_B_fail: /* just failed to match a B */
5985             REGCP_UNWIND(ST.cp);
5986             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5987             if (ST.minmod) {
5988                 I32 max = ARG2(ST.me);
5989                 if (max != REG_INFTY && ST.count == max)
5990                     sayNO;
5991                 goto curlym_do_A; /* try to match a further A */
5992             }
5993             /* backtrack one A */
5994             if (ST.count == ARG1(ST.me) /* min */)
5995                 sayNO;
5996             ST.count--;
5997             SET_locinput(HOPc(locinput, -ST.alen));
5998             goto curlym_do_B; /* try to match B */
5999
6000 #undef ST
6001 #define ST st->u.curly
6002
6003 #define CURLY_SETPAREN(paren, success) \
6004     if (paren) { \
6005         if (success) { \
6006             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6007             rex->offs[paren].end = locinput - reginfo->strbeg; \
6008             if (paren > rex->lastparen) \
6009                 rex->lastparen = paren; \
6010             rex->lastcloseparen = paren; \
6011         } \
6012         else { \
6013             rex->offs[paren].end = -1; \
6014             rex->lastparen      = ST.lastparen; \
6015             rex->lastcloseparen = ST.lastcloseparen; \
6016         } \
6017     }
6018
6019         case STAR:              /*  /A*B/ where A is width 1 char */
6020             ST.paren = 0;
6021             ST.min = 0;
6022             ST.max = REG_INFTY;
6023             scan = NEXTOPER(scan);
6024             goto repeat;
6025
6026         case PLUS:              /*  /A+B/ where A is width 1 char */
6027             ST.paren = 0;
6028             ST.min = 1;
6029             ST.max = REG_INFTY;
6030             scan = NEXTOPER(scan);
6031             goto repeat;
6032
6033         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
6034             ST.paren = scan->flags;     /* Which paren to set */
6035             ST.lastparen      = rex->lastparen;
6036             ST.lastcloseparen = rex->lastcloseparen;
6037             if (ST.paren > maxopenparen)
6038                 maxopenparen = ST.paren;
6039             ST.min = ARG1(scan);  /* min to match */
6040             ST.max = ARG2(scan);  /* max to match */
6041             if (cur_eval && cur_eval->u.eval.close_paren &&
6042                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6043                 ST.min=1;
6044                 ST.max=1;
6045             }
6046             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6047             goto repeat;
6048
6049         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
6050             ST.paren = 0;
6051             ST.min = ARG1(scan);  /* min to match */
6052             ST.max = ARG2(scan);  /* max to match */
6053             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6054           repeat:
6055             /*
6056             * Lookahead to avoid useless match attempts
6057             * when we know what character comes next.
6058             *
6059             * Used to only do .*x and .*?x, but now it allows
6060             * for )'s, ('s and (?{ ... })'s to be in the way
6061             * of the quantifier and the EXACT-like node.  -- japhy
6062             */
6063
6064             assert(ST.min <= ST.max);
6065             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6066                 ST.c1 = ST.c2 = CHRTEST_VOID;
6067             }
6068             else {
6069                 regnode *text_node = next;
6070
6071                 if (! HAS_TEXT(text_node))
6072                     FIND_NEXT_IMPT(text_node);
6073
6074                 if (! HAS_TEXT(text_node))
6075                     ST.c1 = ST.c2 = CHRTEST_VOID;
6076                 else {
6077                     if ( PL_regkind[OP(text_node)] != EXACT ) {
6078                         ST.c1 = ST.c2 = CHRTEST_VOID;
6079                     }
6080                     else {
6081
6082                     /*  Currently we only get here when
6083
6084                         PL_rekind[OP(text_node)] == EXACT
6085
6086                         if this changes back then the macro for IS_TEXT and
6087                         friends need to change. */
6088                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6089                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6090                            reginfo))
6091                         {
6092                             sayNO;
6093                         }
6094                     }
6095                 }
6096             }
6097
6098             ST.A = scan;
6099             ST.B = next;
6100             if (minmod) {
6101                 char *li = locinput;
6102                 minmod = 0;
6103                 if (ST.min &&
6104                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6105                             < ST.min)
6106                     sayNO;
6107                 SET_locinput(li);
6108                 ST.count = ST.min;
6109                 REGCP_SET(ST.cp);
6110                 if (ST.c1 == CHRTEST_VOID)
6111                     goto curly_try_B_min;
6112
6113                 ST.oldloc = locinput;
6114
6115                 /* set ST.maxpos to the furthest point along the
6116                  * string that could possibly match */
6117                 if  (ST.max == REG_INFTY) {
6118                     ST.maxpos = reginfo->strend - 1;
6119                     if (utf8_target)
6120                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6121                             ST.maxpos--;
6122                 }
6123                 else if (utf8_target) {
6124                     int m = ST.max - ST.min;
6125                     for (ST.maxpos = locinput;
6126                          m >0 && ST.maxpos < reginfo->strend; m--)
6127                         ST.maxpos += UTF8SKIP(ST.maxpos);
6128                 }
6129                 else {
6130                     ST.maxpos = locinput + ST.max - ST.min;
6131                     if (ST.maxpos >= reginfo->strend)
6132                         ST.maxpos = reginfo->strend - 1;
6133                 }
6134                 goto curly_try_B_min_known;
6135
6136             }
6137             else {
6138                 /* avoid taking address of locinput, so it can remain
6139                  * a register var */
6140                 char *li = locinput;
6141                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6142                 if (ST.count < ST.min)
6143                     sayNO;
6144                 SET_locinput(li);
6145                 if ((ST.count > ST.min)
6146                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6147                 {
6148                     /* A{m,n} must come at the end of the string, there's
6149                      * no point in backing off ... */
6150                     ST.min = ST.count;
6151                     /* ...except that $ and \Z can match before *and* after
6152                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6153                        We may back off by one in this case. */
6154                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6155                         ST.min--;
6156                 }
6157                 REGCP_SET(ST.cp);
6158                 goto curly_try_B_max;
6159             }
6160             assert(0); /* NOTREACHED */
6161
6162
6163         case CURLY_B_min_known_fail:
6164             /* failed to find B in a non-greedy match where c1,c2 valid */
6165
6166             REGCP_UNWIND(ST.cp);
6167             if (ST.paren) {
6168                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6169             }
6170             /* Couldn't or didn't -- move forward. */
6171             ST.oldloc = locinput;
6172             if (utf8_target)
6173                 locinput += UTF8SKIP(locinput);
6174             else
6175                 locinput++;
6176             ST.count++;
6177           curly_try_B_min_known:
6178              /* find the next place where 'B' could work, then call B */
6179             {
6180                 int n;
6181                 if (utf8_target) {
6182                     n = (ST.oldloc == locinput) ? 0 : 1;
6183                     if (ST.c1 == ST.c2) {
6184                         /* set n to utf8_distance(oldloc, locinput) */
6185                         while (locinput <= ST.maxpos
6186                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6187                         {
6188                             locinput += UTF8SKIP(locinput);
6189                             n++;
6190                         }
6191                     }
6192                     else {
6193                         /* set n to utf8_distance(oldloc, locinput) */
6194                         while (locinput <= ST.maxpos
6195                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6196                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6197                         {
6198                             locinput += UTF8SKIP(locinput);
6199                             n++;
6200                         }
6201                     }
6202                 }
6203                 else {  /* Not utf8_target */
6204                     if (ST.c1 == ST.c2) {
6205                         while (locinput <= ST.maxpos &&
6206                                UCHARAT(locinput) != ST.c1)
6207                             locinput++;
6208                     }
6209                     else {
6210                         while (locinput <= ST.maxpos
6211                                && UCHARAT(locinput) != ST.c1
6212                                && UCHARAT(locinput) != ST.c2)
6213                             locinput++;
6214                     }
6215                     n = locinput - ST.oldloc;
6216                 }
6217                 if (locinput > ST.maxpos)
6218                     sayNO;
6219                 if (n) {
6220                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6221                      * at b; check that everything between oldloc and
6222                      * locinput matches */
6223                     char *li = ST.oldloc;
6224                     ST.count += n;
6225                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6226                         sayNO;
6227                     assert(n == REG_INFTY || locinput == li);
6228                 }
6229                 CURLY_SETPAREN(ST.paren, ST.count);
6230                 if (cur_eval && cur_eval->u.eval.close_paren &&
6231                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6232                     goto fake_end;
6233                 }
6234                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6235             }
6236             assert(0); /* NOTREACHED */
6237
6238
6239         case CURLY_B_min_fail:
6240             /* failed to find B in a non-greedy match where c1,c2 invalid */
6241
6242             REGCP_UNWIND(ST.cp);
6243             if (ST.paren) {
6244                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6245             }
6246             /* failed -- move forward one */
6247             {
6248                 char *li = locinput;
6249                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6250                     sayNO;
6251                 }
6252                 locinput = li;
6253             }
6254             {
6255                 ST.count++;
6256                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6257                         ST.count > 0)) /* count overflow ? */
6258                 {
6259                   curly_try_B_min:
6260                     CURLY_SETPAREN(ST.paren, ST.count);
6261                     if (cur_eval && cur_eval->u.eval.close_paren &&
6262                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6263                         goto fake_end;
6264                     }
6265                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6266                 }
6267             }
6268             sayNO;
6269             assert(0); /* NOTREACHED */
6270
6271
6272         curly_try_B_max:
6273             /* a successful greedy match: now try to match B */
6274             if (cur_eval && cur_eval->u.eval.close_paren &&
6275                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6276                 goto fake_end;
6277             }
6278             {
6279                 bool could_match = locinput < reginfo->strend;
6280
6281                 /* If it could work, try it. */
6282                 if (ST.c1 != CHRTEST_VOID && could_match) {
6283                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6284                     {
6285                         could_match = memEQ(locinput,
6286                                             ST.c1_utf8,
6287                                             UTF8SKIP(locinput))
6288                                     || memEQ(locinput,
6289                                              ST.c2_utf8,
6290                                              UTF8SKIP(locinput));
6291                     }
6292                     else {
6293                         could_match = UCHARAT(locinput) == ST.c1
6294                                       || UCHARAT(locinput) == ST.c2;
6295                     }
6296                 }
6297                 if (ST.c1 == CHRTEST_VOID || could_match) {
6298                     CURLY_SETPAREN(ST.paren, ST.count);
6299                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6300                     assert(0); /* NOTREACHED */
6301                 }
6302             }
6303             /* FALL THROUGH */
6304
6305         case CURLY_B_max_fail:
6306             /* failed to find B in a greedy match */
6307
6308             REGCP_UNWIND(ST.cp);
6309             if (ST.paren) {
6310                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6311             }
6312             /*  back up. */
6313             if (--ST.count < ST.min)
6314                 sayNO;
6315             locinput = HOPc(locinput, -1);
6316             goto curly_try_B_max;
6317
6318 #undef ST
6319
6320         case END: /*  last op of main pattern  */
6321             fake_end:
6322             if (cur_eval) {
6323                 /* we've just finished A in /(??{A})B/; now continue with B */
6324
6325                 st->u.eval.prev_rex = rex_sv;           /* inner */
6326
6327                 /* Save *all* the positions. */
6328                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6329                 rex_sv = cur_eval->u.eval.prev_rex;
6330                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6331                 SET_reg_curpm(rex_sv);
6332                 rex = ReANY(rex_sv);
6333                 rexi = RXi_GET(rex);
6334                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6335
6336                 REGCP_SET(st->u.eval.lastcp);
6337
6338                 /* Restore parens of the outer rex without popping the
6339                  * savestack */
6340                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6341                                         &maxopenparen);
6342
6343                 st->u.eval.prev_eval = cur_eval;
6344                 cur_eval = cur_eval->u.eval.prev_eval;
6345                 DEBUG_EXECUTE_r(
6346                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6347                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6348                 if ( nochange_depth )
6349                     nochange_depth--;
6350
6351                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6352                                     locinput); /* match B */
6353             }
6354
6355             if (locinput < reginfo->till) {
6356                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6357                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6358                                       PL_colors[4],
6359                                       (long)(locinput - startpos),
6360                                       (long)(reginfo->till - startpos),
6361                                       PL_colors[5]));
6362
6363                 sayNO_SILENT;           /* Cannot match: too short. */
6364             }
6365             sayYES;                     /* Success! */
6366
6367         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6368             DEBUG_EXECUTE_r(
6369             PerlIO_printf(Perl_debug_log,
6370                 "%*s  %ssubpattern success...%s\n",
6371                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6372             sayYES;                     /* Success! */
6373
6374 #undef  ST
6375 #define ST st->u.ifmatch
6376
6377         {
6378             char *newstart;
6379
6380         case SUSPEND:   /* (?>A) */
6381             ST.wanted = 1;
6382             newstart = locinput;
6383             goto do_ifmatch;
6384
6385         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6386             ST.wanted = 0;
6387             goto ifmatch_trivial_fail_test;
6388
6389         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6390             ST.wanted = 1;
6391           ifmatch_trivial_fail_test:
6392             if (scan->flags) {
6393                 char * const s = HOPBACKc(locinput, scan->flags);
6394                 if (!s) {
6395                     /* trivial fail */
6396                     if (logical) {
6397                         logical = 0;
6398                         sw = 1 - cBOOL(ST.wanted);
6399                     }
6400                     else if (ST.wanted)
6401                         sayNO;
6402                     next = scan + ARG(scan);
6403                     if (next == scan)
6404                         next = NULL;
6405                     break;
6406                 }
6407                 newstart = s;
6408             }
6409             else
6410                 newstart = locinput;
6411
6412           do_ifmatch:
6413             ST.me = scan;
6414             ST.logical = logical;
6415             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6416
6417             /* execute body of (?...A) */
6418             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6419             assert(0); /* NOTREACHED */
6420         }
6421
6422         case IFMATCH_A_fail: /* body of (?...A) failed */
6423             ST.wanted = !ST.wanted;
6424             /* FALL THROUGH */
6425
6426         case IFMATCH_A: /* body of (?...A) succeeded */
6427             if (ST.logical) {
6428                 sw = cBOOL(ST.wanted);
6429             }
6430             else if (!ST.wanted)
6431                 sayNO;
6432
6433             if (OP(ST.me) != SUSPEND) {
6434                 /* restore old position except for (?>...) */
6435                 locinput = st->locinput;
6436             }
6437             scan = ST.me + ARG(ST.me);
6438             if (scan == ST.me)
6439                 scan = NULL;
6440             continue; /* execute B */
6441
6442 #undef ST
6443
6444         case LONGJMP: /*  alternative with many branches compiles to
6445                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6446             next = scan + ARG(scan);
6447             if (next == scan)
6448                 next = NULL;
6449             break;
6450
6451         case COMMIT:  /*  (*COMMIT)  */
6452             reginfo->cutpoint = reginfo->strend;
6453             /* FALLTHROUGH */
6454
6455         case PRUNE:   /*  (*PRUNE)   */
6456             if (!scan->flags)
6457                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6458             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6459             assert(0); /* NOTREACHED */
6460
6461         case COMMIT_next_fail:
6462             no_final = 1;
6463             /* FALLTHROUGH */
6464
6465         case OPFAIL:   /* (*FAIL)  */
6466             sayNO;
6467             assert(0); /* NOTREACHED */
6468
6469 #define ST st->u.mark
6470         case MARKPOINT: /*  (*MARK:foo)  */
6471             ST.prev_mark = mark_state;
6472             ST.mark_name = sv_commit = sv_yes_mark
6473                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6474             mark_state = st;
6475             ST.mark_loc = locinput;
6476             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6477             assert(0); /* NOTREACHED */
6478
6479         case MARKPOINT_next:
6480             mark_state = ST.prev_mark;
6481             sayYES;
6482             assert(0); /* NOTREACHED */
6483
6484         case MARKPOINT_next_fail:
6485             if (popmark && sv_eq(ST.mark_name,popmark))
6486             {
6487                 if (ST.mark_loc > startpoint)
6488                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6489                 popmark = NULL; /* we found our mark */
6490                 sv_commit = ST.mark_name;
6491
6492                 DEBUG_EXECUTE_r({
6493                         PerlIO_printf(Perl_debug_log,
6494                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6495                             REPORT_CODE_OFF+depth*2, "",
6496                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6497                 });
6498             }
6499             mark_state = ST.prev_mark;
6500             sv_yes_mark = mark_state ?
6501                 mark_state->u.mark.mark_name : NULL;
6502             sayNO;
6503             assert(0); /* NOTREACHED */
6504
6505         case SKIP:  /*  (*SKIP)  */
6506             if (scan->flags) {
6507                 /* (*SKIP) : if we fail we cut here*/
6508                 ST.mark_name = NULL;
6509                 ST.mark_loc = locinput;
6510                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6511             } else {
6512                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6513                    otherwise do nothing.  Meaning we need to scan
6514                  */
6515                 regmatch_state *cur = mark_state;
6516                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6517
6518                 while (cur) {
6519                     if ( sv_eq( cur->u.mark.mark_name,
6520                                 find ) )
6521                     {
6522                         ST.mark_name = find;
6523                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6524                     }
6525                     cur = cur->u.mark.prev_mark;
6526                 }
6527             }
6528             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6529             break;
6530
6531         case SKIP_next_fail:
6532             if (ST.mark_name) {
6533                 /* (*CUT:NAME) - Set up to search for the name as we
6534                    collapse the stack*/
6535                 popmark = ST.mark_name;
6536             } else {
6537                 /* (*CUT) - No name, we cut here.*/
6538                 if (ST.mark_loc > startpoint)
6539                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6540                 /* but we set sv_commit to latest mark_name if there
6541                    is one so they can test to see how things lead to this
6542                    cut */
6543                 if (mark_state)
6544                     sv_commit=mark_state->u.mark.mark_name;
6545             }
6546             no_final = 1;
6547             sayNO;
6548             assert(0); /* NOTREACHED */
6549 #undef ST
6550
6551         case LNBREAK: /* \R */
6552             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6553                 locinput += n;
6554             } else
6555                 sayNO;
6556             break;
6557
6558         default:
6559             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6560                           PTR2UV(scan), OP(scan));
6561             Perl_croak(aTHX_ "regexp memory corruption");
6562
6563         /* this is a point to jump to in order to increment
6564          * locinput by one character */
6565         increment_locinput:
6566             assert(!NEXTCHR_IS_EOS);
6567             if (utf8_target) {
6568                 locinput += PL_utf8skip[nextchr];
6569                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6570                 if (locinput > reginfo->strend)
6571                     sayNO;
6572             }
6573             else
6574                 locinput++;
6575             break;
6576
6577         } /* end switch */
6578
6579         /* switch break jumps here */
6580         scan = next; /* prepare to execute the next op and ... */
6581         continue;    /* ... jump back to the top, reusing st */
6582         assert(0); /* NOTREACHED */
6583
6584       push_yes_state:
6585         /* push a state that backtracks on success */
6586         st->u.yes.prev_yes_state = yes_state;
6587         yes_state = st;
6588         /* FALL THROUGH */
6589       push_state:
6590         /* push a new regex state, then continue at scan  */
6591         {
6592             regmatch_state *newst;
6593
6594             DEBUG_STACK_r({
6595                 regmatch_state *cur = st;
6596                 regmatch_state *curyes = yes_state;
6597                 int curd = depth;
6598                 regmatch_slab *slab = PL_regmatch_slab;
6599                 for (;curd > -1;cur--,curd--) {
6600                     if (cur < SLAB_FIRST(slab)) {
6601                         slab = slab->prev;
6602                         cur = SLAB_LAST(slab);
6603                     }
6604                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6605                         REPORT_CODE_OFF + 2 + depth * 2,"",
6606                         curd, PL_reg_name[cur->resume_state],
6607                         (curyes == cur) ? "yes" : ""
6608                     );
6609                     if (curyes == cur)
6610                         curyes = cur->u.yes.prev_yes_state;
6611                 }
6612             } else
6613                 DEBUG_STATE_pp("push")
6614             );
6615             depth++;
6616             st->locinput = locinput;
6617             newst = st+1;
6618             if (newst >  SLAB_LAST(PL_regmatch_slab))
6619                 newst = S_push_slab(aTHX);
6620             PL_regmatch_state = newst;
6621
6622             locinput = pushinput;
6623             st = newst;
6624             continue;
6625             assert(0); /* NOTREACHED */
6626         }
6627     }
6628
6629     /*
6630     * We get here only if there's trouble -- normally "case END" is
6631     * the terminating point.
6632     */
6633     Perl_croak(aTHX_ "corrupted regexp pointers");
6634     /*NOTREACHED*/
6635     sayNO;
6636
6637 yes:
6638     if (yes_state) {
6639         /* we have successfully completed a subexpression, but we must now
6640          * pop to the state marked by yes_state and continue from there */
6641         assert(st != yes_state);
6642 #ifdef DEBUGGING
6643         while (st != yes_state) {
6644             st--;
6645             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6646                 PL_regmatch_slab = PL_regmatch_slab->prev;
6647                 st = SLAB_LAST(PL_regmatch_slab);
6648             }
6649             DEBUG_STATE_r({
6650                 if (no_final) {
6651                     DEBUG_STATE_pp("pop (no final)");
6652                 } else {
6653                     DEBUG_STATE_pp("pop (yes)");
6654                 }
6655             });
6656             depth--;
6657         }
6658 #else
6659         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6660             || yes_state > SLAB_LAST(PL_regmatch_slab))
6661         {
6662             /* not in this slab, pop slab */
6663             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6664             PL_regmatch_slab = PL_regmatch_slab->prev;
6665             st = SLAB_LAST(PL_regmatch_slab);
6666         }
6667         depth -= (st - yes_state);
6668 #endif
6669         st = yes_state;
6670         yes_state = st->u.yes.prev_yes_state;
6671         PL_regmatch_state = st;
6672
6673         if (no_final)
6674             locinput= st->locinput;
6675         state_num = st->resume_state + no_final;
6676         goto reenter_switch;
6677     }
6678
6679     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6680                           PL_colors[4], PL_colors[5]));
6681
6682     if (reginfo->info_aux_eval) {
6683         /* each successfully executed (?{...}) block does the equivalent of
6684          *   local $^R = do {...}
6685          * When popping the save stack, all these locals would be undone;
6686          * bypass this by setting the outermost saved $^R to the latest
6687          * value */
6688         if (oreplsv != GvSV(PL_replgv))
6689             sv_setsv(oreplsv, GvSV(PL_replgv));
6690     }
6691     result = 1;
6692     goto final_exit;
6693
6694 no:
6695     DEBUG_EXECUTE_r(
6696         PerlIO_printf(Perl_debug_log,
6697             "%*s  %sfailed...%s\n",
6698             REPORT_CODE_OFF+depth*2, "",
6699             PL_colors[4], PL_colors[5])
6700         );
6701
6702 no_silent:
6703     if (no_final) {
6704         if (yes_state) {
6705             goto yes;
6706         } else {
6707             goto final_exit;
6708         }
6709     }
6710     if (depth) {
6711         /* there's a previous state to backtrack to */
6712         st--;
6713         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6714             PL_regmatch_slab = PL_regmatch_slab->prev;
6715             st = SLAB_LAST(PL_regmatch_slab);
6716         }
6717         PL_regmatch_state = st;
6718         locinput= st->locinput;
6719
6720         DEBUG_STATE_pp("pop");
6721         depth--;
6722         if (yes_state == st)
6723             yes_state = st->u.yes.prev_yes_state;
6724
6725         state_num = st->resume_state + 1; /* failure = success + 1 */
6726         goto reenter_switch;
6727     }
6728     result = 0;
6729
6730   final_exit:
6731     if (rex->intflags & PREGf_VERBARG_SEEN) {
6732         SV *sv_err = get_sv("REGERROR", 1);
6733         SV *sv_mrk = get_sv("REGMARK", 1);
6734         if (result) {
6735             sv_commit = &PL_sv_no;
6736             if (!sv_yes_mark)
6737                 sv_yes_mark = &PL_sv_yes;
6738         } else {
6739             if (!sv_commit)
6740                 sv_commit = &PL_sv_yes;
6741             sv_yes_mark = &PL_sv_no;
6742         }
6743         sv_setsv(sv_err, sv_commit);
6744         sv_setsv(sv_mrk, sv_yes_mark);
6745     }
6746
6747
6748     if (last_pushed_cv) {
6749         dSP;
6750         POP_MULTICALL;
6751         PERL_UNUSED_VAR(SP);
6752     }
6753
6754     assert(!result ||  locinput - reginfo->strbeg >= 0);
6755     return result ?  locinput - reginfo->strbeg : -1;
6756 }
6757
6758 /*
6759  - regrepeat - repeatedly match something simple, report how many
6760  *
6761  * What 'simple' means is a node which can be the operand of a quantifier like
6762  * '+', or {1,3}
6763  *
6764  * startposp - pointer a pointer to the start position.  This is updated
6765  *             to point to the byte following the highest successful
6766  *             match.
6767  * p         - the regnode to be repeatedly matched against.
6768  * reginfo   - struct holding match state, such as strend
6769  * max       - maximum number of things to match.
6770  * depth     - (for debugging) backtracking depth.
6771  */
6772 STATIC I32
6773 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6774             regmatch_info *const reginfo, I32 max, int depth)
6775 {
6776     dVAR;
6777     char *scan;     /* Pointer to current position in target string */
6778     I32 c;
6779     char *loceol = reginfo->strend;   /* local version */
6780     I32 hardcount = 0;  /* How many matches so far */
6781     bool utf8_target = reginfo->is_utf8_target;
6782     int to_complement = 0;  /* Invert the result? */
6783     UV utf8_flags;
6784     _char_class_number classnum;
6785 #ifndef DEBUGGING
6786     PERL_UNUSED_ARG(depth);
6787 #endif
6788
6789     PERL_ARGS_ASSERT_REGREPEAT;
6790
6791     scan = *startposp;
6792     if (max == REG_INFTY)
6793         max = I32_MAX;
6794     else if (! utf8_target && loceol - scan > max)
6795         loceol = scan + max;
6796
6797     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6798      * to the maximum of how far we should go in it (leaving it set to the real
6799      * end, if the maximum permissible would take us beyond that).  This allows
6800      * us to make the loop exit condition that we haven't gone past <loceol> to
6801      * also mean that we haven't exceeded the max permissible count, saving a
6802      * test each time through the loop.  But it assumes that the OP matches a
6803      * single byte, which is true for most of the OPs below when applied to a
6804      * non-UTF-8 target.  Those relatively few OPs that don't have this
6805      * characteristic will have to compensate.
6806      *
6807      * There is no adjustment for UTF-8 targets, as the number of bytes per
6808      * character varies.  OPs will have to test both that the count is less
6809      * than the max permissible (using <hardcount> to keep track), and that we
6810      * are still within the bounds of the string (using <loceol>.  A few OPs
6811      * match a single byte no matter what the encoding.  They can omit the max
6812      * test if, for the UTF-8 case, they do the adjustment that was skipped
6813      * above.
6814      *
6815      * Thus, the code above sets things up for the common case; and exceptional
6816      * cases need extra work; the common case is to make sure <scan> doesn't
6817      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6818      * count doesn't exceed the maximum permissible */
6819
6820     switch (OP(p)) {
6821     case REG_ANY:
6822         if (utf8_target) {
6823             while (scan < loceol && hardcount < max && *scan != '\n') {
6824                 scan += UTF8SKIP(scan);
6825                 hardcount++;
6826             }
6827         } else {
6828             while (scan < loceol && *scan != '\n')
6829                 scan++;
6830         }
6831         break;
6832     case SANY:
6833         if (utf8_target) {
6834             while (scan < loceol && hardcount < max) {
6835                 scan += UTF8SKIP(scan);
6836                 hardcount++;
6837             }
6838         }
6839         else
6840             scan = loceol;
6841         break;
6842     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6843         if (utf8_target && loceol - scan > max) {
6844
6845             /* <loceol> hadn't been adjusted in the UTF-8 case */
6846             scan +=  max;
6847         }
6848         else {
6849             scan = loceol;
6850         }
6851         break;
6852     case EXACT:
6853         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6854
6855         c = (U8)*STRING(p);
6856
6857         /* Can use a simple loop if the pattern char to match on is invariant
6858          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6859          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6860          * true iff it doesn't matter if the argument is in UTF-8 or not */
6861         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6862             if (utf8_target && loceol - scan > max) {
6863                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6864                  * since here, to match at all, 1 char == 1 byte */
6865                 loceol = scan + max;
6866             }
6867             while (scan < loceol && UCHARAT(scan) == c) {
6868                 scan++;
6869             }
6870         }
6871         else if (reginfo->is_utf8_pat) {
6872             if (utf8_target) {
6873                 STRLEN scan_char_len;
6874
6875                 /* When both target and pattern are UTF-8, we have to do
6876                  * string EQ */
6877                 while (hardcount < max
6878                        && scan < loceol
6879                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6880                        && memEQ(scan, STRING(p), scan_char_len))
6881                 {
6882                     scan += scan_char_len;
6883                     hardcount++;
6884                 }
6885             }
6886             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6887
6888                 /* Target isn't utf8; convert the character in the UTF-8
6889                  * pattern to non-UTF8, and do a simple loop */
6890                 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
6891                 while (scan < loceol && UCHARAT(scan) == c) {
6892                     scan++;
6893                 }
6894             } /* else pattern char is above Latin1, can't possibly match the
6895                  non-UTF-8 target */
6896         }
6897         else {
6898
6899             /* Here, the string must be utf8; pattern isn't, and <c> is
6900              * different in utf8 than not, so can't compare them directly.
6901              * Outside the loop, find the two utf8 bytes that represent c, and
6902              * then look for those in sequence in the utf8 string */
6903             U8 high = UTF8_TWO_BYTE_HI(c);
6904             U8 low = UTF8_TWO_BYTE_LO(c);
6905
6906             while (hardcount < max
6907                     && scan + 1 < loceol
6908                     && UCHARAT(scan) == high
6909                     && UCHARAT(scan + 1) == low)
6910             {
6911                 scan += 2;
6912                 hardcount++;
6913             }
6914         }
6915         break;
6916
6917     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
6918         assert(! reginfo->is_utf8_pat);
6919         /* FALL THROUGH */
6920     case EXACTFA:
6921         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6922         goto do_exactf;
6923
6924     case EXACTFL:
6925         RXp_MATCH_TAINTED_on(prog);
6926         utf8_flags = FOLDEQ_UTF8_LOCALE;
6927         goto do_exactf;
6928
6929     case EXACTF:   /* This node only generated for non-utf8 patterns */
6930         assert(! reginfo->is_utf8_pat);
6931         utf8_flags = 0;
6932         goto do_exactf;
6933
6934     case EXACTFU_SS:
6935     case EXACTFU:
6936         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6937
6938     do_exactf: {
6939         int c1, c2;
6940         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6941
6942         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6943
6944         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6945                                         reginfo))
6946         {
6947             if (c1 == CHRTEST_VOID) {
6948                 /* Use full Unicode fold matching */
6949                 char *tmpeol = reginfo->strend;
6950                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6951                 while (hardcount < max
6952                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6953                                              STRING(p), NULL, pat_len,
6954                                              reginfo->is_utf8_pat, utf8_flags))
6955                 {
6956                     scan = tmpeol;
6957                     tmpeol = reginfo->strend;
6958                     hardcount++;
6959                 }
6960             }
6961             else if (utf8_target) {
6962                 if (c1 == c2) {
6963                     while (scan < loceol
6964                            && hardcount < max
6965                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6966                     {
6967                         scan += UTF8SKIP(scan);
6968                         hardcount++;
6969                     }
6970                 }
6971                 else {
6972                     while (scan < loceol
6973                            && hardcount < max
6974                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6975                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6976                     {
6977                         scan += UTF8SKIP(scan);
6978                         hardcount++;
6979                     }
6980                 }
6981             }
6982             else if (c1 == c2) {
6983                 while (scan < loceol && UCHARAT(scan) == c1) {
6984                     scan++;
6985                 }
6986             }
6987             else {
6988                 while (scan < loceol &&
6989                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6990                 {
6991                     scan++;
6992                 }
6993             }
6994         }
6995         break;
6996     }
6997     case ANYOF:
6998     case ANYOF_WARN_SUPER:
6999         if (utf8_target) {
7000             while (hardcount < max
7001                    && scan < loceol
7002                    && reginclass(prog, p, (U8*)scan, utf8_target))
7003             {
7004                 scan += UTF8SKIP(scan);
7005                 hardcount++;
7006             }
7007         } else {
7008             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7009                 scan++;
7010         }
7011         break;
7012
7013     /* The argument (FLAGS) to all the POSIX node types is the class number */
7014
7015     case NPOSIXL:
7016         to_complement = 1;
7017         /* FALLTHROUGH */
7018
7019     case POSIXL:
7020         RXp_MATCH_TAINTED_on(prog);
7021         if (! utf8_target) {
7022             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7023                                                                    *scan)))
7024             {
7025                 scan++;
7026             }
7027         } else {
7028             while (hardcount < max && scan < loceol
7029                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7030                                                                   (U8 *) scan)))
7031             {
7032                 scan += UTF8SKIP(scan);
7033                 hardcount++;
7034             }
7035         }
7036         break;
7037
7038     case POSIXD:
7039         if (utf8_target) {
7040             goto utf8_posix;
7041         }
7042         /* FALLTHROUGH */
7043
7044     case POSIXA:
7045         if (utf8_target && loceol - scan > max) {
7046
7047             /* We didn't adjust <loceol> at the beginning of this routine
7048              * because is UTF-8, but it is actually ok to do so, since here, to
7049              * match, 1 char == 1 byte. */
7050             loceol = scan + max;
7051         }
7052         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7053             scan++;
7054         }
7055         break;
7056
7057     case NPOSIXD:
7058         if (utf8_target) {
7059             to_complement = 1;
7060             goto utf8_posix;
7061         }
7062         /* FALL THROUGH */
7063
7064     case NPOSIXA:
7065         if (! utf8_target) {
7066             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7067                 scan++;
7068             }
7069         }
7070         else {
7071
7072             /* The complement of something that matches only ASCII matches all
7073              * UTF-8 variant code points, plus everything in ASCII that isn't
7074              * in the class. */
7075             while (hardcount < max && scan < loceol
7076                    && (! UTF8_IS_INVARIANT(*scan)
7077                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7078             {
7079                 scan += UTF8SKIP(scan);
7080                 hardcount++;
7081             }
7082         }
7083         break;
7084
7085     case NPOSIXU:
7086         to_complement = 1;
7087         /* FALLTHROUGH */
7088
7089     case POSIXU:
7090         if (! utf8_target) {
7091             while (scan < loceol && to_complement
7092                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7093             {
7094                 scan++;
7095             }
7096         }
7097         else {
7098       utf8_posix:
7099             classnum = (_char_class_number) FLAGS(p);
7100             if (classnum < _FIRST_NON_SWASH_CC) {
7101
7102                 /* Here, a swash is needed for above-Latin1 code points.
7103                  * Process as many Latin1 code points using the built-in rules.
7104                  * Go to another loop to finish processing upon encountering
7105                  * the first Latin1 code point.  We could do that in this loop
7106                  * as well, but the other way saves having to test if the swash
7107                  * has been loaded every time through the loop: extra space to
7108                  * save a test. */
7109                 while (hardcount < max && scan < loceol) {
7110                     if (UTF8_IS_INVARIANT(*scan)) {
7111                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7112                                                                    classnum))))
7113                         {
7114                             break;
7115                         }
7116                         scan++;
7117                     }
7118                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7119                         if (! (to_complement
7120                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7121                                                                      *(scan + 1)),
7122                                                     classnum))))
7123                         {
7124                             break;
7125                         }
7126                         scan += 2;
7127                     }
7128                     else {
7129                         goto found_above_latin1;
7130                     }
7131
7132                     hardcount++;
7133                 }
7134             }
7135             else {
7136                 /* For these character classes, the knowledge of how to handle
7137                  * every code point is compiled in to Perl via a macro.  This
7138                  * code is written for making the loops as tight as possible.
7139                  * It could be refactored to save space instead */
7140                 switch (classnum) {
7141                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7142                                                if we revert the change of \v
7143                                                matching this */
7144                         /* FALL THROUGH */
7145                     case _CC_ENUM_PSXSPC:
7146                         while (hardcount < max
7147                                && scan < loceol
7148                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7149                         {
7150                             scan += UTF8SKIP(scan);
7151                             hardcount++;
7152                         }
7153                         break;
7154                     case _CC_ENUM_BLANK:
7155                         while (hardcount < max
7156                                && scan < loceol
7157                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7158                         {
7159                             scan += UTF8SKIP(scan);
7160                             hardcount++;
7161                         }
7162                         break;
7163                     case _CC_ENUM_XDIGIT:
7164                         while (hardcount < max
7165                                && scan < loceol
7166                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7167                         {
7168                             scan += UTF8SKIP(scan);
7169                             hardcount++;
7170                         }
7171                         break;
7172                     case _CC_ENUM_VERTSPACE:
7173                         while (hardcount < max
7174                                && scan < loceol
7175                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7176                         {
7177                             scan += UTF8SKIP(scan);
7178                             hardcount++;
7179                         }
7180                         break;
7181                     case _CC_ENUM_CNTRL:
7182                         while (hardcount < max
7183                                && scan < loceol
7184                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7185                         {
7186                             scan += UTF8SKIP(scan);
7187                             hardcount++;
7188                         }
7189                         break;
7190                     default:
7191                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7192                 }
7193             }
7194         }
7195         break;
7196
7197       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7198
7199         /* Load the swash if not already present */
7200         if (! PL_utf8_swash_ptrs[classnum]) {
7201             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7202             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7203                                         "utf8", swash_property_names[classnum],
7204                                         &PL_sv_undef, 1, 0, NULL, &flags);
7205         }
7206
7207         while (hardcount < max && scan < loceol
7208                && to_complement ^ cBOOL(_generic_utf8(
7209                                        classnum,
7210                                        scan,
7211                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7212                                                    (U8 *) scan,
7213                                                    TRUE))))
7214         {
7215             scan += UTF8SKIP(scan);
7216             hardcount++;
7217         }
7218         break;
7219
7220     case LNBREAK:
7221         if (utf8_target) {
7222             while (hardcount < max && scan < loceol &&
7223                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7224                 scan += c;
7225                 hardcount++;
7226             }
7227         } else {
7228             /* LNBREAK can match one or two latin chars, which is ok, but we
7229              * have to use hardcount in this situation, and throw away the
7230              * adjustment to <loceol> done before the switch statement */
7231             loceol = reginfo->strend;
7232             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7233                 scan+=c;
7234                 hardcount++;
7235             }
7236         }
7237         break;
7238
7239     case BOUND:
7240     case BOUNDA:
7241     case BOUNDL:
7242     case BOUNDU:
7243     case EOS:
7244     case GPOS:
7245     case KEEPS:
7246     case NBOUND:
7247     case NBOUNDA:
7248     case NBOUNDL:
7249     case NBOUNDU:
7250     case OPFAIL:
7251     case SBOL:
7252     case SEOL:
7253         /* These are all 0 width, so match right here or not at all. */
7254         break;
7255
7256     default:
7257         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7258         assert(0); /* NOTREACHED */
7259
7260     }
7261
7262     if (hardcount)
7263         c = hardcount;
7264     else
7265         c = scan - *startposp;
7266     *startposp = scan;
7267
7268     DEBUG_r({
7269         GET_RE_DEBUG_FLAGS_DECL;
7270         DEBUG_EXECUTE_r({
7271             SV * const prop = sv_newmortal();
7272             regprop(prog, prop, p);
7273             PerlIO_printf(Perl_debug_log,
7274                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7275                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7276         });
7277     });
7278
7279     return(c);
7280 }
7281
7282
7283 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7284 /*
7285 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7286 create a copy so that changes the caller makes won't change the shared one.
7287 If <altsvp> is non-null, will return NULL in it, for back-compat.
7288  */
7289 SV *
7290 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7291 {
7292     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7293
7294     if (altsvp) {
7295         *altsvp = NULL;
7296     }
7297
7298     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7299 }
7300 #endif
7301
7302 STATIC SV *
7303 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7304 {
7305     /* Returns the swash for the input 'node' in the regex 'prog'.
7306      * If <doinit> is 'true', will attempt to create the swash if not already
7307      *    done.
7308      * If <listsvp> is non-null, will return the printable contents of the
7309      *    swash.  This can be used to get debugging information even before the
7310      *    swash exists, by calling this function with 'doinit' set to false, in
7311      *    which case the components that will be used to eventually create the
7312      *    swash are returned  (in a printable form).
7313      * Tied intimately to how regcomp.c sets up the data structure */
7314
7315     dVAR;
7316     SV *sw  = NULL;
7317     SV *si  = NULL;         /* Input swash initialization string */
7318     SV*  invlist = NULL;
7319
7320     RXi_GET_DECL(prog,progi);
7321     const struct reg_data * const data = prog ? progi->data : NULL;
7322
7323     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7324
7325     assert(ANYOF_NONBITMAP(node));
7326
7327     if (data && data->count) {
7328         const U32 n = ARG(node);
7329
7330         if (data->what[n] == 's') {
7331             SV * const rv = MUTABLE_SV(data->data[n]);
7332             AV * const av = MUTABLE_AV(SvRV(rv));
7333             SV **const ary = AvARRAY(av);
7334             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7335
7336             si = *ary;  /* ary[0] = the string to initialize the swash with */
7337
7338             /* Elements 2 and 3 are either both present or both absent. [2] is
7339              * any inversion list generated at compile time; [3] indicates if
7340              * that inversion list has any user-defined properties in it. */
7341             if (av_len(av) >= 2) {
7342                 invlist = ary[2];
7343                 if (SvUV(ary[3])) {
7344                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7345                 }
7346             }
7347             else {
7348                 invlist = NULL;
7349             }
7350
7351             /* Element [1] is reserved for the set-up swash.  If already there,
7352              * return it; if not, create it and store it there */
7353             if (ary[1] && SvROK(ary[1])) {
7354                 sw = ary[1];
7355             }
7356             else if (si && doinit) {
7357
7358                 sw = _core_swash_init("utf8", /* the utf8 package */
7359                                       "", /* nameless */
7360                                       si,
7361                                       1, /* binary */
7362                                       0, /* not from tr/// */
7363                                       invlist,
7364                                       &swash_init_flags);
7365                 (void)av_store(av, 1, sw);
7366             }
7367         }
7368     }
7369
7370     /* If requested, return a printable version of what this swash matches */
7371     if (listsvp) {
7372         SV* matches_string = newSVpvn("", 0);
7373
7374         /* The swash should be used, if possible, to get the data, as it
7375          * contains the resolved data.  But this function can be called at
7376          * compile-time, before everything gets resolved, in which case we
7377          * return the currently best available information, which is the string
7378          * that will eventually be used to do that resolving, 'si' */
7379         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7380             && (si && si != &PL_sv_undef))
7381         {
7382             sv_catsv(matches_string, si);
7383         }
7384
7385         /* Add the inversion list to whatever we have.  This may have come from
7386          * the swash, or from an input parameter */
7387         if (invlist) {
7388             sv_catsv(matches_string, _invlist_contents(invlist));
7389         }
7390         *listsvp = matches_string;
7391     }
7392
7393     return sw;
7394 }
7395
7396 /*
7397  - reginclass - determine if a character falls into a character class
7398
7399   n is the ANYOF regnode
7400   p is the target string
7401   utf8_target tells whether p is in UTF-8.
7402
7403   Returns true if matched; false otherwise.
7404
7405   Note that this can be a synthetic start class, a combination of various
7406   nodes, so things you think might be mutually exclusive, such as locale,
7407   aren't.  It can match both locale and non-locale
7408
7409  */
7410
7411 STATIC bool
7412 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7413 {
7414     dVAR;
7415     const char flags = ANYOF_FLAGS(n);
7416     bool match = FALSE;
7417     UV c = *p;
7418
7419     PERL_ARGS_ASSERT_REGINCLASS;
7420
7421     /* If c is not already the code point, get it.  Note that
7422      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7423     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7424         STRLEN c_len = 0;
7425         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7426                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7427                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7428                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7429                  * UTF8_ALLOW_FFFF */
7430         if (c_len == (STRLEN)-1)
7431             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7432     }
7433
7434     /* If this character is potentially in the bitmap, check it */
7435     if (c < 256) {
7436         if (ANYOF_BITMAP_TEST(n, c))
7437             match = TRUE;
7438         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7439                 && ! utf8_target
7440                 && ! isASCII(c))
7441         {
7442             match = TRUE;
7443         }
7444         else if (flags & ANYOF_LOCALE) {
7445             RXp_MATCH_TAINTED_on(prog);
7446
7447             if ((flags & ANYOF_LOC_FOLD)
7448                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7449             {
7450                 match = TRUE;
7451             }
7452             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7453
7454                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7455                  * if the class includes the Posix character class given by
7456                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7457                  * complemented Posix class given by int(bit/2).  So we loop
7458                  * through the bits, each time changing whether we complement
7459                  * the result or not.  Suppose for the sake of illustration
7460                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7461                  * is set, it means there is a match for this ANYOF node if the
7462                  * character is in the class given by the expression (0 / 2 = 0
7463                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7464                  * and since 'to_complement' is 0, the result will stay TRUE,
7465                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7466                  * bit 1 is 1.  That means there is a match if the character
7467                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7468                  * but will on bit 1.  On the second iteration 'to_complement'
7469                  * will be 1, so the exclusive or will reverse things, so we
7470                  * are testing for \W.  On the third iteration, 'to_complement'
7471                  * will be 0, and we would be testing for \s; the fourth
7472                  * iteration would test for \S, etc.
7473                  *
7474                  * Note that this code assumes that all the classes are closed
7475                  * under folding.  For example, if a character matches \w, then
7476                  * its fold does too; and vice versa.  This should be true for
7477                  * any well-behaved locale for all the currently defined Posix
7478                  * classes, except for :lower: and :upper:, which are handled
7479                  * by the pseudo-class :cased: which matches if either of the
7480                  * other two does.  To get rid of this assumption, an outer
7481                  * loop could be used below to iterate over both the source
7482                  * character, and its fold (if different) */
7483
7484                 int count = 0;
7485                 int to_complement = 0;
7486                 while (count < ANYOF_MAX) {
7487                     if (ANYOF_CLASS_TEST(n, count)
7488                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7489                     {
7490                         match = TRUE;
7491                         break;
7492                     }
7493                     count++;
7494                     to_complement ^= 1;
7495                 }
7496             }
7497         }
7498     }
7499
7500     /* If the bitmap didn't (or couldn't) match, and something outside the
7501      * bitmap could match, try that.  Locale nodes specify completely the
7502      * behavior of code points in the bit map (otherwise, a utf8 target would
7503      * cause them to be treated as Unicode and not locale), except in
7504      * the very unlikely event when this node is a synthetic start class, which
7505      * could be a combination of locale and non-locale nodes.  So allow locale
7506      * to match for the synthetic start class, which will give a false
7507      * positive that will be resolved when the match is done again as not part
7508      * of the synthetic start class */
7509     if (!match) {
7510         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7511             match = TRUE;       /* Everything above 255 matches */
7512         }
7513         else if (ANYOF_NONBITMAP(n)
7514                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7515                      || (utf8_target
7516                          && (c >=256
7517                              || (! (flags & ANYOF_LOCALE))
7518                              || OP(n) == ANYOF_SYNTHETIC))))
7519         {
7520             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7521             if (sw) {
7522                 U8 * utf8_p;
7523                 if (utf8_target) {
7524                     utf8_p = (U8 *) p;
7525                 } else { /* Convert to utf8 */
7526                     STRLEN len = 1;
7527                     utf8_p = bytes_to_utf8(p, &len);
7528                 }
7529
7530                 if (swash_fetch(sw, utf8_p, TRUE)) {
7531                     match = TRUE;
7532                 }
7533
7534                 /* If we allocated a string above, free it */
7535                 if (! utf8_target) Safefree(utf8_p);
7536             }
7537         }
7538
7539         if (UNICODE_IS_SUPER(c)
7540             && OP(n) == ANYOF_WARN_SUPER
7541             && ckWARN_d(WARN_NON_UNICODE))
7542         {
7543             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7544                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7545         }
7546     }
7547
7548     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7549     return cBOOL(flags & ANYOF_INVERT) ^ match;
7550 }
7551
7552 STATIC U8 *
7553 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7554 {
7555     /* return the position 'off' UTF-8 characters away from 's', forward if
7556      * 'off' >= 0, backwards if negative.  But don't go outside of position
7557      * 'lim', which better be < s  if off < 0 */
7558
7559     dVAR;
7560
7561     PERL_ARGS_ASSERT_REGHOP3;
7562
7563     if (off >= 0) {
7564         while (off-- && s < lim) {
7565             /* XXX could check well-formedness here */
7566             s += UTF8SKIP(s);
7567         }
7568     }
7569     else {
7570         while (off++ && s > lim) {
7571             s--;
7572             if (UTF8_IS_CONTINUED(*s)) {
7573                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7574                     s--;
7575             }
7576             /* XXX could check well-formedness here */
7577         }
7578     }
7579     return s;
7580 }
7581
7582 #ifdef XXX_dmq
7583 /* there are a bunch of places where we use two reghop3's that should
7584    be replaced with this routine. but since thats not done yet
7585    we ifdef it out - dmq
7586 */
7587 STATIC U8 *
7588 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7589 {
7590     dVAR;
7591
7592     PERL_ARGS_ASSERT_REGHOP4;
7593
7594     if (off >= 0) {
7595         while (off-- && s < rlim) {
7596             /* XXX could check well-formedness here */
7597             s += UTF8SKIP(s);
7598         }
7599     }
7600     else {
7601         while (off++ && s > llim) {
7602             s--;
7603             if (UTF8_IS_CONTINUED(*s)) {
7604                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7605                     s--;
7606             }
7607             /* XXX could check well-formedness here */
7608         }
7609     }
7610     return s;
7611 }
7612 #endif
7613
7614 STATIC U8 *
7615 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7616 {
7617     dVAR;
7618
7619     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7620
7621     if (off >= 0) {
7622         while (off-- && s < lim) {
7623             /* XXX could check well-formedness here */
7624             s += UTF8SKIP(s);
7625         }
7626         if (off >= 0)
7627             return NULL;
7628     }
7629     else {
7630         while (off++ && s > lim) {
7631             s--;
7632             if (UTF8_IS_CONTINUED(*s)) {
7633                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7634                     s--;
7635             }
7636             /* XXX could check well-formedness here */
7637         }
7638         if (off <= 0)
7639             return NULL;
7640     }
7641     return s;
7642 }
7643
7644
7645 /* when executing a regex that may have (?{}), extra stuff needs setting
7646    up that will be visible to the called code, even before the current
7647    match has finished. In particular:
7648
7649    * $_ is localised to the SV currently being matched;
7650    * pos($_) is created if necessary, ready to be updated on each call-out
7651      to code;
7652    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7653      isn't set until the current pattern is successfully finished), so that
7654      $1 etc of the match-so-far can be seen;
7655    * save the old values of subbeg etc of the current regex, and  set then
7656      to the current string (again, this is normally only done at the end
7657      of execution)
7658 */
7659
7660 static void
7661 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7662 {
7663     MAGIC *mg;
7664     regexp *const rex = ReANY(reginfo->prog);
7665     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7666
7667     eval_state->rex = rex;
7668
7669     if (reginfo->sv) {
7670         /* Make $_ available to executed code. */
7671         if (reginfo->sv != DEFSV) {
7672             SAVE_DEFSV;
7673             DEFSV_set(reginfo->sv);
7674         }
7675
7676         if (!(mg = mg_find_mglob(reginfo->sv))) {
7677             /* prepare for quick setting of pos */
7678             mg = sv_magicext_mglob(reginfo->sv);
7679             mg->mg_len = -1;
7680         }
7681         eval_state->pos_magic = mg;
7682         eval_state->pos       = mg->mg_len;
7683         eval_state->pos_flags = mg->mg_flags;
7684     }
7685     else
7686         eval_state->pos_magic = NULL;
7687
7688     if (!PL_reg_curpm) {
7689         /* PL_reg_curpm is a fake PMOP that we can attach the current
7690          * regex to and point PL_curpm at, so that $1 et al are visible
7691          * within a /(?{})/. It's just allocated once per interpreter the
7692          * first time its needed */
7693         Newxz(PL_reg_curpm, 1, PMOP);
7694 #ifdef USE_ITHREADS
7695         {
7696             SV* const repointer = &PL_sv_undef;
7697             /* this regexp is also owned by the new PL_reg_curpm, which
7698                will try to free it.  */
7699             av_push(PL_regex_padav, repointer);
7700             PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7701             PL_regex_pad = AvARRAY(PL_regex_padav);
7702         }
7703 #endif
7704     }
7705     SET_reg_curpm(reginfo->prog);
7706     eval_state->curpm = PL_curpm;
7707     PL_curpm = PL_reg_curpm;
7708     if (RXp_MATCH_COPIED(rex)) {
7709         /*  Here is a serious problem: we cannot rewrite subbeg,
7710             since it may be needed if this match fails.  Thus
7711             $` inside (?{}) could fail... */
7712         eval_state->subbeg     = rex->subbeg;
7713         eval_state->sublen     = rex->sublen;
7714         eval_state->suboffset  = rex->suboffset;
7715         eval_state->subcoffset = rex->subcoffset;
7716 #ifdef PERL_ANY_COW
7717         eval_state->saved_copy = rex->saved_copy;
7718 #endif
7719         RXp_MATCH_COPIED_off(rex);
7720     }
7721     else
7722         eval_state->subbeg = NULL;
7723     rex->subbeg = (char *)reginfo->strbeg;
7724     rex->suboffset = 0;
7725     rex->subcoffset = 0;
7726     rex->sublen = reginfo->strend - reginfo->strbeg;
7727 }
7728
7729
7730 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7731
7732 static void
7733 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7734 {
7735     dVAR;
7736     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7737     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7738     regmatch_slab *s;
7739
7740     Safefree(aux->poscache);
7741
7742     if (eval_state) {
7743
7744         /* undo the effects of S_setup_eval_state() */
7745
7746         if (eval_state->subbeg) {
7747             regexp * const rex = eval_state->rex;
7748             rex->subbeg     = eval_state->subbeg;
7749             rex->sublen     = eval_state->sublen;
7750             rex->suboffset  = eval_state->suboffset;
7751             rex->subcoffset = eval_state->subcoffset;
7752 #ifdef PERL_ANY_COW
7753             rex->saved_copy = eval_state->saved_copy;
7754 #endif
7755             RXp_MATCH_COPIED_on(rex);
7756         }
7757         if (eval_state->pos_magic)
7758         {
7759             eval_state->pos_magic->mg_len = eval_state->pos;
7760             eval_state->pos_magic->mg_flags =
7761                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
7762                | (eval_state->pos_flags & MGf_BYTES);
7763         }
7764
7765         PL_curpm = eval_state->curpm;
7766     }
7767
7768     PL_regmatch_state = aux->old_regmatch_state;
7769     PL_regmatch_slab  = aux->old_regmatch_slab;
7770
7771     /* free all slabs above current one - this must be the last action
7772      * of this function, as aux and eval_state are allocated within
7773      * slabs and may be freed here */
7774
7775     s = PL_regmatch_slab->next;
7776     if (s) {
7777         PL_regmatch_slab->next = NULL;
7778         while (s) {
7779             regmatch_slab * const osl = s;
7780             s = s->next;
7781             Safefree(osl);
7782         }
7783     }
7784 }
7785
7786
7787 STATIC void
7788 S_to_utf8_substr(pTHX_ regexp *prog)
7789 {
7790     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7791      * on the converted value */
7792
7793     int i = 1;
7794
7795     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7796
7797     do {
7798         if (prog->substrs->data[i].substr
7799             && !prog->substrs->data[i].utf8_substr) {
7800             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7801             prog->substrs->data[i].utf8_substr = sv;
7802             sv_utf8_upgrade(sv);
7803             if (SvVALID(prog->substrs->data[i].substr)) {
7804                 if (SvTAIL(prog->substrs->data[i].substr)) {
7805                     /* Trim the trailing \n that fbm_compile added last
7806                        time.  */
7807                     SvCUR_set(sv, SvCUR(sv) - 1);
7808                     /* Whilst this makes the SV technically "invalid" (as its
7809                        buffer is no longer followed by "\0") when fbm_compile()
7810                        adds the "\n" back, a "\0" is restored.  */
7811                     fbm_compile(sv, FBMcf_TAIL);
7812                 } else
7813                     fbm_compile(sv, 0);
7814             }
7815             if (prog->substrs->data[i].substr == prog->check_substr)
7816                 prog->check_utf8 = sv;
7817         }
7818     } while (i--);
7819 }
7820
7821 STATIC bool
7822 S_to_byte_substr(pTHX_ regexp *prog)
7823 {
7824     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7825      * on the converted value; returns FALSE if can't be converted. */
7826
7827     dVAR;
7828     int i = 1;
7829
7830     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7831
7832     do {
7833         if (prog->substrs->data[i].utf8_substr
7834             && !prog->substrs->data[i].substr) {
7835             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7836             if (! sv_utf8_downgrade(sv, TRUE)) {
7837                 return FALSE;
7838             }
7839             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7840                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7841                     /* Trim the trailing \n that fbm_compile added last
7842                         time.  */
7843                     SvCUR_set(sv, SvCUR(sv) - 1);
7844                     fbm_compile(sv, FBMcf_TAIL);
7845                 } else
7846                     fbm_compile(sv, 0);
7847             }
7848             prog->substrs->data[i].substr = sv;
7849             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7850                 prog->check_substr = sv;
7851         }
7852     } while (i--);
7853
7854     return TRUE;
7855 }
7856
7857 /*
7858  * Local variables:
7859  * c-indentation-style: bsd
7860  * c-basic-offset: 4
7861  * indent-tabs-mode: nil
7862  * End:
7863  *
7864  * ex: set ts=8 sts=4 sw=4 et:
7865  */