regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102 /* Valid for non-utf8 strings: avoids the reginclass
 103  * call if there are no complications: i.e., if everything matchable is
 104  * straight forward in the bitmap */
 105 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 106                                               : ANYOF_BITMAP_TEST(p,*(c)))
 107
 108 /*
 109  * Forwards.
 110  */
 111
 112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 114
 115 #define HOPc(pos,off) \
 116         (char *)(reginfo->is_utf8_target \
 117             ? reghop3((U8*)pos, off, \
 118                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 119             : (U8*)(pos + off))
 120 #define HOPBACKc(pos, off) \
 121         (char*)(reginfo->is_utf8_target \
 122             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 123             : (pos - off >= reginfo->strbeg)    \
 124                 ? (U8*)pos - off                \
 125                 : NULL)
 126
 127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 129
 130
 131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 132 #define NEXTCHR_IS_EOS (nextchr < 0)
 133
 134 #define SET_nextchr \
 135     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 136
 137 #define SET_locinput(p) \
 138     locinput = (p);  \
 139     SET_nextchr
 140
 141
 142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 143         if (!swash_ptr) {                                                     \
 144             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 145             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 146                                          1, 0, NULL, &flags);                 \
 147             assert(swash_ptr);                                                \
 148         }                                                                     \
 149     } STMT_END
 150
 151 /* If in debug mode, we test that a known character properly matches */
 152 #ifdef DEBUGGING
 153 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 154                                           property_name,                      \
 155                                           utf8_char_in_property)              \
 156         LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 157         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 158 #else
 159 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 160                                           property_name,                      \
 161                                           utf8_char_in_property)              \
 162         LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 163 #endif
 164
 165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 166                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 167                                         swash_property_names[_CC_WORDCHAR],   \
 168                                         LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 169
 170 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 171     STMT_START {                                                              \
 172         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 173                                        "_X_regular_begin",                    \
 174                                        LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 175         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 176                                        "_X_extend",                           \
 177                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 178     } STMT_END
 179
 180 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 182
 183 /* for use after a quantifier and before an EXACT-like node -- japhy */
 184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 185  *
 186  * NOTE that *nothing* that affects backtracking should be in here, specifically
 187  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 188  * node that is in between two EXACT like nodes when ascertaining what the required
 189  * "follow" character is. This should probably be moved to regex compile time
 190  * although it may be done at run time beause of the REF possibility - more
 191  * investigation required. -- demerphq
 192 */
 193 #define JUMPABLE(rn) (      \
 194     OP(rn) == OPEN ||       \
 195     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 196     OP(rn) == EVAL ||   \
 197     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 198     OP(rn) == PLUS || OP(rn) == MINMOD || \
 199     OP(rn) == KEEPS || \
 200     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 201 )
 202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 203
 204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 205
 206 #if 0
 207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 208    we don't need this definition. */
 209 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 210 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 212
 213 #else
 214 /* ... so we use this as its faster. */
 215 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 216 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 217 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 219
 220 #endif
 221
 222 /*
 223   Search for mandatory following text node; for lookahead, the text must
 224   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 225 */
 226 #define FIND_NEXT_IMPT(rn) STMT_START { \
 227     while (JUMPABLE(rn)) { \
 228         const OPCODE type = OP(rn); \
 229         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 230             rn = NEXTOPER(NEXTOPER(rn)); \
 231         else if (type == PLUS) \
 232             rn = NEXTOPER(rn); \
 233         else if (type == IFMATCH) \
 234             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 235         else rn += NEXT_OFF(rn); \
 236     } \
 237 } STMT_END
 238
 239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 240  * These are for the pre-composed Hangul syllables, which are all in a
 241  * contiguous block and arranged there in such a way so as to facilitate
 242  * alorithmic determination of their characteristics.  As such, they don't need
 243  * a swash, but can be determined by simple arithmetic.  Almost all are
 244  * GCB=LVT, but every 28th one is a GCB=LV */
 245 #define SBASE 0xAC00    /* Start of block */
 246 #define SCount 11172    /* Length of block */
 247 #define TCount 28
 248
 249 #define SLAB_FIRST(s) (&(s)->states[0])
 250 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 251
 252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 254 static regmatch_state * S_push_slab(pTHX);
 255
 256 #define REGCP_PAREN_ELEMS 3
 257 #define REGCP_OTHER_ELEMS 3
 258 #define REGCP_FRAME_ELEMS 1
 259 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 260  * are needed for the regexp context stack bookkeeping. */
 261
 262 STATIC CHECKPOINT
 263 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 264 {
 265     dVAR;
 266     const int retval = PL_savestack_ix;
 267     const int paren_elems_to_push =
 268                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 269     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 270     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 271     I32 p;
 272     GET_RE_DEBUG_FLAGS_DECL;
 273
 274     PERL_ARGS_ASSERT_REGCPPUSH;
 275
 276     if (paren_elems_to_push < 0)
 277         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 278                    paren_elems_to_push);
 279
 280     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 281         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 282                    " out of range (%lu-%ld)",
 283                    total_elems,
 284                    (unsigned long)maxopenparen,
 285                    (long)parenfloor);
 286
 287     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 288
 289     DEBUG_BUFFERS_r(
 290         if ((int)maxopenparen > (int)parenfloor)
 291             PerlIO_printf(Perl_debug_log,
 292                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 293                 PTR2UV(rex),
 294                 PTR2UV(rex->offs)
 295             );
 296     );
 297     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 298 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 299         SSPUSHIV(rex->offs[p].end);
 300         SSPUSHIV(rex->offs[p].start);
 301         SSPUSHINT(rex->offs[p].start_tmp);
 302         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 303             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 304             (UV)p,
 305             (IV)rex->offs[p].start,
 306             (IV)rex->offs[p].start_tmp,
 307             (IV)rex->offs[p].end
 308         ));
 309     }
 310 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 311     SSPUSHINT(maxopenparen);
 312     SSPUSHINT(rex->lastparen);
 313     SSPUSHINT(rex->lastcloseparen);
 314     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 315
 316     return retval;
 317 }
 318
 319 /* These are needed since we do not localize EVAL nodes: */
 320 #define REGCP_SET(cp)                                           \
 321     DEBUG_STATE_r(                                              \
 322             PerlIO_printf(Perl_debug_log,                       \
 323                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 324                 (IV)PL_savestack_ix));                          \
 325     cp = PL_savestack_ix
 326
 327 #define REGCP_UNWIND(cp)                                        \
 328     DEBUG_STATE_r(                                              \
 329         if (cp != PL_savestack_ix)                              \
 330             PerlIO_printf(Perl_debug_log,                       \
 331                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 332                 (IV)(cp), (IV)PL_savestack_ix));                \
 333     regcpblow(cp)
 334
 335 #define UNWIND_PAREN(lp, lcp)               \
 336     for (n = rex->lastparen; n > lp; n--)   \
 337         rex->offs[n].end = -1;              \
 338     rex->lastparen = n;                     \
 339     rex->lastcloseparen = lcp;
 340
 341
 342 STATIC void
 343 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 344 {
 345     dVAR;
 346     UV i;
 347     U32 paren;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     PERL_ARGS_ASSERT_REGCPPOP;
 351
 352     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 353     i = SSPOPUV;
 354     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 355     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 356     rex->lastcloseparen = SSPOPINT;
 357     rex->lastparen = SSPOPINT;
 358     *maxopenparen_p = SSPOPINT;
 359
 360     i -= REGCP_OTHER_ELEMS;
 361     /* Now restore the parentheses context. */
 362     DEBUG_BUFFERS_r(
 363         if (i || rex->lastparen + 1 <= rex->nparens)
 364             PerlIO_printf(Perl_debug_log,
 365                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 366                 PTR2UV(rex),
 367                 PTR2UV(rex->offs)
 368             );
 369     );
 370     paren = *maxopenparen_p;
 371     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 372         SSize_t tmps;
 373         rex->offs[paren].start_tmp = SSPOPINT;
 374         rex->offs[paren].start = SSPOPIV;
 375         tmps = SSPOPIV;
 376         if (paren <= rex->lastparen)
 377             rex->offs[paren].end = tmps;
 378         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 379             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 380             (UV)paren,
 381             (IV)rex->offs[paren].start,
 382             (IV)rex->offs[paren].start_tmp,
 383             (IV)rex->offs[paren].end,
 384             (paren > rex->lastparen ? "(skipped)" : ""));
 385         );
 386         paren--;
 387     }
 388 #if 1
 389     /* It would seem that the similar code in regtry()
 390      * already takes care of this, and in fact it is in
 391      * a better location to since this code can #if 0-ed out
 392      * but the code in regtry() is needed or otherwise tests
 393      * requiring null fields (pat.t#187 and split.t#{13,14}
 394      * (as of patchlevel 7877)  will fail.  Then again,
 395      * this code seems to be necessary or otherwise
 396      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 397      * --jhi updated by dapm */
 398     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 399         if (i > *maxopenparen_p)
 400             rex->offs[i].start = -1;
 401         rex->offs[i].end = -1;
 402         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 403             "    \\%"UVuf": %s   ..-1 undeffing\n",
 404             (UV)i,
 405             (i > *maxopenparen_p) ? "-1" : "  "
 406         ));
 407     }
 408 #endif
 409 }
 410
 411 /* restore the parens and associated vars at savestack position ix,
 412  * but without popping the stack */
 413
 414 STATIC void
 415 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 416 {
 417     I32 tmpix = PL_savestack_ix;
 418     PL_savestack_ix = ix;
 419     regcppop(rex, maxopenparen_p);
 420     PL_savestack_ix = tmpix;
 421 }
 422
 423 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 424
 425 STATIC bool
 426 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 427 {
 428     /* Returns a boolean as to whether or not 'character' is a member of the
 429      * Posix character class given by 'classnum' that should be equivalent to a
 430      * value in the typedef '_char_class_number'.
 431      *
 432      * Ideally this could be replaced by a just an array of function pointers
 433      * to the C library functions that implement the macros this calls.
 434      * However, to compile, the precise function signatures are required, and
 435      * these may vary from platform to to platform.  To avoid having to figure
 436      * out what those all are on each platform, I (khw) am using this method,
 437      * which adds an extra layer of function call overhead (unless the C
 438      * optimizer strips it away).  But we don't particularly care about
 439      * performance with locales anyway. */
 440
 441     switch ((_char_class_number) classnum) {
 442         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 443         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 444         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 445         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 446         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 447                                         || isUPPER_LC(character);
 448         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 449         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 450         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 451         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 452         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 453         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 454         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 455         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 456         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 457         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 458         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 459         default:    /* VERTSPACE should never occur in locales */
 460             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 461     }
 462
 463     assert(0); /* NOTREACHED */
 464     return FALSE;
 465 }
 466
 467 STATIC bool
 468 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 469 {
 470     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 471      * 'character' is a member of the Posix character class given by 'classnum'
 472      * that should be equivalent to a value in the typedef
 473      * '_char_class_number'.
 474      *
 475      * This just calls isFOO_lc on the code point for the character if it is in
 476      * the range 0-255.  Outside that range, all characters avoid Unicode
 477      * rules, ignoring any locale.  So use the Unicode function if this class
 478      * requires a swash, and use the Unicode macro otherwise. */
 479
 480     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 481
 482     if (UTF8_IS_INVARIANT(*character)) {
 483         return isFOO_lc(classnum, *character);
 484     }
 485     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 486         return isFOO_lc(classnum,
 487                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 488     }
 489
 490     if (classnum < _FIRST_NON_SWASH_CC) {
 491
 492         /* Initialize the swash unless done already */
 493         if (! PL_utf8_swash_ptrs[classnum]) {
 494             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 495             PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 496                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 497         }
 498
 499         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 500                                  character,
 501                                  TRUE /* is UTF */ ));
 502     }
 503
 504     switch ((_char_class_number) classnum) {
 505         case _CC_ENUM_SPACE:
 506         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 507
 508         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 509         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 510         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 511         default:                 return 0;  /* Things like CNTRL are always
 512                                                below 256 */
 513     }
 514
 515     assert(0); /* NOTREACHED */
 516     return FALSE;
 517 }
 518
 519 /*
 520  * pregexec and friends
 521  */
 522
 523 #ifndef PERL_IN_XSUB_RE
 524 /*
 525  - pregexec - match a regexp against a string
 526  */
 527 I32
 528 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 529          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 530 /* stringarg: the point in the string at which to begin matching */
 531 /* strend:    pointer to null at end of string */
 532 /* strbeg:    real beginning of string */
 533 /* minend:    end of match must be >= minend bytes after stringarg. */
 534 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 535  *            itself is accessed via the pointers above */
 536 /* nosave:    For optimizations. */
 537 {
 538     PERL_ARGS_ASSERT_PREGEXEC;
 539
 540     return
 541         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 542                       nosave ? 0 : REXEC_COPY_STR);
 543 }
 544 #endif
 545
 546 /*
 547  * Need to implement the following flags for reg_anch:
 548  *
 549  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 550  * USE_INTUIT_ML
 551  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 552  * INTUIT_AUTORITATIVE_ML
 553  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 554  * INTUIT_ONCE_ML
 555  *
 556  * Another flag for this function: SECOND_TIME (so that float substrs
 557  * with giant delta may be not rechecked).
 558  */
 559
 560 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 561    Otherwise, only SvCUR(sv) is used to get strbeg. */
 562
 563 /* XXXX Some places assume that there is a fixed substring.
 564         An update may be needed if optimizer marks as "INTUITable"
 565         RExen without fixed substrings.  Similarly, it is assumed that
 566         lengths of all the strings are no more than minlen, thus they
 567         cannot come from lookahead.
 568         (Or minlen should take into account lookahead.)
 569   NOTE: Some of this comment is not correct. minlen does now take account
 570   of lookahead/behind. Further research is required. -- demerphq
 571
 572 */
 573
 574 /* A failure to find a constant substring means that there is no need to make
 575    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 576    finding a substring too deep into the string means that fewer calls to
 577    regtry() should be needed.
 578
 579    REx compiler's optimizer found 4 possible hints:
 580         a) Anchored substring;
 581         b) Fixed substring;
 582         c) Whether we are anchored (beginning-of-line or \G);
 583         d) First node (of those at offset 0) which may distinguish positions;
 584    We use a)b)d) and multiline-part of c), and try to find a position in the
 585    string which does not contradict any of them.
 586  */
 587
 588 /* Most of decisions we do here should have been done at compile time.
 589    The nodes of the REx which we used for the search should have been
 590    deleted from the finite automaton. */
 591
 592 /* args:
 593  * rx:     the regex to match against
 594  * sv:     the SV being matched: only used for utf8 flag; the string
 595  *         itself is accessed via the pointers below. Note that on
 596  *         something like an overloaded SV, SvPOK(sv) may be false
 597  *         and the string pointers may point to something unrelated to
 598  *         the SV itself.
 599  * strbeg: real beginning of string
 600  * strpos: the point in the string at which to begin matching
 601  * strend: pointer to the byte following the last char of the string
 602  * flags   currently unused; set to 0
 603  * data:   currently unused; set to NULL
 604  */
 605
 606 char *
 607 Perl_re_intuit_start(pTHX_
 608                     REGEXP * const rx,
 609                     SV *sv,
 610                     const char * const strbeg,
 611                     char *strpos,
 612                     char *strend,
 613                     const U32 flags,
 614                     re_scream_pos_data *data)
 615 {
 616     dVAR;
 617     struct regexp *const prog = ReANY(rx);
 618     SSize_t start_shift = 0;
 619     /* Should be nonnegative! */
 620     SSize_t end_shift   = 0;
 621     char *s;
 622     SV *check;
 623     char *t;
 624     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 625     I32 ml_anch;
 626     char *other_last = NULL;    /* other substr checked before this */
 627     char *check_at = NULL;              /* check substr found at this pos */
 628     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 629     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 630     RXi_GET_DECL(prog,progi);
 631     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 632     regmatch_info *const reginfo = &reginfo_buf;
 633 #ifdef DEBUGGING
 634     const char * const i_strpos = strpos;
 635 #endif
 636     GET_RE_DEBUG_FLAGS_DECL;
 637
 638     PERL_ARGS_ASSERT_RE_INTUIT_START;
 639     PERL_UNUSED_ARG(flags);
 640     PERL_UNUSED_ARG(data);
 641
 642     /* CHR_DIST() would be more correct here but it makes things slow. */
 643     if (prog->minlen > strend - strpos) {
 644         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 645                               "String too short... [re_intuit_start]\n"));
 646         goto fail;
 647     }
 648
 649     reginfo->is_utf8_target = cBOOL(utf8_target);
 650     reginfo->info_aux = NULL;
 651     reginfo->strbeg = strbeg;
 652     reginfo->strend = strend;
 653     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 654     reginfo->intuit = 1;
 655     /* not actually used within intuit, but zero for safety anyway */
 656     reginfo->poscache_maxiter = 0;
 657
 658     if (utf8_target) {
 659         if (!prog->check_utf8 && prog->check_substr)
 660             to_utf8_substr(prog);
 661         check = prog->check_utf8;
 662     } else {
 663         if (!prog->check_substr && prog->check_utf8) {
 664             if (! to_byte_substr(prog)) {
 665                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 666             }
 667         }
 668         check = prog->check_substr;
 669     }
 670     if ((prog->extflags & RXf_ANCH)     /* Match at beg-of-str or after \n */
 671          && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
 672     {
 673         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 674                      || ( (prog->extflags & RXf_ANCH_BOL)
 675                           && !multiline ) );    /* Check after \n? */
 676
 677         if (!ml_anch) {
 678           if (    !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 679                && (strpos != strbeg)) {
 680               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 681               goto fail;
 682           }
 683           if (prog->check_offset_min == prog->check_offset_max
 684               && !(prog->extflags & RXf_CANY_SEEN)
 685               && ! multiline)   /* /m can cause \n's to match that aren't
 686                                    accounted for in the string max length.
 687                                    See [perl #115242] */
 688           {
 689             /* Substring at constant offset from beg-of-str... */
 690             SSize_t slen;
 691
 692             s = HOP3c(strpos, prog->check_offset_min, strend);
 693
 694             if (SvTAIL(check)) {
 695                 slen = SvCUR(check);    /* >= 1 */
 696
 697                 if ( strend - s > slen || strend - s < slen - 1
 698                      || (strend - s == slen && strend[-1] != '\n')) {
 699                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 700                     goto fail_finish;
 701                 }
 702                 /* Now should match s[0..slen-2] */
 703                 slen--;
 704                 if (slen && (*SvPVX_const(check) != *s
 705                              || (slen > 1
 706                                  && memNE(SvPVX_const(check), s, slen)))) {
 707                   report_neq:
 708                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 709                     goto fail_finish;
 710                 }
 711             }
 712             else if (*SvPVX_const(check) != *s
 713                      || ((slen = SvCUR(check)) > 1
 714                          && memNE(SvPVX_const(check), s, slen)))
 715                 goto report_neq;
 716             check_at = s;
 717             goto success_at_start;
 718           }
 719         }
 720         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 721         s = strpos;
 722         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 723         end_shift = prog->check_end_shift;
 724
 725         if (!ml_anch) {
 726             const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
 727                                          - (SvTAIL(check) != 0);
 728             const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 729
 730             if (end_shift < eshift)
 731                 end_shift = eshift;
 732         }
 733     }
 734     else {                              /* Can match at random position */
 735         ml_anch = 0;
 736         s = strpos;
 737         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 738         end_shift = prog->check_end_shift;
 739
 740         /* end shift should be non negative here */
 741     }
 742
 743 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 744     if (end_shift < 0)
 745         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 746                    (IV)end_shift, RX_PRECOMP(prog));
 747 #endif
 748
 749   restart:
 750     /* Find a possible match in the region s..strend by looking for
 751        the "check" substring in the region corrected by start/end_shift. */
 752
 753     {
 754         SSize_t srch_start_shift = start_shift;
 755         SSize_t srch_end_shift = end_shift;
 756         U8* start_point;
 757         U8* end_point;
 758         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 759             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 760             srch_start_shift = strbeg - s;
 761         }
 762     DEBUG_OPTIMISE_MORE_r({
 763         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 764             (IV)prog->check_offset_min,
 765             (IV)srch_start_shift,
 766             (IV)srch_end_shift,
 767             (IV)prog->check_end_shift);
 768     });
 769
 770         if (prog->extflags & RXf_CANY_SEEN) {
 771             start_point= (U8*)(s + srch_start_shift);
 772             end_point= (U8*)(strend - srch_end_shift);
 773         } else {
 774             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 775             end_point= HOP3(strend, -srch_end_shift, strbeg);
 776         }
 777         DEBUG_OPTIMISE_MORE_r({
 778             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 779                 (int)(end_point - start_point),
 780                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 781                 start_point);
 782         });
 783
 784         s = fbm_instr( start_point, end_point,
 785                       check, multiline ? FBMrf_MULTILINE : 0);
 786     }
 787     /* Update the count-of-usability, remove useless subpatterns,
 788         unshift s.  */
 789
 790     DEBUG_EXECUTE_r({
 791         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 792             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 793         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 794                           (s ? "Found" : "Did not find"),
 795             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 796                 ? "anchored" : "floating"),
 797             quoted,
 798             RE_SV_TAIL(check),
 799             (s ? " at offset " : "...\n") );
 800     });
 801
 802     if (!s)
 803         goto fail_finish;
 804     /* Finish the diagnostic message */
 805     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 806
 807     /* XXX dmq: first branch is for positive lookbehind...
 808        Our check string is offset from the beginning of the pattern.
 809        So we need to do any stclass tests offset forward from that
 810        point. I think. :-(
 811      */
 812
 813
 814
 815     check_at=s;
 816
 817
 818     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 819        Start with the other substr.
 820        XXXX no SCREAM optimization yet - and a very coarse implementation
 821        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 822                 *always* match.  Probably should be marked during compile...
 823        Probably it is right to do no SCREAM here...
 824      */
 825
 826     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 827                 : (prog->float_substr && prog->anchored_substr))
 828     {
 829         /* Take into account the "other" substring. */
 830         /* XXXX May be hopelessly wrong for UTF... */
 831         if (!other_last)
 832             other_last = strpos;
 833         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 834           do_other_anchored:
 835             {
 836                 char * const last = HOP3c(s, -start_shift, strbeg);
 837                 char *last1, *last2;
 838                 char * const saved_s = s;
 839                 SV* must;
 840
 841                 t = s - prog->check_offset_max;
 842                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 843                     && (!utf8_target
 844                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 845                             && t > strpos)))
 846                     NOOP;
 847                 else
 848                     t = strpos;
 849                 t = HOP3c(t, prog->anchored_offset, strend);
 850                 if (t < other_last)     /* These positions already checked */
 851                     t = other_last;
 852                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 853                 if (last < last1)
 854                     last1 = last;
 855                 /* XXXX It is not documented what units *_offsets are in.
 856                    We assume bytes, but this is clearly wrong.
 857                    Meaning this code needs to be carefully reviewed for errors.
 858                    dmq.
 859                   */
 860
 861                 /* On end-of-str: see comment below. */
 862                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 863                 if (must == &PL_sv_undef) {
 864                     s = (char*)NULL;
 865                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 866                 }
 867                 else
 868                     s = fbm_instr(
 869                         (unsigned char*)t,
 870                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 871                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 872                         must,
 873                         multiline ? FBMrf_MULTILINE : 0
 874                     );
 875                 DEBUG_EXECUTE_r({
 876                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 879                         (s ? "Found" : "Contradicts"),
 880                         quoted, RE_SV_TAIL(must));
 881                 });
 882
 883
 884                 if (!s) {
 885                     if (last1 >= last2) {
 886                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 887                                                 ", giving up...\n"));
 888                         goto fail_finish;
 889                     }
 890                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891                         ", trying floating at offset %ld...\n",
 892                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 893                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 894                     s = HOP3c(last, 1, strend);
 895                     goto restart;
 896                 }
 897                 else {
 898                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 899                           (long)(s - i_strpos)));
 900                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 901                     other_last = HOP3c(s, 1, strend);
 902                     s = saved_s;
 903                     if (t == strpos)
 904                         goto try_at_start;
 905                     goto try_at_offset;
 906                 }
 907             }
 908         }
 909         else {          /* Take into account the floating substring. */
 910             char *last, *last1;
 911             char * const saved_s = s;
 912             SV* must;
 913
 914             t = HOP3c(s, -start_shift, strbeg);
 915             last1 = last =
 916                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 917             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 918                 last = HOP3c(t, prog->float_max_offset, strend);
 919             s = HOP3c(t, prog->float_min_offset, strend);
 920             if (s < other_last)
 921                 s = other_last;
 922  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 923             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 924             /* fbm_instr() takes into account exact value of end-of-str
 925                if the check is SvTAIL(ed).  Since false positives are OK,
 926                and end-of-str is not later than strend we are OK. */
 927             if (must == &PL_sv_undef) {
 928                 s = (char*)NULL;
 929                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 930             }
 931             else
 932                 s = fbm_instr((unsigned char*)s,
 933                               (unsigned char*)last + SvCUR(must)
 934                                   - (SvTAIL(must)!=0),
 935                               must, multiline ? FBMrf_MULTILINE : 0);
 936             DEBUG_EXECUTE_r({
 937                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 938                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 939                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 940                     (s ? "Found" : "Contradicts"),
 941                     quoted, RE_SV_TAIL(must));
 942             });
 943             if (!s) {
 944                 if (last1 == last) {
 945                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 946                                             ", giving up...\n"));
 947                     goto fail_finish;
 948                 }
 949                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 950                     ", trying anchored starting at offset %ld...\n",
 951                     (long)(saved_s + 1 - i_strpos)));
 952                 other_last = last;
 953                 s = HOP3c(t, 1, strend);
 954                 goto restart;
 955             }
 956             else {
 957                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 958                       (long)(s - i_strpos)));
 959                 other_last = s; /* Fix this later. --Hugo */
 960                 s = saved_s;
 961                 if (t == strpos)
 962                     goto try_at_start;
 963                 goto try_at_offset;
 964             }
 965         }
 966     }
 967
 968
 969     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 970
 971     DEBUG_OPTIMISE_MORE_r(
 972         PerlIO_printf(Perl_debug_log,
 973             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 974             (IV)prog->check_offset_min,
 975             (IV)prog->check_offset_max,
 976             (IV)(s-strpos),
 977             (IV)(t-strpos),
 978             (IV)(t-s),
 979             (IV)(strend-strpos)
 980         )
 981     );
 982
 983     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 984         && (!utf8_target
 985             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 986                  && t > strpos)))
 987     {
 988         /* Fixed substring is found far enough so that the match
 989            cannot start at strpos. */
 990       try_at_offset:
 991         if (ml_anch && t[-1] != '\n') {
 992             /* Eventually fbm_*() should handle this, but often
 993                anchored_offset is not 0, so this check will not be wasted. */
 994             /* XXXX In the code below we prefer to look for "^" even in
 995                presence of anchored substrings.  And we search even
 996                beyond the found float position.  These pessimizations
 997                are historical artefacts only.  */
 998           find_anchor:
 999             while (t < strend - prog->minlen) {
1000                 if (*t == '\n') {
1001                     if (t < check_at - prog->check_offset_min) {
1002                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1003                             /* Since we moved from the found position,
1004                                we definitely contradict the found anchored
1005                                substr.  Due to the above check we do not
1006                                contradict "check" substr.
1007                                Thus we can arrive here only if check substr
1008                                is float.  Redo checking for "other"=="fixed".
1009                              */
1010                             strpos = t + 1;
1011                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1012                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1013                             goto do_other_anchored;
1014                         }
1015                         /* We don't contradict the found floating substring. */
1016                         /* XXXX Why not check for STCLASS? */
1017                         s = t + 1;
1018                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1019                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1020                         goto set_useful;
1021                     }
1022                     /* Position contradicts check-string */
1023                     /* XXXX probably better to look for check-string
1024                        than for "\n", so one should lower the limit for t? */
1025                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1026                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1027                     other_last = strpos = s = t + 1;
1028                     goto restart;
1029                 }
1030                 t++;
1031             }
1032             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1033                         PL_colors[0], PL_colors[1]));
1034             goto fail_finish;
1035         }
1036         else {
1037             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1038                         PL_colors[0], PL_colors[1]));
1039         }
1040         s = t;
1041       set_useful:
1042         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1043     }
1044     else {
1045         /* The found string does not prohibit matching at strpos,
1046            - no optimization of calling REx engine can be performed,
1047            unless it was an MBOL and we are not after MBOL,
1048            or a future STCLASS check will fail this. */
1049       try_at_start:
1050         /* Even in this situation we may use MBOL flag if strpos is offset
1051            wrt the start of the string. */
1052         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1053             /* May be due to an implicit anchor of m{.*foo}  */
1054             && !(prog->intflags & PREGf_IMPLICIT))
1055         {
1056             t = strpos;
1057             goto find_anchor;
1058         }
1059         DEBUG_EXECUTE_r( if (ml_anch)
1060             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1062         );
1063       success_at_start:
1064         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1065             && (utf8_target ? (
1066                 prog->check_utf8                /* Could be deleted already */
1067                 && --BmUSEFUL(prog->check_utf8) < 0
1068                 && (prog->check_utf8 == prog->float_utf8)
1069             ) : (
1070                 prog->check_substr              /* Could be deleted already */
1071                 && --BmUSEFUL(prog->check_substr) < 0
1072                 && (prog->check_substr == prog->float_substr)
1073             )))
1074         {
1075             /* If flags & SOMETHING - do not do it many times on the same match */
1076             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077             /* XXX Does the destruction order has to change with utf8_target? */
1078             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1081             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1082             check = NULL;                       /* abort */
1083             s = strpos;
1084             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086             if (prog->intflags & PREGf_IMPLICIT)
1087                 prog->extflags &= ~RXf_ANCH_MBOL;
1088             /* XXXX This is a remnant of the old implementation.  It
1089                     looks wasteful, since now INTUIT can use many
1090                     other heuristics. */
1091             prog->extflags &= ~RXf_USE_INTUIT;
1092             /* XXXX What other flags might need to be cleared in this branch? */
1093         }
1094         else
1095             s = strpos;
1096     }
1097
1098     /* Last resort... */
1099     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100     /* trie stclasses are too expensive to use here, we are better off to
1101        leave it to regmatch itself */
1102     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103         /* minlen == 0 is possible if regstclass is \b or \B,
1104            and the fixed substr is ''$.
1105            Since minlen is already taken into account, s+1 is before strend;
1106            accidentally, minlen >= 1 guaranties no false positives at s + 1
1107            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1108            regstclass does not come from lookahead...  */
1109         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1111         const U8* const str = (U8*)STRING(progi->regstclass);
1112         /* XXX this value could be pre-computed */
1113         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1114                     ?  (reginfo->is_utf8_pat
1115                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1116                         : STR_LEN(progi->regstclass))
1117                     : 1);
1118         char * endpos;
1119         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1120             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1121         else if (prog->float_substr || prog->float_utf8)
1122             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123         else
1124             endpos= strend;
1125
1126         if (checked_upto < s)
1127            checked_upto = s;
1128         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1129                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1130
1131         t = s;
1132         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1133                             reginfo);
1134         if (s) {
1135             checked_upto = s;
1136         } else {
1137 #ifdef DEBUGGING
1138             const char *what = NULL;
1139 #endif
1140             if (endpos == strend) {
1141                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                                 "Could not match STCLASS...\n") );
1143                 goto fail;
1144             }
1145             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146                                    "This position contradicts STCLASS...\n") );
1147             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1148                 goto fail;
1149             checked_upto = HOPBACKc(endpos, start_shift);
1150             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1151                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1152             /* Contradict one of substrings */
1153             if (prog->anchored_substr || prog->anchored_utf8) {
1154                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1155                     DEBUG_EXECUTE_r( what = "anchored" );
1156                   hop_and_restart:
1157                     s = HOP3c(t, 1, strend);
1158                     if (s + start_shift + end_shift > strend) {
1159                         /* XXXX Should be taken into account earlier? */
1160                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1161                                                "Could not match STCLASS...\n") );
1162                         goto fail;
1163                     }
1164                     if (!check)
1165                         goto giveup;
1166                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1167                                 "Looking for %s substr starting at offset %ld...\n",
1168                                  what, (long)(s + start_shift - i_strpos)) );
1169                     goto restart;
1170                 }
1171                 /* Have both, check_string is floating */
1172                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1173                     goto retry_floating_check;
1174                 /* Recheck anchored substring, but not floating... */
1175                 s = check_at;
1176                 if (!check)
1177                     goto giveup;
1178                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1179                           "Looking for anchored substr starting at offset %ld...\n",
1180                           (long)(other_last - i_strpos)) );
1181                 goto do_other_anchored;
1182             }
1183             /* Another way we could have checked stclass at the
1184                current position only: */
1185             if (ml_anch) {
1186                 s = t = t + 1;
1187                 if (!check)
1188                     goto giveup;
1189                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1190                           "Looking for /%s^%s/m starting at offset %ld...\n",
1191                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1192                 goto try_at_offset;
1193             }
1194             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1195                 goto fail;
1196             /* Check is floating substring. */
1197           retry_floating_check:
1198             t = check_at - start_shift;
1199             DEBUG_EXECUTE_r( what = "floating" );
1200             goto hop_and_restart;
1201         }
1202         if (t != s) {
1203             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204                         "By STCLASS: moving %ld --> %ld\n",
1205                                   (long)(t - i_strpos), (long)(s - i_strpos))
1206                    );
1207         }
1208         else {
1209             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1210                                   "Does not contradict STCLASS...\n");
1211                    );
1212         }
1213     }
1214   giveup:
1215     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1216                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1217                           PL_colors[5], (long)(s - i_strpos)) );
1218     return s;
1219
1220   fail_finish:                          /* Substring not found */
1221     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1222         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1223   fail:
1224     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1225                           PL_colors[4], PL_colors[5]));
1226     return NULL;
1227 }
1228
1229 #define DECL_TRIE_TYPE(scan) \
1230     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1231                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1232                     trie_type = ((scan->flags == EXACT) \
1233                               ? (utf8_target ? trie_utf8 : trie_plain) \
1234                               : (scan->flags == EXACTFA) \
1235                                 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1236                                 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1237
1238 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1239 STMT_START {                               \
1240     STRLEN skiplen;                                                                 \
1241     U8 flags = FOLD_FLAGS_FULL; \
1242     switch (trie_type) {                                                            \
1243     case trie_utf8_exactfa_fold:                                                            \
1244         flags |= FOLD_FLAGS_NOMIX_ASCII; \
1245         /* FALL THROUGH */ \
1246     case trie_utf8_fold:                                                            \
1247         if ( foldlen>0 ) {                                                          \
1248             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249             foldlen -= len;                                                         \
1250             uscan += len;                                                           \
1251             len=0;                                                                  \
1252         } else {                                                                    \
1253             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL);                \
1254             len = UTF8SKIP(uc);                                                     \
1255             skiplen = UNISKIP( uvc );                                               \
1256             foldlen -= skiplen;                                                     \
1257             uscan = foldbuf + skiplen;                                              \
1258         }                                                                           \
1259         break;                                                                      \
1260     case trie_latin_utf8_exactfa_fold: \
1261         flags |= FOLD_FLAGS_NOMIX_ASCII; \
1262         /* FALL THROUGH */ \
1263     case trie_latin_utf8_fold:                                                      \
1264         if ( foldlen>0 ) {                                                          \
1265             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1266             foldlen -= len;                                                         \
1267             uscan += len;                                                           \
1268             len=0;                                                                  \
1269         } else {                                                                    \
1270             len = 1;                                                                \
1271             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);   \
1272             skiplen = UNISKIP( uvc );                                               \
1273             foldlen -= skiplen;                                                     \
1274             uscan = foldbuf + skiplen;                                              \
1275         }                                                                           \
1276         break;                                                                      \
1277     case trie_utf8:                                                                 \
1278         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1279         break;                                                                      \
1280     case trie_plain:                                                                \
1281         uvc = (UV)*uc;                                                              \
1282         len = 1;                                                                    \
1283     }                                                                               \
1284     if (uvc < 256) {                                                                \
1285         charid = trie->charmap[ uvc ];                                              \
1286     }                                                                               \
1287     else {                                                                          \
1288         charid = 0;                                                                 \
1289         if (widecharmap) {                                                          \
1290             SV** const svpp = hv_fetch(widecharmap,                                 \
1291                         (char*)&uvc, sizeof(UV), 0);                                \
1292             if (svpp)                                                               \
1293                 charid = (U16)SvIV(*svpp);                                          \
1294         }                                                                           \
1295     }                                                                               \
1296 } STMT_END
1297
1298 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1299 STMT_START {                                              \
1300     while (s <= e) {                                      \
1301         if ( (CoNd)                                       \
1302              && (ln == 1 || folder(s, pat_string, ln))    \
1303              && (reginfo->intuit || regtry(reginfo, &s)) )\
1304             goto got_it;                                  \
1305         s++;                                              \
1306     }                                                     \
1307 } STMT_END
1308
1309 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1310 STMT_START {                                          \
1311     while (s < strend) {                              \
1312         CoDe                                          \
1313         s += UTF8SKIP(s);                             \
1314     }                                                 \
1315 } STMT_END
1316
1317 #define REXEC_FBC_SCAN(CoDe)                          \
1318 STMT_START {                                          \
1319     while (s < strend) {                              \
1320         CoDe                                          \
1321         s++;                                          \
1322     }                                                 \
1323 } STMT_END
1324
1325 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1326 REXEC_FBC_UTF8_SCAN(                                  \
1327     if (CoNd) {                                       \
1328         if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1329             goto got_it;                              \
1330         else                                          \
1331             tmp = doevery;                            \
1332     }                                                 \
1333     else                                              \
1334         tmp = 1;                                      \
1335 )
1336
1337 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1338 REXEC_FBC_SCAN(                                       \
1339     if (CoNd) {                                       \
1340         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1341             goto got_it;                              \
1342         else                                          \
1343             tmp = doevery;                            \
1344     }                                                 \
1345     else                                              \
1346         tmp = 1;                                      \
1347 )
1348
1349 #define REXEC_FBC_TRYIT               \
1350 if ((reginfo->intuit || regtry(reginfo, &s))) \
1351     goto got_it
1352
1353 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1354     if (utf8_target) {                                             \
1355         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1356     }                                                          \
1357     else {                                                     \
1358         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1359     }
1360
1361 #define DUMP_EXEC_POS(li,s,doutf8) \
1362     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1363                 startpos, doutf8)
1364
1365
1366 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1367         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1368         tmp = TEST_NON_UTF8(tmp);                                              \
1369         REXEC_FBC_UTF8_SCAN(                                                   \
1370             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1371                 tmp = !tmp;                                                    \
1372                 IF_SUCCESS;                                                    \
1373             }                                                                  \
1374             else {                                                             \
1375                 IF_FAIL;                                                       \
1376             }                                                                  \
1377         );                                                                     \
1378
1379 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1380         if (s == reginfo->strbeg) {                                            \
1381             tmp = '\n';                                                        \
1382         }                                                                      \
1383         else {                                                                 \
1384             U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1385             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1386         }                                                                      \
1387         tmp = TeSt1_UtF8;                                                      \
1388         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1389         REXEC_FBC_UTF8_SCAN(                                                   \
1390             if (tmp == ! (TeSt2_UtF8)) { \
1391                 tmp = !tmp;                                                    \
1392                 IF_SUCCESS;                                                    \
1393             }                                                                  \
1394             else {                                                             \
1395                 IF_FAIL;                                                       \
1396             }                                                                  \
1397         );                                                                     \
1398
1399 /* The only difference between the BOUND and NBOUND cases is that
1400  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1401  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1402  * with the other one being empty */
1403 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1405
1406 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1407     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1408
1409 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1410     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1411
1412 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1413     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1414
1415
1416 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1417  * be passed in completely with the variable name being tested, which isn't
1418  * such a clean interface, but this is easier to read than it was before.  We
1419  * are looking for the boundary (or non-boundary between a word and non-word
1420  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1421  * must be different.  Find the "wordness" of the character just prior to this
1422  * one, and compare it with the wordness of this one.  If they differ, we have
1423  * a boundary.  At the beginning of the string, pretend that the previous
1424  * character was a new-line */
1425 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1426     if (utf8_target) {                                                         \
1427                 UTF8_CODE \
1428     }                                                                          \
1429     else {  /* Not utf8 */                                                     \
1430         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1431         tmp = TEST_NON_UTF8(tmp);                                              \
1432         REXEC_FBC_SCAN(                                                        \
1433             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1434                 tmp = !tmp;                                                    \
1435                 IF_SUCCESS;                                                    \
1436             }                                                                  \
1437             else {                                                             \
1438                 IF_FAIL;                                                       \
1439             }                                                                  \
1440         );                                                                     \
1441     }                                                                          \
1442     if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))           \
1443         goto got_it;
1444
1445 /* We know what class REx starts with.  Try to find this position... */
1446 /* if reginfo->intuit, its a dryrun */
1447 /* annoyingly all the vars in this routine have different names from their counterparts
1448    in regmatch. /grrr */
1449
1450 STATIC char *
1451 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1452     const char *strend, regmatch_info *reginfo)
1453 {
1454     dVAR;
1455     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1456     char *pat_string;   /* The pattern's exactish string */
1457     char *pat_end;          /* ptr to end char of pat_string */
1458     re_fold_t folder;   /* Function for computing non-utf8 folds */
1459     const U8 *fold_array;   /* array for folding ords < 256 */
1460     STRLEN ln;
1461     STRLEN lnc;
1462     U8 c1;
1463     U8 c2;
1464     char *e;
1465     I32 tmp = 1;        /* Scratch variable? */
1466     const bool utf8_target = reginfo->is_utf8_target;
1467     UV utf8_fold_flags = 0;
1468     const bool is_utf8_pat = reginfo->is_utf8_pat;
1469     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1470                                    with a result inverts that result, as 0^1 =
1471                                    1 and 1^1 = 0 */
1472     _char_class_number classnum;
1473
1474     RXi_GET_DECL(prog,progi);
1475
1476     PERL_ARGS_ASSERT_FIND_BYCLASS;
1477
1478     /* We know what class it must start with. */
1479     switch (OP(c)) {
1480     case ANYOF:
1481     case ANYOF_SYNTHETIC:
1482     case ANYOF_WARN_SUPER:
1483         if (utf8_target) {
1484             REXEC_FBC_UTF8_CLASS_SCAN(
1485                       reginclass(prog, c, (U8*)s, utf8_target));
1486         }
1487         else {
1488             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1489         }
1490         break;
1491     case CANY:
1492         REXEC_FBC_SCAN(
1493             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1494                 goto got_it;
1495             else
1496                 tmp = doevery;
1497         );
1498         break;
1499
1500     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1501         assert(! is_utf8_pat);
1502         /* FALL THROUGH */
1503     case EXACTFA:
1504         if (is_utf8_pat || utf8_target) {
1505             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1506             goto do_exactf_utf8;
1507         }
1508         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1509         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1510         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1511
1512     case EXACTF:   /* This node only generated for non-utf8 patterns */
1513         assert(! is_utf8_pat);
1514         if (utf8_target) {
1515             utf8_fold_flags = 0;
1516             goto do_exactf_utf8;
1517         }
1518         fold_array = PL_fold;
1519         folder = foldEQ;
1520         goto do_exactf_non_utf8;
1521
1522     case EXACTFL:
1523         if (is_utf8_pat || utf8_target) {
1524             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1525             goto do_exactf_utf8;
1526         }
1527         fold_array = PL_fold_locale;
1528         folder = foldEQ_locale;
1529         goto do_exactf_non_utf8;
1530
1531     case EXACTFU_SS:
1532         if (is_utf8_pat) {
1533             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1534         }
1535         goto do_exactf_utf8;
1536
1537     case EXACTFU:
1538         if (is_utf8_pat || utf8_target) {
1539             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1540             goto do_exactf_utf8;
1541         }
1542
1543         /* Any 'ss' in the pattern should have been replaced by regcomp,
1544          * so we don't have to worry here about this single special case
1545          * in the Latin1 range */
1546         fold_array = PL_fold_latin1;
1547         folder = foldEQ_latin1;
1548
1549         /* FALL THROUGH */
1550
1551     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1552                            are no glitches with fold-length differences
1553                            between the target string and pattern */
1554
1555         /* The idea in the non-utf8 EXACTF* cases is to first find the
1556          * first character of the EXACTF* node and then, if necessary,
1557          * case-insensitively compare the full text of the node.  c1 is the
1558          * first character.  c2 is its fold.  This logic will not work for
1559          * Unicode semantics and the german sharp ss, which hence should
1560          * not be compiled into a node that gets here. */
1561         pat_string = STRING(c);
1562         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1563
1564         /* We know that we have to match at least 'ln' bytes (which is the
1565          * same as characters, since not utf8).  If we have to match 3
1566          * characters, and there are only 2 availabe, we know without
1567          * trying that it will fail; so don't start a match past the
1568          * required minimum number from the far end */
1569         e = HOP3c(strend, -((SSize_t)ln), s);
1570
1571         if (reginfo->intuit && e < s) {
1572             e = s;                      /* Due to minlen logic of intuit() */
1573         }
1574
1575         c1 = *pat_string;
1576         c2 = fold_array[c1];
1577         if (c1 == c2) { /* If char and fold are the same */
1578             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1579         }
1580         else {
1581             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1582         }
1583         break;
1584
1585     do_exactf_utf8:
1586     {
1587         unsigned expansion;
1588
1589         /* If one of the operands is in utf8, we can't use the simpler folding
1590          * above, due to the fact that many different characters can have the
1591          * same fold, or portion of a fold, or different- length fold */
1592         pat_string = STRING(c);
1593         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1594         pat_end = pat_string + ln;
1595         lnc = is_utf8_pat       /* length to match in characters */
1596                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1597                 : ln;
1598
1599         /* We have 'lnc' characters to match in the pattern, but because of
1600          * multi-character folding, each character in the target can match
1601          * up to 3 characters (Unicode guarantees it will never exceed
1602          * this) if it is utf8-encoded; and up to 2 if not (based on the
1603          * fact that the Latin 1 folds are already determined, and the
1604          * only multi-char fold in that range is the sharp-s folding to
1605          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1606          * string character.  Adjust lnc accordingly, rounding up, so that
1607          * if we need to match at least 4+1/3 chars, that really is 5. */
1608         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1609         lnc = (lnc + expansion - 1) / expansion;
1610
1611         /* As in the non-UTF8 case, if we have to match 3 characters, and
1612          * only 2 are left, it's guaranteed to fail, so don't start a
1613          * match that would require us to go beyond the end of the string
1614          */
1615         e = HOP3c(strend, -((SSize_t)lnc), s);
1616
1617         if (reginfo->intuit && e < s) {
1618             e = s;                      /* Due to minlen logic of intuit() */
1619         }
1620
1621         /* XXX Note that we could recalculate e to stop the loop earlier,
1622          * as the worst case expansion above will rarely be met, and as we
1623          * go along we would usually find that e moves further to the left.
1624          * This would happen only after we reached the point in the loop
1625          * where if there were no expansion we should fail.  Unclear if
1626          * worth the expense */
1627
1628         while (s <= e) {
1629             char *my_strend= (char *)strend;
1630             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1631                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1632                 && (reginfo->intuit || regtry(reginfo, &s)) )
1633             {
1634                 goto got_it;
1635             }
1636             s += (utf8_target) ? UTF8SKIP(s) : 1;
1637         }
1638         break;
1639     }
1640     case BOUNDL:
1641         RXp_MATCH_TAINTED_on(prog);
1642         FBC_BOUND(isWORDCHAR_LC,
1643                   isWORDCHAR_LC_uvchr(tmp),
1644                   isWORDCHAR_LC_utf8((U8*)s));
1645         break;
1646     case NBOUNDL:
1647         RXp_MATCH_TAINTED_on(prog);
1648         FBC_NBOUND(isWORDCHAR_LC,
1649                    isWORDCHAR_LC_uvchr(tmp),
1650                    isWORDCHAR_LC_utf8((U8*)s));
1651         break;
1652     case BOUND:
1653         FBC_BOUND(isWORDCHAR,
1654                   isWORDCHAR_uni(tmp),
1655                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656         break;
1657     case BOUNDA:
1658         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1659                          isWORDCHAR_A(tmp),
1660                          isWORDCHAR_A((U8*)s));
1661         break;
1662     case NBOUND:
1663         FBC_NBOUND(isWORDCHAR,
1664                    isWORDCHAR_uni(tmp),
1665                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666         break;
1667     case NBOUNDA:
1668         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1669                           isWORDCHAR_A(tmp),
1670                           isWORDCHAR_A((U8*)s));
1671         break;
1672     case BOUNDU:
1673         FBC_BOUND(isWORDCHAR_L1,
1674                   isWORDCHAR_uni(tmp),
1675                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1676         break;
1677     case NBOUNDU:
1678         FBC_NBOUND(isWORDCHAR_L1,
1679                    isWORDCHAR_uni(tmp),
1680                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1681         break;
1682     case LNBREAK:
1683         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1684                         is_LNBREAK_latin1_safe(s, strend)
1685         );
1686         break;
1687
1688     /* The argument to all the POSIX node types is the class number to pass to
1689      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1690
1691     case NPOSIXL:
1692         to_complement = 1;
1693         /* FALLTHROUGH */
1694
1695     case POSIXL:
1696         RXp_MATCH_TAINTED_on(prog);
1697         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1698                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1699         break;
1700
1701     case NPOSIXD:
1702         to_complement = 1;
1703         /* FALLTHROUGH */
1704
1705     case POSIXD:
1706         if (utf8_target) {
1707             goto posix_utf8;
1708         }
1709         goto posixa;
1710
1711     case NPOSIXA:
1712         if (utf8_target) {
1713             /* The complement of something that matches only ASCII matches all
1714              * UTF-8 variant code points, plus everything in ASCII that isn't
1715              * in the class */
1716             REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1717                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1718             break;
1719         }
1720
1721         to_complement = 1;
1722         /* FALLTHROUGH */
1723
1724     case POSIXA:
1725       posixa:
1726         /* Don't need to worry about utf8, as it can match only a single
1727          * byte invariant character. */
1728         REXEC_FBC_CLASS_SCAN(
1729                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1730         break;
1731
1732     case NPOSIXU:
1733         to_complement = 1;
1734         /* FALLTHROUGH */
1735
1736     case POSIXU:
1737         if (! utf8_target) {
1738             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1739                                                                     FLAGS(c))));
1740         }
1741         else {
1742
1743       posix_utf8:
1744             classnum = (_char_class_number) FLAGS(c);
1745             if (classnum < _FIRST_NON_SWASH_CC) {
1746                 while (s < strend) {
1747
1748                     /* We avoid loading in the swash as long as possible, but
1749                      * should we have to, we jump to a separate loop.  This
1750                      * extra 'if' statement is what keeps this code from being
1751                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1752                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1753                         goto found_above_latin1;
1754                     }
1755                     if ((UTF8_IS_INVARIANT(*s)
1756                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1757                                                                 classnum)))
1758                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1759                             && to_complement ^ cBOOL(
1760                                 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1761                                                                       *(s + 1)),
1762                                               classnum))))
1763                     {
1764                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1765                             goto got_it;
1766                         else {
1767                             tmp = doevery;
1768                         }
1769                     }
1770                     else {
1771                         tmp = 1;
1772                     }
1773                     s += UTF8SKIP(s);
1774                 }
1775             }
1776             else switch (classnum) {    /* These classes are implemented as
1777                                            macros */
1778                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1779                                         revert the change of \v matching this */
1780                     /* FALL THROUGH */
1781
1782                 case _CC_ENUM_PSXSPC:
1783                     REXEC_FBC_UTF8_CLASS_SCAN(
1784                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
1785                     break;
1786
1787                 case _CC_ENUM_BLANK:
1788                     REXEC_FBC_UTF8_CLASS_SCAN(
1789                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
1790                     break;
1791
1792                 case _CC_ENUM_XDIGIT:
1793                     REXEC_FBC_UTF8_CLASS_SCAN(
1794                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1795                     break;
1796
1797                 case _CC_ENUM_VERTSPACE:
1798                     REXEC_FBC_UTF8_CLASS_SCAN(
1799                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
1800                     break;
1801
1802                 case _CC_ENUM_CNTRL:
1803                     REXEC_FBC_UTF8_CLASS_SCAN(
1804                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
1805                     break;
1806
1807                 default:
1808                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1809                     assert(0); /* NOTREACHED */
1810             }
1811         }
1812         break;
1813
1814       found_above_latin1:   /* Here we have to load a swash to get the result
1815                                for the current code point */
1816         if (! PL_utf8_swash_ptrs[classnum]) {
1817             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1818             PL_utf8_swash_ptrs[classnum] =
1819                     _core_swash_init("utf8", swash_property_names[classnum],
1820                                      &PL_sv_undef, 1, 0, NULL, &flags);
1821         }
1822
1823         /* This is a copy of the loop above for swash classes, though using the
1824          * FBC macro instead of being expanded out.  Since we've loaded the
1825          * swash, we don't have to check for that each time through the loop */
1826         REXEC_FBC_UTF8_CLASS_SCAN(
1827                 to_complement ^ cBOOL(_generic_utf8(
1828                                       classnum,
1829                                       s,
1830                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
1831                                                   (U8 *) s, TRUE))));
1832         break;
1833
1834     case AHOCORASICKC:
1835     case AHOCORASICK:
1836         {
1837             DECL_TRIE_TYPE(c);
1838             /* what trie are we using right now */
1839             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1840             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1841             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1842
1843             const char *last_start = strend - trie->minlen;
1844 #ifdef DEBUGGING
1845             const char *real_start = s;
1846 #endif
1847             STRLEN maxlen = trie->maxlen;
1848             SV *sv_points;
1849             U8 **points; /* map of where we were in the input string
1850                             when reading a given char. For ASCII this
1851                             is unnecessary overhead as the relationship
1852                             is always 1:1, but for Unicode, especially
1853                             case folded Unicode this is not true. */
1854             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1855             U8 *bitmap=NULL;
1856
1857
1858             GET_RE_DEBUG_FLAGS_DECL;
1859
1860             /* We can't just allocate points here. We need to wrap it in
1861              * an SV so it gets freed properly if there is a croak while
1862              * running the match */
1863             ENTER;
1864             SAVETMPS;
1865             sv_points=newSV(maxlen * sizeof(U8 *));
1866             SvCUR_set(sv_points,
1867                 maxlen * sizeof(U8 *));
1868             SvPOK_on(sv_points);
1869             sv_2mortal(sv_points);
1870             points=(U8**)SvPV_nolen(sv_points );
1871             if ( trie_type != trie_utf8_fold
1872                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1873             {
1874                 if (trie->bitmap)
1875                     bitmap=(U8*)trie->bitmap;
1876                 else
1877                     bitmap=(U8*)ANYOF_BITMAP(c);
1878             }
1879             /* this is the Aho-Corasick algorithm modified a touch
1880                to include special handling for long "unknown char" sequences.
1881                The basic idea being that we use AC as long as we are dealing
1882                with a possible matching char, when we encounter an unknown char
1883                (and we have not encountered an accepting state) we scan forward
1884                until we find a legal starting char.
1885                AC matching is basically that of trie matching, except that when
1886                we encounter a failing transition, we fall back to the current
1887                states "fail state", and try the current char again, a process
1888                we repeat until we reach the root state, state 1, or a legal
1889                transition. If we fail on the root state then we can either
1890                terminate if we have reached an accepting state previously, or
1891                restart the entire process from the beginning if we have not.
1892
1893              */
1894             while (s <= last_start) {
1895                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1896                 U8 *uc = (U8*)s;
1897                 U16 charid = 0;
1898                 U32 base = 1;
1899                 U32 state = 1;
1900                 UV uvc = 0;
1901                 STRLEN len = 0;
1902                 STRLEN foldlen = 0;
1903                 U8 *uscan = (U8*)NULL;
1904                 U8 *leftmost = NULL;
1905 #ifdef DEBUGGING
1906                 U32 accepted_word= 0;
1907 #endif
1908                 U32 pointpos = 0;
1909
1910                 while ( state && uc <= (U8*)strend ) {
1911                     int failed=0;
1912                     U32 word = aho->states[ state ].wordnum;
1913
1914                     if( state==1 ) {
1915                         if ( bitmap ) {
1916                             DEBUG_TRIE_EXECUTE_r(
1917                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1919                                         (char *)uc, utf8_target );
1920                                     PerlIO_printf( Perl_debug_log,
1921                                         " Scanning for legal start char...\n");
1922                                 }
1923                             );
1924                             if (utf8_target) {
1925                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926                                     uc += UTF8SKIP(uc);
1927                                 }
1928                             } else {
1929                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1930                                     uc++;
1931                                 }
1932                             }
1933                             s= (char *)uc;
1934                         }
1935                         if (uc >(U8*)last_start) break;
1936                     }
1937
1938                     if ( word ) {
1939                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1940                         if (!leftmost || lpos < leftmost) {
1941                             DEBUG_r(accepted_word=word);
1942                             leftmost= lpos;
1943                         }
1944                         if (base==0) break;
1945
1946                     }
1947                     points[pointpos++ % maxlen]= uc;
1948                     if (foldlen || uc < (U8*)strend) {
1949                         REXEC_TRIE_READ_CHAR(trie_type, trie,
1950                                          widecharmap, uc,
1951                                          uscan, len, uvc, charid, foldlen,
1952                                          foldbuf, uniflags);
1953                         DEBUG_TRIE_EXECUTE_r({
1954                             dump_exec_pos( (char *)uc, c, strend,
1955                                         real_start, s, utf8_target);
1956                             PerlIO_printf(Perl_debug_log,
1957                                 " Charid:%3u CP:%4"UVxf" ",
1958                                  charid, uvc);
1959                         });
1960                     }
1961                     else {
1962                         len = 0;
1963                         charid = 0;
1964                     }
1965
1966
1967                     do {
1968 #ifdef DEBUGGING
1969                         word = aho->states[ state ].wordnum;
1970 #endif
1971                         base = aho->states[ state ].trans.base;
1972
1973                         DEBUG_TRIE_EXECUTE_r({
1974                             if (failed)
1975                                 dump_exec_pos( (char *)uc, c, strend, real_start,
1976                                     s,   utf8_target );
1977                             PerlIO_printf( Perl_debug_log,
1978                                 "%sState: %4"UVxf", word=%"UVxf,
1979                                 failed ? " Fail transition to " : "",
1980                                 (UV)state, (UV)word);
1981                         });
1982                         if ( base ) {
1983                             U32 tmp;
1984                             I32 offset;
1985                             if (charid &&
1986                                  ( ((offset = base + charid
1987                                     - 1 - trie->uniquecharcount)) >= 0)
1988                                  && ((U32)offset < trie->lasttrans)
1989                                  && trie->trans[offset].check == state
1990                                  && (tmp=trie->trans[offset].next))
1991                             {
1992                                 DEBUG_TRIE_EXECUTE_r(
1993                                     PerlIO_printf( Perl_debug_log," - legal\n"));
1994                                 state = tmp;
1995                                 break;
1996                             }
1997                             else {
1998                                 DEBUG_TRIE_EXECUTE_r(
1999                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2000                                 failed = 1;
2001                                 state = aho->fail[state];
2002                             }
2003                         }
2004                         else {
2005                             /* we must be accepting here */
2006                             DEBUG_TRIE_EXECUTE_r(
2007                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2008                             failed = 1;
2009                             break;
2010                         }
2011                     } while(state);
2012                     uc += len;
2013                     if (failed) {
2014                         if (leftmost)
2015                             break;
2016                         if (!state) state = 1;
2017                     }
2018                 }
2019                 if ( aho->states[ state ].wordnum ) {
2020                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2021                     if (!leftmost || lpos < leftmost) {
2022                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2023                         leftmost = lpos;
2024                     }
2025                 }
2026                 if (leftmost) {
2027                     s = (char*)leftmost;
2028                     DEBUG_TRIE_EXECUTE_r({
2029                         PerlIO_printf(
2030                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2031                             (UV)accepted_word, (IV)(s - real_start)
2032                         );
2033                     });
2034                     if (reginfo->intuit || regtry(reginfo, &s)) {
2035                         FREETMPS;
2036                         LEAVE;
2037                         goto got_it;
2038                     }
2039                     s = HOPc(s,1);
2040                     DEBUG_TRIE_EXECUTE_r({
2041                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2042                     });
2043                 } else {
2044                     DEBUG_TRIE_EXECUTE_r(
2045                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2046                     break;
2047                 }
2048             }
2049             FREETMPS;
2050             LEAVE;
2051         }
2052         break;
2053     default:
2054         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2055         break;
2056     }
2057     return 0;
2058   got_it:
2059     return s;
2060 }
2061
2062 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2063  * flags have same meanings as with regexec_flags() */
2064
2065 static void
2066 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2067                             char *strbeg,
2068                             char *strend,
2069                             SV *sv,
2070                             U32 flags,
2071                             bool utf8_target)
2072 {
2073     struct regexp *const prog = ReANY(rx);
2074
2075     if (flags & REXEC_COPY_STR) {
2076 #ifdef PERL_ANY_COW
2077         if (SvCANCOW(sv)) {
2078             if (DEBUG_C_TEST) {
2079                 PerlIO_printf(Perl_debug_log,
2080                               "Copy on write: regexp capture, type %d\n",
2081                               (int) SvTYPE(sv));
2082             }
2083             /* Create a new COW SV to share the match string and store
2084              * in saved_copy, unless the current COW SV in saved_copy
2085              * is valid and suitable for our purpose */
2086             if ((   prog->saved_copy
2087                  && SvIsCOW(prog->saved_copy)
2088                  && SvPOKp(prog->saved_copy)
2089                  && SvIsCOW(sv)
2090                  && SvPOKp(sv)
2091                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2092             {
2093                 /* just reuse saved_copy SV */
2094                 if (RXp_MATCH_COPIED(prog)) {
2095                     Safefree(prog->subbeg);
2096                     RXp_MATCH_COPIED_off(prog);
2097                 }
2098             }
2099             else {
2100                 /* create new COW SV to share string */
2101                 RX_MATCH_COPY_FREE(rx);
2102                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2103             }
2104             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2105             assert (SvPOKp(prog->saved_copy));
2106             prog->sublen  = strend - strbeg;
2107             prog->suboffset = 0;
2108             prog->subcoffset = 0;
2109         } else
2110 #endif
2111         {
2112             SSize_t min = 0;
2113             SSize_t max = strend - strbeg;
2114             SSize_t sublen;
2115
2116             if (    (flags & REXEC_COPY_SKIP_POST)
2117                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2118                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2119             ) { /* don't copy $' part of string */
2120                 U32 n = 0;
2121                 max = -1;
2122                 /* calculate the right-most part of the string covered
2123                  * by a capture. Due to look-ahead, this may be to
2124                  * the right of $&, so we have to scan all captures */
2125                 while (n <= prog->lastparen) {
2126                     if (prog->offs[n].end > max)
2127                         max = prog->offs[n].end;
2128                     n++;
2129                 }
2130                 if (max == -1)
2131                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2132                             ? prog->offs[0].start
2133                             : 0;
2134                 assert(max >= 0 && max <= strend - strbeg);
2135             }
2136
2137             if (    (flags & REXEC_COPY_SKIP_PRE)
2138                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2139                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2140             ) { /* don't copy $` part of string */
2141                 U32 n = 0;
2142                 min = max;
2143                 /* calculate the left-most part of the string covered
2144                  * by a capture. Due to look-behind, this may be to
2145                  * the left of $&, so we have to scan all captures */
2146                 while (min && n <= prog->lastparen) {
2147                     if (   prog->offs[n].start != -1
2148                         && prog->offs[n].start < min)
2149                     {
2150                         min = prog->offs[n].start;
2151                     }
2152                     n++;
2153                 }
2154                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2155                     && min >  prog->offs[0].end
2156                 )
2157                     min = prog->offs[0].end;
2158
2159             }
2160
2161             assert(min >= 0 && min <= max && min <= strend - strbeg);
2162             sublen = max - min;
2163
2164             if (RX_MATCH_COPIED(rx)) {
2165                 if (sublen > prog->sublen)
2166                     prog->subbeg =
2167                             (char*)saferealloc(prog->subbeg, sublen+1);
2168             }
2169             else
2170                 prog->subbeg = (char*)safemalloc(sublen+1);
2171             Copy(strbeg + min, prog->subbeg, sublen, char);
2172             prog->subbeg[sublen] = '\0';
2173             prog->suboffset = min;
2174             prog->sublen = sublen;
2175             RX_MATCH_COPIED_on(rx);
2176         }
2177         prog->subcoffset = prog->suboffset;
2178         if (prog->suboffset && utf8_target) {
2179             /* Convert byte offset to chars.
2180              * XXX ideally should only compute this if @-/@+
2181              * has been seen, a la PL_sawampersand ??? */
2182
2183             /* If there's a direct correspondence between the
2184              * string which we're matching and the original SV,
2185              * then we can use the utf8 len cache associated with
2186              * the SV. In particular, it means that under //g,
2187              * sv_pos_b2u() will use the previously cached
2188              * position to speed up working out the new length of
2189              * subcoffset, rather than counting from the start of
2190              * the string each time. This stops
2191              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2192              * from going quadratic */
2193             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2194                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2195                                                 SV_GMAGIC|SV_CONST_RETURN);
2196             else
2197                 prog->subcoffset = utf8_length((U8*)strbeg,
2198                                     (U8*)(strbeg+prog->suboffset));
2199         }
2200     }
2201     else {
2202         RX_MATCH_COPY_FREE(rx);
2203         prog->subbeg = strbeg;
2204         prog->suboffset = 0;
2205         prog->subcoffset = 0;
2206         prog->sublen = strend - strbeg;
2207     }
2208 }
2209
2210
2211
2212
2213 /*
2214  - regexec_flags - match a regexp against a string
2215  */
2216 I32
2217 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2218               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2219 /* stringarg: the point in the string at which to begin matching */
2220 /* strend:    pointer to null at end of string */
2221 /* strbeg:    real beginning of string */
2222 /* minend:    end of match must be >= minend bytes after stringarg. */
2223 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2224  *            itself is accessed via the pointers above */
2225 /* data:      May be used for some additional optimizations.
2226               Currently unused. */
2227 /* flags:     For optimizations. See REXEC_* in regexp.h */
2228
2229 {
2230     dVAR;
2231     struct regexp *const prog = ReANY(rx);
2232     char *s;
2233     regnode *c;
2234     char *startpos;
2235     SSize_t minlen;             /* must match at least this many chars */
2236     SSize_t dontbother = 0;     /* how many characters not to try at end */
2237     const bool utf8_target = cBOOL(DO_UTF8(sv));
2238     I32 multiline;
2239     RXi_GET_DECL(prog,progi);
2240     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2241     regmatch_info *const reginfo = &reginfo_buf;
2242     regexp_paren_pair *swap = NULL;
2243     I32 oldsave;
2244     GET_RE_DEBUG_FLAGS_DECL;
2245
2246     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2247     PERL_UNUSED_ARG(data);
2248
2249     /* Be paranoid... */
2250     if (prog == NULL || stringarg == NULL) {
2251         Perl_croak(aTHX_ "NULL regexp parameter");
2252         return 0;
2253     }
2254
2255     DEBUG_EXECUTE_r(
2256         debug_start_match(rx, utf8_target, stringarg, strend,
2257         "Matching");
2258     );
2259
2260     startpos = stringarg;
2261
2262     if (prog->extflags & RXf_GPOS_SEEN) {
2263         MAGIC *mg;
2264
2265         /* set reginfo->ganch, the position where \G can match */
2266
2267         reginfo->ganch =
2268             (flags & REXEC_IGNOREPOS)
2269             ? stringarg /* use start pos rather than pos() */
2270             : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2271               /* Defined pos(): */
2272             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2273             : strbeg; /* pos() not defined; use start of string */
2274
2275         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2276             "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
2277
2278         /* in the presence of \G, we may need to start looking earlier in
2279          * the string than the suggested start point of stringarg:
2280          * if gofs->prog is set, then that's a known, fixed minimum
2281          * offset, such as
2282          * /..\G/:   gofs = 2
2283          * /ab|c\G/: gofs = 1
2284          * or if the minimum offset isn't known, then we have to go back
2285          * to the start of the string, e.g. /w+\G/
2286          */
2287
2288         if (prog->extflags & RXf_ANCH_GPOS) {
2289             startpos  = reginfo->ganch - prog->gofs;
2290             if (startpos <
2291                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2292             {
2293                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2294                         "fail: ganch-gofs before earliest possible start\n"));
2295                 return 0;
2296             }
2297         }
2298         else if (prog->gofs) {
2299             if (startpos - prog->gofs < strbeg)
2300                 startpos = strbeg;
2301             else
2302                 startpos -= prog->gofs;
2303         }
2304         else if (prog->extflags & RXf_GPOS_FLOAT)
2305             startpos = strbeg;
2306     }
2307
2308     minlen = prog->minlen;
2309     if ((startpos + minlen) > strend || startpos < strbeg) {
2310         DEBUG_r(PerlIO_printf(Perl_debug_log,
2311                     "Regex match can't succeed, so not even tried\n"));
2312         return 0;
2313     }
2314
2315     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2316      * which will call destuctors to reset PL_regmatch_state, free higher
2317      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2318      * regmatch_info_aux_eval */
2319
2320     oldsave = PL_savestack_ix;
2321
2322     s = startpos;
2323
2324     if ((prog->extflags & RXf_USE_INTUIT)
2325         && !(flags & REXEC_CHECKED))
2326     {
2327         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2328                                     flags, NULL);
2329         if (!s)
2330             return 0;
2331
2332         if (prog->extflags & RXf_CHECK_ALL) {
2333             /* we can match based purely on the result of INTUIT.
2334              * Set up captures etc just for $& and $-[0]
2335              * (an intuit-only match wont have $1,$2,..) */
2336             assert(!prog->nparens);
2337
2338             /* s/// doesn't like it if $& is earlier than where we asked it to
2339              * start searching (which can happen on something like /.\G/) */
2340             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2341                     && (s < stringarg))
2342             {
2343                 /* this should only be possible under \G */
2344                 assert(prog->extflags & RXf_GPOS_SEEN);
2345                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2347                 goto phooey;
2348             }
2349
2350             /* match via INTUIT shouldn't have any captures.
2351              * Let @-, @+, $^N know */
2352             prog->lastparen = prog->lastcloseparen = 0;
2353             RX_MATCH_UTF8_set(rx, utf8_target);
2354             prog->offs[0].start = s - strbeg;
2355             prog->offs[0].end = utf8_target
2356                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2357                 : s - strbeg + prog->minlenret;
2358             if ( !(flags & REXEC_NOT_FIRST) )
2359                 S_reg_set_capture_string(aTHX_ rx,
2360                                         strbeg, strend,
2361                                         sv, flags, utf8_target);
2362
2363             return 1;
2364         }
2365     }
2366
2367     multiline = prog->extflags & RXf_PMf_MULTILINE;
2368
2369     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2370         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2371                               "String too short [regexec_flags]...\n"));
2372         goto phooey;
2373     }
2374
2375     /* Check validity of program. */
2376     if (UCHARAT(progi->program) != REG_MAGIC) {
2377         Perl_croak(aTHX_ "corrupted regexp program");
2378     }
2379
2380     RX_MATCH_TAINTED_off(rx);
2381
2382     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2383     reginfo->intuit = 0;
2384     reginfo->is_utf8_target = cBOOL(utf8_target);
2385     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2386     reginfo->warned = FALSE;
2387     reginfo->strbeg  = strbeg;
2388     reginfo->sv = sv;
2389     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2390     reginfo->strend = strend;
2391     /* see how far we have to get to not match where we matched before */
2392     reginfo->till = stringarg + minend;
2393
2394     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2395         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2396            S_cleanup_regmatch_info_aux has executed (registered by
2397            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2398            magic belonging to this SV.
2399            Not newSVsv, either, as it does not COW.
2400         */
2401         reginfo->sv = newSV(0);
2402         sv_setsv(reginfo->sv, sv);
2403         SAVEFREESV(reginfo->sv);
2404     }
2405
2406     /* reserve next 2 or 3 slots in PL_regmatch_state:
2407      * slot N+0: may currently be in use: skip it
2408      * slot N+1: use for regmatch_info_aux struct
2409      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2410      * slot N+3: ready for use by regmatch()
2411      */
2412
2413     {
2414         regmatch_state *old_regmatch_state;
2415         regmatch_slab  *old_regmatch_slab;
2416         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2417
2418         /* on first ever match, allocate first slab */
2419         if (!PL_regmatch_slab) {
2420             Newx(PL_regmatch_slab, 1, regmatch_slab);
2421             PL_regmatch_slab->prev = NULL;
2422             PL_regmatch_slab->next = NULL;
2423             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2424         }
2425
2426         old_regmatch_state = PL_regmatch_state;
2427         old_regmatch_slab  = PL_regmatch_slab;
2428
2429         for (i=0; i <= max; i++) {
2430             if (i == 1)
2431                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2432             else if (i ==2)
2433                 reginfo->info_aux_eval =
2434                 reginfo->info_aux->info_aux_eval =
2435                             &(PL_regmatch_state->u.info_aux_eval);
2436
2437             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2438                 PL_regmatch_state = S_push_slab(aTHX);
2439         }
2440
2441         /* note initial PL_regmatch_state position; at end of match we'll
2442          * pop back to there and free any higher slabs */
2443
2444         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2445         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2446         reginfo->info_aux->poscache = NULL;
2447
2448         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2449
2450         if ((prog->extflags & RXf_EVAL_SEEN))
2451             S_setup_eval_state(aTHX_ reginfo);
2452         else
2453             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2454     }
2455
2456     /* If there is a "must appear" string, look for it. */
2457
2458     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2459         /* We have to be careful. If the previous successful match
2460            was from this regex we don't want a subsequent partially
2461            successful match to clobber the old results.
2462            So when we detect this possibility we add a swap buffer
2463            to the re, and switch the buffer each match. If we fail,
2464            we switch it back; otherwise we leave it swapped.
2465         */
2466         swap = prog->offs;
2467         /* do we need a save destructor here for eval dies? */
2468         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2469         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2470             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2471             PTR2UV(prog),
2472             PTR2UV(swap),
2473             PTR2UV(prog->offs)
2474         ));
2475     }
2476
2477     /* Simplest case:  anchored match need be tried only once. */
2478     /*  [unless only anchor is BOL and multiline is set] */
2479     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2480         if (s == startpos && regtry(reginfo, &s))
2481             goto got_it;
2482         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2483                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2484         {
2485             char *end;
2486
2487             if (minlen)
2488                 dontbother = minlen - 1;
2489             end = HOP3c(strend, -dontbother, strbeg) - 1;
2490             /* for multiline we only have to try after newlines */
2491             if (prog->check_substr || prog->check_utf8) {
2492                 /* because of the goto we can not easily reuse the macros for bifurcating the
2493                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2494                 if (utf8_target) {
2495                     if (s == startpos)
2496                         goto after_try_utf8;
2497                     while (1) {
2498                         if (regtry(reginfo, &s)) {
2499                             goto got_it;
2500                         }
2501                       after_try_utf8:
2502                         if (s > end) {
2503                             goto phooey;
2504                         }
2505                         if (prog->extflags & RXf_USE_INTUIT) {
2506                             s = re_intuit_start(rx, sv, strbeg,
2507                                     s + UTF8SKIP(s), strend, flags, NULL);
2508                             if (!s) {
2509                                 goto phooey;
2510                             }
2511                         }
2512                         else {
2513                             s += UTF8SKIP(s);
2514                         }
2515                     }
2516                 } /* end search for check string in unicode */
2517                 else {
2518                     if (s == startpos) {
2519                         goto after_try_latin;
2520                     }
2521                     while (1) {
2522                         if (regtry(reginfo, &s)) {
2523                             goto got_it;
2524                         }
2525                       after_try_latin:
2526                         if (s > end) {
2527                             goto phooey;
2528                         }
2529                         if (prog->extflags & RXf_USE_INTUIT) {
2530                             s = re_intuit_start(rx, sv, strbeg,
2531                                         s + 1, strend, flags, NULL);
2532                             if (!s) {
2533                                 goto phooey;
2534                             }
2535                         }
2536                         else {
2537                             s++;
2538                         }
2539                     }
2540                 } /* end search for check string in latin*/
2541             } /* end search for check string */
2542             else { /* search for newline */
2543                 if (s > startpos) {
2544                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2545                     s--;
2546                 }
2547                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2548                 while (s <= end) { /* note it could be possible to match at the end of the string */
2549                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2550                         if (regtry(reginfo, &s))
2551                             goto got_it;
2552                     }
2553                 }
2554             } /* end search for newline */
2555         } /* end anchored/multiline check string search */
2556         goto phooey;
2557     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2558     {
2559         /* For anchored \G, the only position it can match from is
2560          * (ganch-gofs); we already set startpos to this above; if intuit
2561          * moved us on from there, we can't possibly succeed */
2562         assert(startpos == reginfo->ganch - prog->gofs);
2563         if (s == startpos && regtry(reginfo, &s))
2564             goto got_it;
2565         goto phooey;
2566     }
2567
2568     /* Messy cases:  unanchored match. */
2569     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2570         /* we have /x+whatever/ */
2571         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2572         char ch;
2573 #ifdef DEBUGGING
2574         int did_match = 0;
2575 #endif
2576         if (utf8_target) {
2577             if (! prog->anchored_utf8) {
2578                 to_utf8_substr(prog);
2579             }
2580             ch = SvPVX_const(prog->anchored_utf8)[0];
2581             REXEC_FBC_SCAN(
2582                 if (*s == ch) {
2583                     DEBUG_EXECUTE_r( did_match = 1 );
2584                     if (regtry(reginfo, &s)) goto got_it;
2585                     s += UTF8SKIP(s);
2586                     while (s < strend && *s == ch)
2587                         s += UTF8SKIP(s);
2588                 }
2589             );
2590
2591         }
2592         else {
2593             if (! prog->anchored_substr) {
2594                 if (! to_byte_substr(prog)) {
2595                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2596                 }
2597             }
2598             ch = SvPVX_const(prog->anchored_substr)[0];
2599             REXEC_FBC_SCAN(
2600                 if (*s == ch) {
2601                     DEBUG_EXECUTE_r( did_match = 1 );
2602                     if (regtry(reginfo, &s)) goto got_it;
2603                     s++;
2604                     while (s < strend && *s == ch)
2605                         s++;
2606                 }
2607             );
2608         }
2609         DEBUG_EXECUTE_r(if (!did_match)
2610                 PerlIO_printf(Perl_debug_log,
2611                                   "Did not find anchored character...\n")
2612                );
2613     }
2614     else if (prog->anchored_substr != NULL
2615               || prog->anchored_utf8 != NULL
2616               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2617                   && prog->float_max_offset < strend - s)) {
2618         SV *must;
2619         SSize_t back_max;
2620         SSize_t back_min;
2621         char *last;
2622         char *last1;            /* Last position checked before */
2623 #ifdef DEBUGGING
2624         int did_match = 0;
2625 #endif
2626         if (prog->anchored_substr || prog->anchored_utf8) {
2627             if (utf8_target) {
2628                 if (! prog->anchored_utf8) {
2629                     to_utf8_substr(prog);
2630                 }
2631                 must = prog->anchored_utf8;
2632             }
2633             else {
2634                 if (! prog->anchored_substr) {
2635                     if (! to_byte_substr(prog)) {
2636                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2637                     }
2638                 }
2639                 must = prog->anchored_substr;
2640             }
2641             back_max = back_min = prog->anchored_offset;
2642         } else {
2643             if (utf8_target) {
2644                 if (! prog->float_utf8) {
2645                     to_utf8_substr(prog);
2646                 }
2647                 must = prog->float_utf8;
2648             }
2649             else {
2650                 if (! prog->float_substr) {
2651                     if (! to_byte_substr(prog)) {
2652                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2653                     }
2654                 }
2655                 must = prog->float_substr;
2656             }
2657             back_max = prog->float_max_offset;
2658             back_min = prog->float_min_offset;
2659         }
2660
2661         if (back_min<0) {
2662             last = strend;
2663         } else {
2664             last = HOP3c(strend,        /* Cannot start after this */
2665                   -(SSize_t)(CHR_SVLEN(must)
2666                          - (SvTAIL(must) != 0) + back_min), strbeg);
2667         }
2668         if (s > reginfo->strbeg)
2669             last1 = HOPc(s, -1);
2670         else
2671             last1 = s - 1;      /* bogus */
2672
2673         /* XXXX check_substr already used to find "s", can optimize if
2674            check_substr==must. */
2675         dontbother = 0;
2676         strend = HOPc(strend, -dontbother);
2677         while ( (s <= last) &&
2678                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2679                                   (unsigned char*)strend, must,
2680                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2681             DEBUG_EXECUTE_r( did_match = 1 );
2682             if (HOPc(s, -back_max) > last1) {
2683                 last1 = HOPc(s, -back_min);
2684                 s = HOPc(s, -back_max);
2685             }
2686             else {
2687                 char * const t = (last1 >= reginfo->strbeg)
2688                                     ? HOPc(last1, 1) : last1 + 1;
2689
2690                 last1 = HOPc(s, -back_min);
2691                 s = t;
2692             }
2693             if (utf8_target) {
2694                 while (s <= last1) {
2695                     if (regtry(reginfo, &s))
2696                         goto got_it;
2697                     if (s >= last1) {
2698                         s++; /* to break out of outer loop */
2699                         break;
2700                     }
2701                     s += UTF8SKIP(s);
2702                 }
2703             }
2704             else {
2705                 while (s <= last1) {
2706                     if (regtry(reginfo, &s))
2707                         goto got_it;
2708                     s++;
2709                 }
2710             }
2711         }
2712         DEBUG_EXECUTE_r(if (!did_match) {
2713             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2714                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2715             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2716                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2717                                ? "anchored" : "floating"),
2718                 quoted, RE_SV_TAIL(must));
2719         });
2720         goto phooey;
2721     }
2722     else if ( (c = progi->regstclass) ) {
2723         if (minlen) {
2724             const OPCODE op = OP(progi->regstclass);
2725             /* don't bother with what can't match */
2726             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2727                 strend = HOPc(strend, -(minlen - 1));
2728         }
2729         DEBUG_EXECUTE_r({
2730             SV * const prop = sv_newmortal();
2731             regprop(prog, prop, c);
2732             {
2733                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2734                     s,strend-s,60);
2735                 PerlIO_printf(Perl_debug_log,
2736                     "Matching stclass %.*s against %s (%d bytes)\n",
2737                     (int)SvCUR(prop), SvPVX_const(prop),
2738                      quoted, (int)(strend - s));
2739             }
2740         });
2741         if (find_byclass(prog, c, s, strend, reginfo))
2742             goto got_it;
2743         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2744     }
2745     else {
2746         dontbother = 0;
2747         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2748             /* Trim the end. */
2749             char *last= NULL;
2750             SV* float_real;
2751             STRLEN len;
2752             const char *little;
2753
2754             if (utf8_target) {
2755                 if (! prog->float_utf8) {
2756                     to_utf8_substr(prog);
2757                 }
2758                 float_real = prog->float_utf8;
2759             }
2760             else {
2761                 if (! prog->float_substr) {
2762                     if (! to_byte_substr(prog)) {
2763                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2764                     }
2765                 }
2766                 float_real = prog->float_substr;
2767             }
2768
2769             little = SvPV_const(float_real, len);
2770             if (SvTAIL(float_real)) {
2771                     /* This means that float_real contains an artificial \n on
2772                      * the end due to the presence of something like this:
2773                      * /foo$/ where we can match both "foo" and "foo\n" at the
2774                      * end of the string.  So we have to compare the end of the
2775                      * string first against the float_real without the \n and
2776                      * then against the full float_real with the string.  We
2777                      * have to watch out for cases where the string might be
2778                      * smaller than the float_real or the float_real without
2779                      * the \n. */
2780                     char *checkpos= strend - len;
2781                     DEBUG_OPTIMISE_r(
2782                         PerlIO_printf(Perl_debug_log,
2783                             "%sChecking for float_real.%s\n",
2784                             PL_colors[4], PL_colors[5]));
2785                     if (checkpos + 1 < strbeg) {
2786                         /* can't match, even if we remove the trailing \n
2787                          * string is too short to match */
2788                         DEBUG_EXECUTE_r(
2789                             PerlIO_printf(Perl_debug_log,
2790                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2791                                 PL_colors[4], PL_colors[5]));
2792                         goto phooey;
2793                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2794                         /* can match, the end of the string matches without the
2795                          * "\n" */
2796                         last = checkpos + 1;
2797                     } else if (checkpos < strbeg) {
2798                         /* cant match, string is too short when the "\n" is
2799                          * included */
2800                         DEBUG_EXECUTE_r(
2801                             PerlIO_printf(Perl_debug_log,
2802                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2803                                 PL_colors[4], PL_colors[5]));
2804                         goto phooey;
2805                     } else if (!multiline) {
2806                         /* non multiline match, so compare with the "\n" at the
2807                          * end of the string */
2808                         if (memEQ(checkpos, little, len)) {
2809                             last= checkpos;
2810                         } else {
2811                             DEBUG_EXECUTE_r(
2812                                 PerlIO_printf(Perl_debug_log,
2813                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2814                                     PL_colors[4], PL_colors[5]));
2815                             goto phooey;
2816                         }
2817                     } else {
2818                         /* multiline match, so we have to search for a place
2819                          * where the full string is located */
2820                         goto find_last;
2821                     }
2822             } else {
2823                   find_last:
2824                     if (len)
2825                         last = rninstr(s, strend, little, little + len);
2826                     else
2827                         last = strend;  /* matching "$" */
2828             }
2829             if (!last) {
2830                 /* at one point this block contained a comment which was
2831                  * probably incorrect, which said that this was a "should not
2832                  * happen" case.  Even if it was true when it was written I am
2833                  * pretty sure it is not anymore, so I have removed the comment
2834                  * and replaced it with this one. Yves */
2835                 DEBUG_EXECUTE_r(
2836                     PerlIO_printf(Perl_debug_log,
2837                         "String does not contain required substring, cannot match.\n"
2838                     ));
2839                 goto phooey;
2840             }
2841             dontbother = strend - last + prog->float_min_offset;
2842         }
2843         if (minlen && (dontbother < minlen))
2844             dontbother = minlen - 1;
2845         strend -= dontbother;              /* this one's always in bytes! */
2846         /* We don't know much -- general case. */
2847         if (utf8_target) {
2848             for (;;) {
2849                 if (regtry(reginfo, &s))
2850                     goto got_it;
2851                 if (s >= strend)
2852                     break;
2853                 s += UTF8SKIP(s);
2854             };
2855         }
2856         else {
2857             do {
2858                 if (regtry(reginfo, &s))
2859                     goto got_it;
2860             } while (s++ < strend);
2861         }
2862     }
2863
2864     /* Failure. */
2865     goto phooey;
2866
2867 got_it:
2868     /* s/// doesn't like it if $& is earlier than where we asked it to
2869      * start searching (which can happen on something like /.\G/) */
2870     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2871             && (prog->offs[0].start < stringarg - strbeg))
2872     {
2873         /* this should only be possible under \G */
2874         assert(prog->extflags & RXf_GPOS_SEEN);
2875         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2876             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2877         goto phooey;
2878     }
2879
2880     DEBUG_BUFFERS_r(
2881         if (swap)
2882             PerlIO_printf(Perl_debug_log,
2883                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2884                 PTR2UV(prog),
2885                 PTR2UV(swap)
2886             );
2887     );
2888     Safefree(swap);
2889
2890     /* clean up; this will trigger destructors that will free all slabs
2891      * above the current one, and cleanup the regmatch_info_aux
2892      * and regmatch_info_aux_eval sructs */
2893
2894     LEAVE_SCOPE(oldsave);
2895
2896     if (RXp_PAREN_NAMES(prog))
2897         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2898
2899     RX_MATCH_UTF8_set(rx, utf8_target);
2900
2901     /* make sure $`, $&, $', and $digit will work later */
2902     if ( !(flags & REXEC_NOT_FIRST) )
2903         S_reg_set_capture_string(aTHX_ rx,
2904                                     strbeg, reginfo->strend,
2905                                     sv, flags, utf8_target);
2906
2907     return 1;
2908
2909 phooey:
2910     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2911                           PL_colors[4], PL_colors[5]));
2912
2913     /* clean up; this will trigger destructors that will free all slabs
2914      * above the current one, and cleanup the regmatch_info_aux
2915      * and regmatch_info_aux_eval sructs */
2916
2917     LEAVE_SCOPE(oldsave);
2918
2919     if (swap) {
2920         /* we failed :-( roll it back */
2921         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2922             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2923             PTR2UV(prog),
2924             PTR2UV(prog->offs),
2925             PTR2UV(swap)
2926         ));
2927         Safefree(prog->offs);
2928         prog->offs = swap;
2929     }
2930     return 0;
2931 }
2932
2933
2934 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2935  * Do inc before dec, in case old and new rex are the same */
2936 #define SET_reg_curpm(Re2) \
2937     if (reginfo->info_aux_eval) {                   \
2938         (void)ReREFCNT_inc(Re2);                    \
2939         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2940         PM_SETRE((PL_reg_curpm), (Re2));            \
2941     }
2942
2943
2944 /*
2945  - regtry - try match at specific point
2946  */
2947 STATIC I32                      /* 0 failure, 1 success */
2948 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2949 {
2950     dVAR;
2951     CHECKPOINT lastcp;
2952     REGEXP *const rx = reginfo->prog;
2953     regexp *const prog = ReANY(rx);
2954     SSize_t result;
2955     RXi_GET_DECL(prog,progi);
2956     GET_RE_DEBUG_FLAGS_DECL;
2957
2958     PERL_ARGS_ASSERT_REGTRY;
2959
2960     reginfo->cutpoint=NULL;
2961
2962     prog->offs[0].start = *startposp - reginfo->strbeg;
2963     prog->lastparen = 0;
2964     prog->lastcloseparen = 0;
2965
2966     /* XXXX What this code is doing here?!!!  There should be no need
2967        to do this again and again, prog->lastparen should take care of
2968        this!  --ilya*/
2969
2970     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2971      * Actually, the code in regcppop() (which Ilya may be meaning by
2972      * prog->lastparen), is not needed at all by the test suite
2973      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2974      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2975      * Meanwhile, this code *is* needed for the
2976      * above-mentioned test suite tests to succeed.  The common theme
2977      * on those tests seems to be returning null fields from matches.
2978      * --jhi updated by dapm */
2979 #if 1
2980     if (prog->nparens) {
2981         regexp_paren_pair *pp = prog->offs;
2982         I32 i;
2983         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2984             ++pp;
2985             pp->start = -1;
2986             pp->end = -1;
2987         }
2988     }
2989 #endif
2990     REGCP_SET(lastcp);
2991     result = regmatch(reginfo, *startposp, progi->program + 1);
2992     if (result != -1) {
2993         prog->offs[0].end = result;
2994         return 1;
2995     }
2996     if (reginfo->cutpoint)
2997         *startposp= reginfo->cutpoint;
2998     REGCP_UNWIND(lastcp);
2999     return 0;
3000 }
3001
3002
3003 #define sayYES goto yes
3004 #define sayNO goto no
3005 #define sayNO_SILENT goto no_silent
3006
3007 /* we dont use STMT_START/END here because it leads to
3008    "unreachable code" warnings, which are bogus, but distracting. */
3009 #define CACHEsayNO \
3010     if (ST.cache_mask) \
3011        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3012     sayNO
3013
3014 /* this is used to determine how far from the left messages like
3015    'failed...' are printed. It should be set such that messages
3016    are inline with the regop output that created them.
3017 */
3018 #define REPORT_CODE_OFF 32
3019
3020
3021 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3022 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3023 #define CHRTEST_NOT_A_CP_1 -999
3024 #define CHRTEST_NOT_A_CP_2 -998
3025
3026 /* grab a new slab and return the first slot in it */
3027
3028 STATIC regmatch_state *
3029 S_push_slab(pTHX)
3030 {
3031 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3032     dMY_CXT;
3033 #endif
3034     regmatch_slab *s = PL_regmatch_slab->next;
3035     if (!s) {
3036         Newx(s, 1, regmatch_slab);
3037         s->prev = PL_regmatch_slab;
3038         s->next = NULL;
3039         PL_regmatch_slab->next = s;
3040     }
3041     PL_regmatch_slab = s;
3042     return SLAB_FIRST(s);
3043 }
3044
3045
3046 /* push a new state then goto it */
3047
3048 #define PUSH_STATE_GOTO(state, node, input) \
3049     pushinput = input; \
3050     scan = node; \
3051     st->resume_state = state; \
3052     goto push_state;
3053
3054 /* push a new state with success backtracking, then goto it */
3055
3056 #define PUSH_YES_STATE_GOTO(state, node, input) \
3057     pushinput = input; \
3058     scan = node; \
3059     st->resume_state = state; \
3060     goto push_yes_state;
3061
3062
3063
3064
3065 /*
3066
3067 regmatch() - main matching routine
3068
3069 This is basically one big switch statement in a loop. We execute an op,
3070 set 'next' to point the next op, and continue. If we come to a point which
3071 we may need to backtrack to on failure such as (A|B|C), we push a
3072 backtrack state onto the backtrack stack. On failure, we pop the top
3073 state, and re-enter the loop at the state indicated. If there are no more
3074 states to pop, we return failure.
3075
3076 Sometimes we also need to backtrack on success; for example /A+/, where
3077 after successfully matching one A, we need to go back and try to
3078 match another one; similarly for lookahead assertions: if the assertion
3079 completes successfully, we backtrack to the state just before the assertion
3080 and then carry on.  In these cases, the pushed state is marked as
3081 'backtrack on success too'. This marking is in fact done by a chain of
3082 pointers, each pointing to the previous 'yes' state. On success, we pop to
3083 the nearest yes state, discarding any intermediate failure-only states.
3084 Sometimes a yes state is pushed just to force some cleanup code to be
3085 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3086 it to free the inner regex.
3087
3088 Note that failure backtracking rewinds the cursor position, while
3089 success backtracking leaves it alone.
3090
3091 A pattern is complete when the END op is executed, while a subpattern
3092 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3093 ops trigger the "pop to last yes state if any, otherwise return true"
3094 behaviour.
3095
3096 A common convention in this function is to use A and B to refer to the two
3097 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3098 the subpattern to be matched possibly multiple times, while B is the entire
3099 rest of the pattern. Variable and state names reflect this convention.
3100
3101 The states in the main switch are the union of ops and failure/success of
3102 substates associated with with that op.  For example, IFMATCH is the op
3103 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3104 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3105 successfully matched A and IFMATCH_A_fail is a state saying that we have
3106 just failed to match A. Resume states always come in pairs. The backtrack
3107 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3108 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3109 on success or failure.
3110
3111 The struct that holds a backtracking state is actually a big union, with
3112 one variant for each major type of op. The variable st points to the
3113 top-most backtrack struct. To make the code clearer, within each
3114 block of code we #define ST to alias the relevant union.
3115
3116 Here's a concrete example of a (vastly oversimplified) IFMATCH
3117 implementation:
3118
3119     switch (state) {
3120     ....
3121
3122 #define ST st->u.ifmatch
3123
3124     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3125         ST.foo = ...; // some state we wish to save
3126         ...
3127         // push a yes backtrack state with a resume value of
3128         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3129         // first node of A:
3130         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3131         // NOTREACHED
3132
3133     case IFMATCH_A: // we have successfully executed A; now continue with B
3134         next = B;
3135         bar = ST.foo; // do something with the preserved value
3136         break;
3137
3138     case IFMATCH_A_fail: // A failed, so the assertion failed
3139         ...;   // do some housekeeping, then ...
3140         sayNO; // propagate the failure
3141
3142 #undef ST
3143
3144     ...
3145     }
3146
3147 For any old-timers reading this who are familiar with the old recursive
3148 approach, the code above is equivalent to:
3149
3150     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3151     {
3152         int foo = ...
3153         ...
3154         if (regmatch(A)) {
3155             next = B;
3156             bar = foo;
3157             break;
3158         }
3159         ...;   // do some housekeeping, then ...
3160         sayNO; // propagate the failure
3161     }
3162
3163 The topmost backtrack state, pointed to by st, is usually free. If you
3164 want to claim it, populate any ST.foo fields in it with values you wish to
3165 save, then do one of
3166
3167         PUSH_STATE_GOTO(resume_state, node, newinput);
3168         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3169
3170 which sets that backtrack state's resume value to 'resume_state', pushes a
3171 new free entry to the top of the backtrack stack, then goes to 'node'.
3172 On backtracking, the free slot is popped, and the saved state becomes the
3173 new free state. An ST.foo field in this new top state can be temporarily
3174 accessed to retrieve values, but once the main loop is re-entered, it
3175 becomes available for reuse.
3176
3177 Note that the depth of the backtrack stack constantly increases during the
3178 left-to-right execution of the pattern, rather than going up and down with
3179 the pattern nesting. For example the stack is at its maximum at Z at the
3180 end of the pattern, rather than at X in the following:
3181
3182     /(((X)+)+)+....(Y)+....Z/
3183
3184 The only exceptions to this are lookahead/behind assertions and the cut,
3185 (?>A), which pop all the backtrack states associated with A before
3186 continuing.
3187
3188 Backtrack state structs are allocated in slabs of about 4K in size.
3189 PL_regmatch_state and st always point to the currently active state,
3190 and PL_regmatch_slab points to the slab currently containing
3191 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3192 allocated, and is never freed until interpreter destruction. When the slab
3193 is full, a new one is allocated and chained to the end. At exit from
3194 regmatch(), slabs allocated since entry are freed.
3195
3196 */
3197
3198
3199 #define DEBUG_STATE_pp(pp)                                  \
3200     DEBUG_STATE_r({                                         \
3201         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3202         PerlIO_printf(Perl_debug_log,                       \
3203             "    %*s"pp" %s%s%s%s%s\n",                     \
3204             depth*2, "",                                    \
3205             PL_reg_name[st->resume_state],                     \
3206             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3207             ((st==yes_state) ? "Y" : ""),                   \
3208             ((st==mark_state) ? "M" : ""),                  \
3209             ((st==yes_state||st==mark_state) ? "]" : "")    \
3210         );                                                  \
3211     });
3212
3213
3214 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3215
3216 #ifdef DEBUGGING
3217
3218 STATIC void
3219 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3220     const char *start, const char *end, const char *blurb)
3221 {
3222     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3223
3224     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3225
3226     if (!PL_colorset)
3227             reginitcolors();
3228     {
3229         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3230             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3231
3232         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3233             start, end - start, 60);
3234
3235         PerlIO_printf(Perl_debug_log,
3236             "%s%s REx%s %s against %s\n",
3237                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3238
3239         if (utf8_target||utf8_pat)
3240             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3241                 utf8_pat ? "pattern" : "",
3242                 utf8_pat && utf8_target ? " and " : "",
3243                 utf8_target ? "string" : ""
3244             );
3245     }
3246 }
3247
3248 STATIC void
3249 S_dump_exec_pos(pTHX_ const char *locinput,
3250                       const regnode *scan,
3251                       const char *loc_regeol,
3252                       const char *loc_bostr,
3253                       const char *loc_reg_starttry,
3254                       const bool utf8_target)
3255 {
3256     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3257     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3258     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3259     /* The part of the string before starttry has one color
3260        (pref0_len chars), between starttry and current
3261        position another one (pref_len - pref0_len chars),
3262        after the current position the third one.
3263        We assume that pref0_len <= pref_len, otherwise we
3264        decrease pref0_len.  */
3265     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3266         ? (5 + taill) - l : locinput - loc_bostr;
3267     int pref0_len;
3268
3269     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3270
3271     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3272         pref_len++;
3273     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3274     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3275         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3276               ? (5 + taill) - pref_len : loc_regeol - locinput);
3277     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3278         l--;
3279     if (pref0_len < 0)
3280         pref0_len = 0;
3281     if (pref0_len > pref_len)
3282         pref0_len = pref_len;
3283     {
3284         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3285
3286         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3287             (locinput - pref_len),pref0_len, 60, 4, 5);
3288
3289         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3290                     (locinput - pref_len + pref0_len),
3291                     pref_len - pref0_len, 60, 2, 3);
3292
3293         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3294                     locinput, loc_regeol - locinput, 10, 0, 1);
3295
3296         const STRLEN tlen=len0+len1+len2;
3297         PerlIO_printf(Perl_debug_log,
3298                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3299                     (IV)(locinput - loc_bostr),
3300                     len0, s0,
3301                     len1, s1,
3302                     (docolor ? "" : "> <"),
3303                     len2, s2,
3304                     (int)(tlen > 19 ? 0 :  19 - tlen),
3305                     "");
3306     }
3307 }
3308
3309 #endif
3310
3311 /* reg_check_named_buff_matched()
3312  * Checks to see if a named buffer has matched. The data array of
3313  * buffer numbers corresponding to the buffer is expected to reside
3314  * in the regexp->data->data array in the slot stored in the ARG() of
3315  * node involved. Note that this routine doesn't actually care about the
3316  * name, that information is not preserved from compilation to execution.
3317  * Returns the index of the leftmost defined buffer with the given name
3318  * or 0 if non of the buffers matched.
3319  */
3320 STATIC I32
3321 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3322 {
3323     I32 n;
3324     RXi_GET_DECL(rex,rexi);
3325     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3326     I32 *nums=(I32*)SvPVX(sv_dat);
3327
3328     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3329
3330     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3331         if ((I32)rex->lastparen >= nums[n] &&
3332             rex->offs[nums[n]].end != -1)
3333         {
3334             return nums[n];
3335         }
3336     }
3337     return 0;
3338 }
3339
3340
3341 static bool
3342 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3343         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3344 {
3345     /* This function determines if there are one or two characters that match
3346      * the first character of the passed-in EXACTish node <text_node>, and if
3347      * so, returns them in the passed-in pointers.
3348      *
3349      * If it determines that no possible character in the target string can
3350      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3351      * the first character in <text_node> requires UTF-8 to represent, and the
3352      * target string isn't in UTF-8.)
3353      *
3354      * If there are more than two characters that could match the beginning of
3355      * <text_node>, or if more context is required to determine a match or not,
3356      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3357      *
3358      * The motiviation behind this function is to allow the caller to set up
3359      * tight loops for matching.  If <text_node> is of type EXACT, there is
3360      * only one possible character that can match its first character, and so
3361      * the situation is quite simple.  But things get much more complicated if
3362      * folding is involved.  It may be that the first character of an EXACTFish
3363      * node doesn't participate in any possible fold, e.g., punctuation, so it
3364      * can be matched only by itself.  The vast majority of characters that are
3365      * in folds match just two things, their lower and upper-case equivalents.
3366      * But not all are like that; some have multiple possible matches, or match
3367      * sequences of more than one character.  This function sorts all that out.
3368      *
3369      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3370      * loop of trying to match A*, we know we can't exit where the thing
3371      * following it isn't a B.  And something can't be a B unless it is the
3372      * beginning of B.  By putting a quick test for that beginning in a tight
3373      * loop, we can rule out things that can't possibly be B without having to
3374      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3375      * character, we can make a tight loop matching A*, using the outputs of
3376      * this function.
3377      *
3378      * If the target string to match isn't in UTF-8, and there aren't
3379      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3380      * the one or two possible octets (which are characters in this situation)
3381      * that can match.  In all cases, if there is only one character that can
3382      * match, *<c1p> and *<c2p> will be identical.
3383      *
3384      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3385      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3386      * can match the beginning of <text_node>.  They should be declared with at
3387      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3388      * undefined what these contain.)  If one or both of the buffers are
3389      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3390      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3391      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3392      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3393      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3394
3395     const bool utf8_target = reginfo->is_utf8_target;
3396
3397     UV c1 = CHRTEST_NOT_A_CP_1;
3398     UV c2 = CHRTEST_NOT_A_CP_2;
3399     bool use_chrtest_void = FALSE;
3400     const bool is_utf8_pat = reginfo->is_utf8_pat;
3401
3402     /* Used when we have both utf8 input and utf8 output, to avoid converting
3403      * to/from code points */
3404     bool utf8_has_been_setup = FALSE;
3405
3406     dVAR;
3407
3408     U8 *pat = (U8*)STRING(text_node);
3409
3410     if (OP(text_node) == EXACT) {
3411
3412         /* In an exact node, only one thing can be matched, that first
3413          * character.  If both the pat and the target are UTF-8, we can just
3414          * copy the input to the output, avoiding finding the code point of
3415          * that character */
3416         if (!is_utf8_pat) {
3417             c2 = c1 = *pat;
3418         }
3419         else if (utf8_target) {
3420             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3421             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3422             utf8_has_been_setup = TRUE;
3423         }
3424         else {
3425             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3426         }
3427     }
3428     else /* an EXACTFish node */
3429          if ((is_utf8_pat
3430                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3431                                                     pat + STR_LEN(text_node)))
3432              || (!is_utf8_pat
3433                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3434                                                     pat + STR_LEN(text_node))))
3435     {
3436         /* Multi-character folds require more context to sort out.  Also
3437          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3438          * handled outside this routine */
3439         use_chrtest_void = TRUE;
3440     }
3441     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3442         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3443         if (c1 > 256) {
3444             /* Load the folds hash, if not already done */
3445             SV** listp;
3446             if (! PL_utf8_foldclosures) {
3447                 if (! PL_utf8_tofold) {
3448                     U8 dummy[UTF8_MAXBYTES_CASE+1];
3449
3450                     /* Force loading this by folding an above-Latin1 char */
3451                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3452                     assert(PL_utf8_tofold); /* Verify that worked */
3453                 }
3454                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3455             }
3456
3457             /* The fold closures data structure is a hash with the keys being
3458              * the UTF-8 of every character that is folded to, like 'k', and
3459              * the values each an array of all code points that fold to its
3460              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3461              * not included */
3462             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3463                                      (char *) pat,
3464                                      UTF8SKIP(pat),
3465                                      FALSE))))
3466             {
3467                 /* Not found in the hash, therefore there are no folds
3468                  * containing it, so there is only a single character that
3469                  * could match */
3470                 c2 = c1;
3471             }
3472             else {  /* Does participate in folds */
3473                 AV* list = (AV*) *listp;
3474                 if (av_len(list) != 1) {
3475
3476                     /* If there aren't exactly two folds to this, it is outside
3477                      * the scope of this function */
3478                     use_chrtest_void = TRUE;
3479                 }
3480                 else {  /* There are two.  Get them */
3481                     SV** c_p = av_fetch(list, 0, FALSE);
3482                     if (c_p == NULL) {
3483                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3484                     }
3485                     c1 = SvUV(*c_p);
3486
3487                     c_p = av_fetch(list, 1, FALSE);
3488                     if (c_p == NULL) {
3489                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3490                     }
3491                     c2 = SvUV(*c_p);
3492
3493                     /* Folds that cross the 255/256 boundary are forbidden if
3494                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3495                      * pattern character is above 256, and its only other match
3496                      * is below 256, the only legal match will be to itself.
3497                      * We have thrown away the original, so have to compute
3498                      * which is the one above 255 */
3499                     if ((c1 < 256) != (c2 < 256)) {
3500                         if (OP(text_node) == EXACTFL
3501                             || ((OP(text_node) == EXACTFA
3502                                  || OP(text_node) == EXACTFA_NO_TRIE)
3503                                 && (isASCII(c1) || isASCII(c2))))
3504                         {
3505                             if (c1 < 256) {
3506                                 c1 = c2;
3507                             }
3508                             else {
3509                                 c2 = c1;
3510                             }
3511                         }
3512                     }
3513                 }
3514             }
3515         }
3516         else /* Here, c1 is < 255 */
3517              if (utf8_target
3518                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3519                  && OP(text_node) != EXACTFL
3520                  && ((OP(text_node) != EXACTFA
3521                       && OP(text_node) != EXACTFA_NO_TRIE)
3522                      || ! isASCII(c1)))
3523         {
3524             /* Here, there could be something above Latin1 in the target which
3525              * folds to this character in the pattern.  All such cases except
3526              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3527              * involved in their folds, so are outside the scope of this
3528              * function */
3529             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3530                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3531             }
3532             else {
3533                 use_chrtest_void = TRUE;
3534             }
3535         }
3536         else { /* Here nothing above Latin1 can fold to the pattern character */
3537             switch (OP(text_node)) {
3538
3539                 case EXACTFL:   /* /l rules */
3540                     c2 = PL_fold_locale[c1];
3541                     break;
3542
3543                 case EXACTF:   /* This node only generated for non-utf8
3544                                   patterns */
3545                     assert(! is_utf8_pat);
3546                     if (! utf8_target) {    /* /d rules */
3547                         c2 = PL_fold[c1];
3548                         break;
3549                     }
3550                     /* FALLTHROUGH */
3551                     /* /u rules for all these.  This happens to work for
3552                      * EXACTFA as nothing in Latin1 folds to ASCII */
3553                 case EXACTFA_NO_TRIE:   /* This node only generated for
3554                                            non-utf8 patterns */
3555                     assert(! is_utf8_pat);
3556                     /* FALL THROUGH */
3557                 case EXACTFA:
3558                 case EXACTFU_SS:
3559                 case EXACTFU:
3560                     c2 = PL_fold_latin1[c1];
3561                     break;
3562
3563                 default:
3564                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3565                     assert(0); /* NOTREACHED */
3566             }
3567         }
3568     }
3569
3570     /* Here have figured things out.  Set up the returns */
3571     if (use_chrtest_void) {
3572         *c2p = *c1p = CHRTEST_VOID;
3573     }
3574     else if (utf8_target) {
3575         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3576             uvchr_to_utf8(c1_utf8, c1);
3577             uvchr_to_utf8(c2_utf8, c2);
3578         }
3579
3580         /* Invariants are stored in both the utf8 and byte outputs; Use
3581          * negative numbers otherwise for the byte ones.  Make sure that the
3582          * byte ones are the same iff the utf8 ones are the same */
3583         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3584         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3585                 ? *c2_utf8
3586                 : (c1 == c2)
3587                   ? CHRTEST_NOT_A_CP_1
3588                   : CHRTEST_NOT_A_CP_2;
3589     }
3590     else if (c1 > 255) {
3591        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3592                            can represent */
3593            return FALSE;
3594        }
3595
3596        *c1p = *c2p = c2;    /* c2 is the only representable value */
3597     }
3598     else {  /* c1 is representable; see about c2 */
3599        *c1p = c1;
3600        *c2p = (c2 < 256) ? c2 : c1;
3601     }
3602
3603     return TRUE;
3604 }
3605
3606 /* returns -1 on failure, $+[0] on success */
3607 STATIC SSize_t
3608 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3609 {
3610 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3611     dMY_CXT;
3612 #endif
3613     dVAR;
3614     const bool utf8_target = reginfo->is_utf8_target;
3615     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3616     REGEXP *rex_sv = reginfo->prog;
3617     regexp *rex = ReANY(rex_sv);
3618     RXi_GET_DECL(rex,rexi);
3619     /* the current state. This is a cached copy of PL_regmatch_state */
3620     regmatch_state *st;
3621     /* cache heavy used fields of st in registers */
3622     regnode *scan;
3623     regnode *next;
3624     U32 n = 0;  /* general value; init to avoid compiler warning */
3625     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3626     char *locinput = startpos;
3627     char *pushinput; /* where to continue after a PUSH */
3628     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3629
3630     bool result = 0;        /* return value of S_regmatch */
3631     int depth = 0;          /* depth of backtrack stack */
3632     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3633     const U32 max_nochange_depth =
3634         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3635         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3636     regmatch_state *yes_state = NULL; /* state to pop to on success of
3637                                                             subpattern */
3638     /* mark_state piggy backs on the yes_state logic so that when we unwind
3639        the stack on success we can update the mark_state as we go */
3640     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3641     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3642     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3643     U32 state_num;
3644     bool no_final = 0;      /* prevent failure from backtracking? */
3645     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3646     char *startpoint = locinput;
3647     SV *popmark = NULL;     /* are we looking for a mark? */
3648     SV *sv_commit = NULL;   /* last mark name seen in failure */
3649     SV *sv_yes_mark = NULL; /* last mark name we have seen
3650                                during a successful match */
3651     U32 lastopen = 0;       /* last open we saw */
3652     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3653     SV* const oreplsv = GvSVn(PL_replgv);
3654     /* these three flags are set by various ops to signal information to
3655      * the very next op. They have a useful lifetime of exactly one loop
3656      * iteration, and are not preserved or restored by state pushes/pops
3657      */
3658     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3659     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3660     int logical = 0;        /* the following EVAL is:
3661                                 0: (?{...})
3662                                 1: (?(?{...})X|Y)
3663                                 2: (??{...})
3664                                or the following IFMATCH/UNLESSM is:
3665                                 false: plain (?=foo)
3666                                 true:  used as a condition: (?(?=foo))
3667                             */
3668     PAD* last_pad = NULL;
3669     dMULTICALL;
3670     I32 gimme = G_SCALAR;
3671     CV *caller_cv = NULL;       /* who called us */
3672     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3673     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3674     U32 maxopenparen = 0;       /* max '(' index seen so far */
3675     int to_complement;  /* Invert the result? */
3676     _char_class_number classnum;
3677     bool is_utf8_pat = reginfo->is_utf8_pat;
3678
3679 #ifdef DEBUGGING
3680     GET_RE_DEBUG_FLAGS_DECL;
3681 #endif
3682
3683     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3684     multicall_oldcatch = 0;
3685     multicall_cv = NULL;
3686     cx = NULL;
3687     PERL_UNUSED_VAR(multicall_cop);
3688     PERL_UNUSED_VAR(newsp);
3689
3690
3691     PERL_ARGS_ASSERT_REGMATCH;
3692
3693     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3694             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3695     }));
3696
3697     st = PL_regmatch_state;
3698
3699     /* Note that nextchr is a byte even in UTF */
3700     SET_nextchr;
3701     scan = prog;
3702     while (scan != NULL) {
3703
3704         DEBUG_EXECUTE_r( {
3705             SV * const prop = sv_newmortal();
3706             regnode *rnext=regnext(scan);
3707             DUMP_EXEC_POS( locinput, scan, utf8_target );
3708             regprop(rex, prop, scan);
3709
3710             PerlIO_printf(Perl_debug_log,
3711                     "%3"IVdf":%*s%s(%"IVdf")\n",
3712                     (IV)(scan - rexi->program), depth*2, "",
3713                     SvPVX_const(prop),
3714                     (PL_regkind[OP(scan)] == END || !rnext) ?
3715                         0 : (IV)(rnext - rexi->program));
3716         });
3717
3718         next = scan + NEXT_OFF(scan);
3719         if (next == scan)
3720             next = NULL;
3721         state_num = OP(scan);
3722
3723       reenter_switch:
3724         to_complement = 0;
3725
3726         SET_nextchr;
3727         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3728
3729         switch (state_num) {
3730         case BOL: /*  /^../  */
3731             if (locinput == reginfo->strbeg)
3732                 break;
3733             sayNO;
3734
3735         case MBOL: /*  /^../m  */
3736             if (locinput == reginfo->strbeg ||
3737                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3738             {
3739                 break;
3740             }
3741             sayNO;
3742
3743         case SBOL: /*  /^../s  */
3744             if (locinput == reginfo->strbeg)
3745                 break;
3746             sayNO;
3747
3748         case GPOS: /*  \G  */
3749             if (locinput == reginfo->ganch)
3750                 break;
3751             sayNO;
3752
3753         case KEEPS: /*   \K  */
3754             /* update the startpoint */
3755             st->u.keeper.val = rex->offs[0].start;
3756             rex->offs[0].start = locinput - reginfo->strbeg;
3757             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3758             assert(0); /*NOTREACHED*/
3759         case KEEPS_next_fail:
3760             /* rollback the start point change */
3761             rex->offs[0].start = st->u.keeper.val;
3762             sayNO_SILENT;
3763             assert(0); /*NOTREACHED*/
3764
3765         case MEOL: /* /..$/m  */
3766             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3767                 sayNO;
3768             break;
3769
3770         case EOL: /* /..$/  */
3771             /* FALL THROUGH */
3772         case SEOL: /* /..$/s  */
3773             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3774                 sayNO;
3775             if (reginfo->strend - locinput > 1)
3776                 sayNO;
3777             break;
3778
3779         case EOS: /*  \z  */
3780             if (!NEXTCHR_IS_EOS)
3781                 sayNO;
3782             break;
3783
3784         case SANY: /*  /./s  */
3785             if (NEXTCHR_IS_EOS)
3786                 sayNO;
3787             goto increment_locinput;
3788
3789         case CANY: /*  \C  */
3790             if (NEXTCHR_IS_EOS)
3791                 sayNO;
3792             locinput++;
3793             break;
3794
3795         case REG_ANY: /*  /./  */
3796             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3797                 sayNO;
3798             goto increment_locinput;
3799
3800
3801 #undef  ST
3802 #define ST st->u.trie
3803         case TRIEC: /* (ab|cd) with known charclass */
3804             /* In this case the charclass data is available inline so
3805                we can fail fast without a lot of extra overhead.
3806              */
3807             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3808                 DEBUG_EXECUTE_r(
3809                     PerlIO_printf(Perl_debug_log,
3810                               "%*s  %sfailed to match trie start class...%s\n",
3811                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3812                 );
3813                 sayNO_SILENT;
3814                 assert(0); /* NOTREACHED */
3815             }
3816             /* FALL THROUGH */
3817         case TRIE:  /* (ab|cd)  */
3818             /* the basic plan of execution of the trie is:
3819              * At the beginning, run though all the states, and
3820              * find the longest-matching word. Also remember the position
3821              * of the shortest matching word. For example, this pattern:
3822              *    1  2 3 4    5
3823              *    ab|a|x|abcd|abc
3824              * when matched against the string "abcde", will generate
3825              * accept states for all words except 3, with the longest
3826              * matching word being 4, and the shortest being 2 (with
3827              * the position being after char 1 of the string).
3828              *
3829              * Then for each matching word, in word order (i.e. 1,2,4,5),
3830              * we run the remainder of the pattern; on each try setting
3831              * the current position to the character following the word,
3832              * returning to try the next word on failure.
3833              *
3834              * We avoid having to build a list of words at runtime by
3835              * using a compile-time structure, wordinfo[].prev, which
3836              * gives, for each word, the previous accepting word (if any).
3837              * In the case above it would contain the mappings 1->2, 2->0,
3838              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3839              * the longest word (4 above), a list of all words, by
3840              * following the list of prev pointers; this gives us the
3841              * unordered list 4,5,1,2. Then given the current word we have
3842              * just tried, we can go through the list and find the
3843              * next-biggest word to try (so if we just failed on word 2,
3844              * the next in the list is 4).
3845              *
3846              * Since at runtime we don't record the matching position in
3847              * the string for each word, we have to work that out for
3848              * each word we're about to process. The wordinfo table holds
3849              * the character length of each word; given that we recorded
3850              * at the start: the position of the shortest word and its
3851              * length in chars, we just need to move the pointer the
3852              * difference between the two char lengths. Depending on
3853              * Unicode status and folding, that's cheap or expensive.
3854              *
3855              * This algorithm is optimised for the case where are only a
3856              * small number of accept states, i.e. 0,1, or maybe 2.
3857              * With lots of accepts states, and having to try all of them,
3858              * it becomes quadratic on number of accept states to find all
3859              * the next words.
3860              */
3861
3862             {
3863                 /* what type of TRIE am I? (utf8 makes this contextual) */
3864                 DECL_TRIE_TYPE(scan);
3865
3866                 /* what trie are we using right now */
3867                 reg_trie_data * const trie
3868                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3869                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3870                 U32 state = trie->startstate;
3871
3872                 if (   trie->bitmap
3873                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3874                 {
3875                     if (trie->states[ state ].wordnum) {
3876                          DEBUG_EXECUTE_r(
3877                             PerlIO_printf(Perl_debug_log,
3878                                           "%*s  %smatched empty string...%s\n",
3879                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3880                         );
3881                         if (!trie->jump)
3882                             break;
3883                     } else {
3884                         DEBUG_EXECUTE_r(
3885                             PerlIO_printf(Perl_debug_log,
3886                                           "%*s  %sfailed to match trie start class...%s\n",
3887                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3888                         );
3889                         sayNO_SILENT;
3890                    }
3891                 }
3892
3893             {
3894                 U8 *uc = ( U8* )locinput;
3895
3896                 STRLEN len = 0;
3897                 STRLEN foldlen = 0;
3898                 U8 *uscan = (U8*)NULL;
3899                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3900                 U32 charcount = 0; /* how many input chars we have matched */
3901                 U32 accepted = 0; /* have we seen any accepting states? */
3902
3903                 ST.jump = trie->jump;
3904                 ST.me = scan;
3905                 ST.firstpos = NULL;
3906                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3907                 ST.nextword = 0;
3908
3909                 /* fully traverse the TRIE; note the position of the
3910                    shortest accept state and the wordnum of the longest
3911                    accept state */
3912
3913                 while ( state && uc <= (U8*)(reginfo->strend) ) {
3914                     U32 base = trie->states[ state ].trans.base;
3915                     UV uvc = 0;
3916                     U16 charid = 0;
3917                     U16 wordnum;
3918                     wordnum = trie->states[ state ].wordnum;
3919
3920                     if (wordnum) { /* it's an accept state */
3921                         if (!accepted) {
3922                             accepted = 1;
3923                             /* record first match position */
3924                             if (ST.longfold) {
3925                                 ST.firstpos = (U8*)locinput;
3926                                 ST.firstchars = 0;
3927                             }
3928                             else {
3929                                 ST.firstpos = uc;
3930                                 ST.firstchars = charcount;
3931                             }
3932                         }
3933                         if (!ST.nextword || wordnum < ST.nextword)
3934                             ST.nextword = wordnum;
3935                         ST.topword = wordnum;
3936                     }
3937
3938                     DEBUG_TRIE_EXECUTE_r({
3939                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3940                                 PerlIO_printf( Perl_debug_log,
3941                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3942                                     2+depth * 2, "", PL_colors[4],
3943                                     (UV)state, (accepted ? 'Y' : 'N'));
3944                     });
3945
3946                     /* read a char and goto next state */
3947                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3948                         I32 offset;
3949                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3950                                              uscan, len, uvc, charid, foldlen,
3951                                              foldbuf, uniflags);
3952                         charcount++;
3953                         if (foldlen>0)
3954                             ST.longfold = TRUE;
3955                         if (charid &&
3956                              ( ((offset =
3957                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3958
3959                              && ((U32)offset < trie->lasttrans)
3960                              && trie->trans[offset].check == state)
3961                         {
3962                             state = trie->trans[offset].next;
3963                         }
3964                         else {
3965                             state = 0;
3966                         }
3967                         uc += len;
3968
3969                     }
3970                     else {
3971                         state = 0;
3972                     }
3973                     DEBUG_TRIE_EXECUTE_r(
3974                         PerlIO_printf( Perl_debug_log,
3975                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3976                             charid, uvc, (UV)state, PL_colors[5] );
3977                     );
3978                 }
3979                 if (!accepted)
3980                    sayNO;
3981
3982                 /* calculate total number of accept states */
3983                 {
3984                     U16 w = ST.topword;
3985                     accepted = 0;
3986                     while (w) {
3987                         w = trie->wordinfo[w].prev;
3988                         accepted++;
3989                     }
3990                     ST.accepted = accepted;
3991                 }
3992
3993                 DEBUG_EXECUTE_r(
3994                     PerlIO_printf( Perl_debug_log,
3995                         "%*s  %sgot %"IVdf" possible matches%s\n",
3996                         REPORT_CODE_OFF + depth * 2, "",
3997                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3998                 );
3999                 goto trie_first_try; /* jump into the fail handler */
4000             }}
4001             assert(0); /* NOTREACHED */
4002
4003         case TRIE_next_fail: /* we failed - try next alternative */
4004         {
4005             U8 *uc;
4006             if ( ST.jump) {
4007                 REGCP_UNWIND(ST.cp);
4008                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4009             }
4010             if (!--ST.accepted) {
4011                 DEBUG_EXECUTE_r({
4012                     PerlIO_printf( Perl_debug_log,
4013                         "%*s  %sTRIE failed...%s\n",
4014                         REPORT_CODE_OFF+depth*2, "",
4015                         PL_colors[4],
4016                         PL_colors[5] );
4017                 });
4018                 sayNO_SILENT;
4019             }
4020             {
4021                 /* Find next-highest word to process.  Note that this code
4022                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4023                 U16 min = 0;
4024                 U16 word;
4025                 U16 const nextword = ST.nextword;
4026                 reg_trie_wordinfo * const wordinfo
4027                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4028                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4029                     if (word > nextword && (!min || word < min))
4030                         min = word;
4031                 }
4032                 ST.nextword = min;
4033             }
4034
4035           trie_first_try:
4036             if (do_cutgroup) {
4037                 do_cutgroup = 0;
4038                 no_final = 0;
4039             }
4040
4041             if ( ST.jump) {
4042                 ST.lastparen = rex->lastparen;
4043                 ST.lastcloseparen = rex->lastcloseparen;
4044                 REGCP_SET(ST.cp);
4045             }
4046
4047             /* find start char of end of current word */
4048             {
4049                 U32 chars; /* how many chars to skip */
4050                 reg_trie_data * const trie
4051                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4052
4053                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4054                             >=  ST.firstchars);
4055                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4056                             - ST.firstchars;
4057                 uc = ST.firstpos;
4058
4059                 if (ST.longfold) {
4060                     /* the hard option - fold each char in turn and find
4061                      * its folded length (which may be different */
4062                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4063                     STRLEN foldlen;
4064                     STRLEN len;
4065                     UV uvc;
4066                     U8 *uscan;
4067
4068                     while (chars) {
4069                         if (utf8_target) {
4070                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4071                                                     uniflags);
4072                             uc += len;
4073                         }
4074                         else {
4075                             uvc = *uc;
4076                             uc++;
4077                         }
4078                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4079                         uscan = foldbuf;
4080                         while (foldlen) {
4081                             if (!--chars)
4082                                 break;
4083                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4084                                             uniflags);
4085                             uscan += len;
4086                             foldlen -= len;
4087                         }
4088                     }
4089                 }
4090                 else {
4091                     if (utf8_target)
4092                         while (chars--)
4093                             uc += UTF8SKIP(uc);
4094                     else
4095                         uc += chars;
4096                 }
4097             }
4098
4099             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4100                             ? ST.jump[ST.nextword]
4101                             : NEXT_OFF(ST.me));
4102
4103             DEBUG_EXECUTE_r({
4104                 PerlIO_printf( Perl_debug_log,
4105                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4106                     REPORT_CODE_OFF+depth*2, "",
4107                     PL_colors[4],
4108                     ST.nextword,
4109                     PL_colors[5]
4110                     );
4111             });
4112
4113             if (ST.accepted > 1 || has_cutgroup) {
4114                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4115                 assert(0); /* NOTREACHED */
4116             }
4117             /* only one choice left - just continue */
4118             DEBUG_EXECUTE_r({
4119                 AV *const trie_words
4120                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4121                 SV ** const tmp = av_fetch( trie_words,
4122                     ST.nextword-1, 0 );
4123                 SV *sv= tmp ? sv_newmortal() : NULL;
4124
4125                 PerlIO_printf( Perl_debug_log,
4126                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4127                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4128                     ST.nextword,
4129                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4130                             PL_colors[0], PL_colors[1],
4131                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4132                         )
4133                     : "not compiled under -Dr",
4134                     PL_colors[5] );
4135             });
4136
4137             locinput = (char*)uc;
4138             continue; /* execute rest of RE */
4139             assert(0); /* NOTREACHED */
4140         }
4141 #undef  ST
4142
4143         case EXACT: {            /*  /abc/        */
4144             char *s = STRING(scan);
4145             ln = STR_LEN(scan);
4146             if (utf8_target != is_utf8_pat) {
4147                 /* The target and the pattern have differing utf8ness. */
4148                 char *l = locinput;
4149                 const char * const e = s + ln;
4150
4151                 if (utf8_target) {
4152                     /* The target is utf8, the pattern is not utf8.
4153                      * Above-Latin1 code points can't match the pattern;
4154                      * invariants match exactly, and the other Latin1 ones need
4155                      * to be downgraded to a single byte in order to do the
4156                      * comparison.  (If we could be confident that the target
4157                      * is not malformed, this could be refactored to have fewer
4158                      * tests by just assuming that if the first bytes match, it
4159                      * is an invariant, but there are tests in the test suite
4160                      * dealing with (??{...}) which violate this) */
4161                     while (s < e) {
4162                         if (l >= reginfo->strend
4163                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4164                         {
4165                             sayNO;
4166                         }
4167                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4168                             if (*l != *s) {
4169                                 sayNO;
4170                             }
4171                             l++;
4172                         }
4173                         else {
4174                             if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4175                             {
4176                                 sayNO;
4177                             }
4178                             l += 2;
4179                         }
4180                         s++;
4181                     }
4182                 }
4183                 else {
4184                     /* The target is not utf8, the pattern is utf8. */
4185                     while (s < e) {
4186                         if (l >= reginfo->strend
4187                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4188                         {
4189                             sayNO;
4190                         }
4191                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4192                             if (*s != *l) {
4193                                 sayNO;
4194                             }
4195                             s++;
4196                         }
4197                         else {
4198                             if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4199                             {
4200                                 sayNO;
4201                             }
4202                             s += 2;
4203                         }
4204                         l++;
4205                     }
4206                 }
4207                 locinput = l;
4208             }
4209             else {
4210                 /* The target and the pattern have the same utf8ness. */
4211                 /* Inline the first character, for speed. */
4212                 if (reginfo->strend - locinput < ln
4213                     || UCHARAT(s) != nextchr
4214                     || (ln > 1 && memNE(s, locinput, ln)))
4215                 {
4216                     sayNO;
4217                 }
4218                 locinput += ln;
4219             }
4220             break;
4221             }
4222
4223         case EXACTFL: {          /*  /abc/il      */
4224             re_fold_t folder;
4225             const U8 * fold_array;
4226             const char * s;
4227             U32 fold_utf8_flags;
4228
4229             RX_MATCH_TAINTED_on(reginfo->prog);
4230             folder = foldEQ_locale;
4231             fold_array = PL_fold_locale;
4232             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4233             goto do_exactf;
4234
4235         case EXACTFU_SS:         /*  /\x{df}/iu   */
4236         case EXACTFU:            /*  /abc/iu      */
4237             folder = foldEQ_latin1;
4238             fold_array = PL_fold_latin1;
4239             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4240             goto do_exactf;
4241
4242         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4243                                    patterns */
4244             assert(! is_utf8_pat);
4245             /* FALL THROUGH */
4246         case EXACTFA:            /*  /abc/iaa     */
4247             folder = foldEQ_latin1;
4248             fold_array = PL_fold_latin1;
4249             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4250             goto do_exactf;
4251
4252         case EXACTF:             /*  /abc/i    This node only generated for
4253                                                non-utf8 patterns */
4254             assert(! is_utf8_pat);
4255             folder = foldEQ;
4256             fold_array = PL_fold;
4257             fold_utf8_flags = 0;
4258
4259           do_exactf:
4260             s = STRING(scan);
4261             ln = STR_LEN(scan);
4262
4263             if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4264               /* Either target or the pattern are utf8, or has the issue where
4265                * the fold lengths may differ. */
4266                 const char * const l = locinput;
4267                 char *e = reginfo->strend;
4268
4269                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4270                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4271                 {
4272                     sayNO;
4273                 }
4274                 locinput = e;
4275                 break;
4276             }
4277
4278             /* Neither the target nor the pattern are utf8 */
4279             if (UCHARAT(s) != nextchr
4280                 && !NEXTCHR_IS_EOS
4281                 && UCHARAT(s) != fold_array[nextchr])
4282             {
4283                 sayNO;
4284             }
4285             if (reginfo->strend - locinput < ln)
4286                 sayNO;
4287             if (ln > 1 && ! folder(s, locinput, ln))
4288                 sayNO;
4289             locinput += ln;
4290             break;
4291         }
4292
4293         /* XXX Could improve efficiency by separating these all out using a
4294          * macro or in-line function.  At that point regcomp.c would no longer
4295          * have to set the FLAGS fields of these */
4296         case BOUNDL:  /*  /\b/l  */
4297         case NBOUNDL: /*  /\B/l  */
4298             RX_MATCH_TAINTED_on(reginfo->prog);
4299             /* FALL THROUGH */
4300         case BOUND:   /*  /\b/   */
4301         case BOUNDU:  /*  /\b/u  */
4302         case BOUNDA:  /*  /\b/a  */
4303         case NBOUND:  /*  /\B/   */
4304         case NBOUNDU: /*  /\B/u  */
4305         case NBOUNDA: /*  /\B/a  */
4306             /* was last char in word? */
4307             if (utf8_target
4308                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4309                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4310             {
4311                 if (locinput == reginfo->strbeg)
4312                     ln = '\n';
4313                 else {
4314                     const U8 * const r =
4315                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4316
4317                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4318                 }
4319                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4320                     ln = isWORDCHAR_uni(ln);
4321                     if (NEXTCHR_IS_EOS)
4322                         n = 0;
4323                     else {
4324                         LOAD_UTF8_CHARCLASS_ALNUM();
4325                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4326                                                                 utf8_target);
4327                     }
4328                 }
4329                 else {
4330                     ln = isWORDCHAR_LC_uvchr(ln);
4331                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4332                 }
4333             }
4334             else {
4335
4336                 /* Here the string isn't utf8, or is utf8 and only ascii
4337                  * characters are to match \w.  In the latter case looking at
4338                  * the byte just prior to the current one may be just the final
4339                  * byte of a multi-byte character.  This is ok.  There are two
4340                  * cases:
4341                  * 1) it is a single byte character, and then the test is doing
4342                  *      just what it's supposed to.
4343                  * 2) it is a multi-byte character, in which case the final
4344                  *      byte is never mistakable for ASCII, and so the test
4345                  *      will say it is not a word character, which is the
4346                  *      correct answer. */
4347                 ln = (locinput != reginfo->strbeg) ?
4348                     UCHARAT(locinput - 1) : '\n';
4349                 switch (FLAGS(scan)) {
4350                     case REGEX_UNICODE_CHARSET:
4351                         ln = isWORDCHAR_L1(ln);
4352                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4353                         break;
4354                     case REGEX_LOCALE_CHARSET:
4355                         ln = isWORDCHAR_LC(ln);
4356                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4357                         break;
4358                     case REGEX_DEPENDS_CHARSET:
4359                         ln = isWORDCHAR(ln);
4360                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4361                         break;
4362                     case REGEX_ASCII_RESTRICTED_CHARSET:
4363                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4364                         ln = isWORDCHAR_A(ln);
4365                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4366                         break;
4367                     default:
4368                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4369                         break;
4370                 }
4371             }
4372             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4373              * regcomp.sym */
4374             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4375                     sayNO;
4376             break;
4377
4378         case ANYOF:  /*  /[abc]/       */
4379         case ANYOF_WARN_SUPER:
4380             if (NEXTCHR_IS_EOS)
4381                 sayNO;
4382             if (utf8_target) {
4383                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4384                     sayNO;
4385                 locinput += UTF8SKIP(locinput);
4386             }
4387             else {
4388                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4389                     sayNO;
4390                 locinput++;
4391             }
4392             break;
4393
4394         /* The argument (FLAGS) to all the POSIX node types is the class number
4395          * */
4396
4397         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4398             to_complement = 1;
4399             /* FALLTHROUGH */
4400
4401         case POSIXL:    /* \w or [:punct:] etc. under /l */
4402             if (NEXTCHR_IS_EOS)
4403                 sayNO;
4404
4405             /* The locale hasn't influenced the outcome before this, so defer
4406              * tainting until now */
4407             RX_MATCH_TAINTED_on(reginfo->prog);
4408
4409             /* Use isFOO_lc() for characters within Latin1.  (Note that
4410              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4411              * wouldn't be invariant) */
4412             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4413                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4414                     sayNO;
4415                 }
4416             }
4417             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4418                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4419                                            (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4420                                                             *(locinput + 1))))))
4421                 {
4422                     sayNO;
4423                 }
4424             }
4425             else { /* Here, must be an above Latin-1 code point */
4426                 goto utf8_posix_not_eos;
4427             }
4428
4429             /* Here, must be utf8 */
4430             locinput += UTF8SKIP(locinput);
4431             break;
4432
4433         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4434             to_complement = 1;
4435             /* FALLTHROUGH */
4436
4437         case POSIXD:    /* \w or [:punct:] etc. under /d */
4438             if (utf8_target) {
4439                 goto utf8_posix;
4440             }
4441             goto posixa;
4442
4443         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4444
4445             if (NEXTCHR_IS_EOS) {
4446                 sayNO;
4447             }
4448
4449             /* All UTF-8 variants match */
4450             if (! UTF8_IS_INVARIANT(nextchr)) {
4451                 goto increment_locinput;
4452             }
4453
4454             to_complement = 1;
4455             /* FALLTHROUGH */
4456
4457         case POSIXA:    /* \w or [:punct:] etc. under /a */
4458
4459           posixa:
4460             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4461              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4462              * character is a single byte */
4463
4464             if (NEXTCHR_IS_EOS
4465                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4466                                                             FLAGS(scan)))))
4467             {
4468                 sayNO;
4469             }
4470
4471             /* Here we are either not in utf8, or we matched a utf8-invariant,
4472              * so the next char is the next byte */
4473             locinput++;
4474             break;
4475
4476         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4477             to_complement = 1;
4478             /* FALLTHROUGH */
4479
4480         case POSIXU:    /* \w or [:punct:] etc. under /u */
4481           utf8_posix:
4482             if (NEXTCHR_IS_EOS) {
4483                 sayNO;
4484             }
4485           utf8_posix_not_eos:
4486
4487             /* Use _generic_isCC() for characters within Latin1.  (Note that
4488              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4489              * wouldn't be invariant) */
4490             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4491                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4492                                                            FLAGS(scan)))))
4493                 {
4494                     sayNO;
4495                 }
4496                 locinput++;
4497             }
4498             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4499                 if (! (to_complement
4500                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4501                                                                *(locinput + 1)),
4502                                              FLAGS(scan)))))
4503                 {
4504                     sayNO;
4505                 }
4506                 locinput += 2;
4507             }
4508             else {  /* Handle above Latin-1 code points */
4509                 classnum = (_char_class_number) FLAGS(scan);
4510                 if (classnum < _FIRST_NON_SWASH_CC) {
4511
4512                     /* Here, uses a swash to find such code points.  Load if if
4513                      * not done already */
4514                     if (! PL_utf8_swash_ptrs[classnum]) {
4515                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4516                         PL_utf8_swash_ptrs[classnum]
4517                                 = _core_swash_init("utf8",
4518                                         swash_property_names[classnum],
4519                                         &PL_sv_undef, 1, 0, NULL, &flags);
4520                     }
4521                     if (! (to_complement
4522                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4523                                                (U8 *) locinput, TRUE))))
4524                     {
4525                         sayNO;
4526                     }
4527                 }
4528                 else {  /* Here, uses macros to find above Latin-1 code points */
4529                     switch (classnum) {
4530                         case _CC_ENUM_SPACE:    /* XXX would require separate
4531                                                    code if we revert the change
4532                                                    of \v matching this */
4533                         case _CC_ENUM_PSXSPC:
4534                             if (! (to_complement
4535                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4536                             {
4537                                 sayNO;
4538                             }
4539                             break;
4540                         case _CC_ENUM_BLANK:
4541                             if (! (to_complement
4542                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4543                             {
4544                                 sayNO;
4545                             }
4546                             break;
4547                         case _CC_ENUM_XDIGIT:
4548                             if (! (to_complement
4549                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4550                             {
4551                                 sayNO;
4552                             }
4553                             break;
4554                         case _CC_ENUM_VERTSPACE:
4555                             if (! (to_complement
4556                                             ^ cBOOL(is_VERTWS_high(locinput))))
4557                             {
4558                                 sayNO;
4559                             }
4560                             break;
4561                         default:    /* The rest, e.g. [:cntrl:], can't match
4562                                        above Latin1 */
4563                             if (! to_complement) {
4564                                 sayNO;
4565                             }
4566                             break;
4567                     }
4568                 }
4569                 locinput += UTF8SKIP(locinput);
4570             }
4571             break;
4572
4573         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4574                        a Unicode extended Grapheme Cluster */
4575             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4576               extended Grapheme Cluster is:
4577
4578             CR LF
4579             | Prepend* Begin Extend*
4580             | .
4581
4582             Begin is:           ( Special_Begin | ! Control )
4583             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4584             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4585             Control is:         [ GCB_Control | CR | LF ]
4586             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4587
4588                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4589                we can rewrite
4590
4591                    Begin is ( Regular_Begin + Special Begin )
4592
4593                It turns out that 98.4% of all Unicode code points match
4594                Regular_Begin.  Doing it this way eliminates a table match in
4595                the previous implementation for almost all Unicode code points.
4596
4597                There is a subtlety with Prepend* which showed up in testing.
4598                Note that the Begin, and only the Begin is required in:
4599                 | Prepend* Begin Extend*
4600                Also, Begin contains '! Control'.  A Prepend must be a
4601                '!  Control', which means it must also be a Begin.  What it
4602                comes down to is that if we match Prepend* and then find no
4603                suitable Begin afterwards, that if we backtrack the last
4604                Prepend, that one will be a suitable Begin.
4605             */
4606
4607             if (NEXTCHR_IS_EOS)
4608                 sayNO;
4609             if  (! utf8_target) {
4610
4611                 /* Match either CR LF  or '.', as all the other possibilities
4612                  * require utf8 */
4613                 locinput++;         /* Match the . or CR */
4614                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4615                                        match the LF */
4616                     && locinput < reginfo->strend
4617                     && UCHARAT(locinput) == '\n')
4618                 {
4619                     locinput++;
4620                 }
4621             }
4622             else {
4623
4624                 /* Utf8: See if is ( CR LF ); already know that locinput <
4625                  * reginfo->strend, so locinput+1 is in bounds */
4626                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4627                      && UCHARAT(locinput + 1) == '\n')
4628                 {
4629                     locinput += 2;
4630                 }
4631                 else {
4632                     STRLEN len;
4633
4634                     /* In case have to backtrack to beginning, then match '.' */
4635                     char *starting = locinput;
4636
4637                     /* In case have to backtrack the last prepend */
4638                     char *previous_prepend = NULL;
4639
4640                     LOAD_UTF8_CHARCLASS_GCB();
4641
4642                     /* Match (prepend)*   */
4643                     while (locinput < reginfo->strend
4644                            && (len = is_GCB_Prepend_utf8(locinput)))
4645                     {
4646                         previous_prepend = locinput;
4647                         locinput += len;
4648                     }
4649
4650                     /* As noted above, if we matched a prepend character, but
4651                      * the next thing won't match, back off the last prepend we
4652                      * matched, as it is guaranteed to match the begin */
4653                     if (previous_prepend
4654                         && (locinput >=  reginfo->strend
4655                             || (! swash_fetch(PL_utf8_X_regular_begin,
4656                                              (U8*)locinput, utf8_target)
4657                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4658                         )
4659                     {
4660                         locinput = previous_prepend;
4661                     }
4662
4663                     /* Note that here we know reginfo->strend > locinput, as we
4664                      * tested that upon input to this switch case, and if we
4665                      * moved locinput forward, we tested the result just above
4666                      * and it either passed, or we backed off so that it will
4667                      * now pass */
4668                     if (swash_fetch(PL_utf8_X_regular_begin,
4669                                     (U8*)locinput, utf8_target)) {
4670                         locinput += UTF8SKIP(locinput);
4671                     }
4672                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4673
4674                         /* Here did not match the required 'Begin' in the
4675                          * second term.  So just match the very first
4676                          * character, the '.' of the final term of the regex */
4677                         locinput = starting + UTF8SKIP(starting);
4678                         goto exit_utf8;
4679                     } else {
4680
4681                         /* Here is a special begin.  It can be composed of
4682                          * several individual characters.  One possibility is
4683                          * RI+ */
4684                         if ((len = is_GCB_RI_utf8(locinput))) {
4685                             locinput += len;
4686                             while (locinput < reginfo->strend
4687                                    && (len = is_GCB_RI_utf8(locinput)))
4688                             {
4689                                 locinput += len;
4690                             }
4691                         } else if ((len = is_GCB_T_utf8(locinput))) {
4692                             /* Another possibility is T+ */
4693                             locinput += len;
4694                             while (locinput < reginfo->strend
4695                                 && (len = is_GCB_T_utf8(locinput)))
4696                             {
4697                                 locinput += len;
4698                             }
4699                         } else {
4700
4701                             /* Here, neither RI+ nor T+; must be some other
4702                              * Hangul.  That means it is one of the others: L,
4703                              * LV, LVT or V, and matches:
4704                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4705
4706                             /* Match L*           */
4707                             while (locinput < reginfo->strend
4708                                    && (len = is_GCB_L_utf8(locinput)))
4709                             {
4710                                 locinput += len;
4711                             }
4712
4713                             /* Here, have exhausted L*.  If the next character
4714                              * is not an LV, LVT nor V, it means we had to have
4715                              * at least one L, so matches L+ in the original
4716                              * equation, we have a complete hangul syllable.
4717                              * Are done. */
4718
4719                             if (locinput < reginfo->strend
4720                                 && is_GCB_LV_LVT_V_utf8(locinput))
4721                             {
4722                                 /* Otherwise keep going.  Must be LV, LVT or V.
4723                                  * See if LVT, by first ruling out V, then LV */
4724                                 if (! is_GCB_V_utf8(locinput)
4725                                         /* All but every TCount one is LV */
4726                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4727                                                                          NULL)
4728                                                                         - SBASE)
4729                                         % TCount != 0)
4730                                 {
4731                                     locinput += UTF8SKIP(locinput);
4732                                 } else {
4733
4734                                     /* Must be  V or LV.  Take it, then match
4735                                      * V*     */
4736                                     locinput += UTF8SKIP(locinput);
4737                                     while (locinput < reginfo->strend
4738                                            && (len = is_GCB_V_utf8(locinput)))
4739                                     {
4740                                         locinput += len;
4741                                     }
4742                                 }
4743
4744                                 /* And any of LV, LVT, or V can be followed
4745                                  * by T*            */
4746                                 while (locinput < reginfo->strend
4747                                        && (len = is_GCB_T_utf8(locinput)))
4748                                 {
4749                                     locinput += len;
4750                                 }
4751                             }
4752                         }
4753                     }
4754
4755                     /* Match any extender */
4756                     while (locinput < reginfo->strend
4757                             && swash_fetch(PL_utf8_X_extend,
4758                                             (U8*)locinput, utf8_target))
4759                     {
4760                         locinput += UTF8SKIP(locinput);
4761                     }
4762                 }
4763             exit_utf8:
4764                 if (locinput > reginfo->strend) sayNO;
4765             }
4766             break;
4767
4768         case NREFFL:  /*  /\g{name}/il  */
4769         {   /* The capture buffer cases.  The ones beginning with N for the
4770                named buffers just convert to the equivalent numbered and
4771                pretend they were called as the corresponding numbered buffer
4772                op.  */
4773             /* don't initialize these in the declaration, it makes C++
4774                unhappy */
4775             const char *s;
4776             char type;
4777             re_fold_t folder;
4778             const U8 *fold_array;
4779             UV utf8_fold_flags;
4780
4781             RX_MATCH_TAINTED_on(reginfo->prog);
4782             folder = foldEQ_locale;
4783             fold_array = PL_fold_locale;
4784             type = REFFL;
4785             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4786             goto do_nref;
4787
4788         case NREFFA:  /*  /\g{name}/iaa  */
4789             folder = foldEQ_latin1;
4790             fold_array = PL_fold_latin1;
4791             type = REFFA;
4792             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4793             goto do_nref;
4794
4795         case NREFFU:  /*  /\g{name}/iu  */
4796             folder = foldEQ_latin1;
4797             fold_array = PL_fold_latin1;
4798             type = REFFU;
4799             utf8_fold_flags = 0;
4800             goto do_nref;
4801
4802         case NREFF:  /*  /\g{name}/i  */
4803             folder = foldEQ;
4804             fold_array = PL_fold;
4805             type = REFF;
4806             utf8_fold_flags = 0;
4807             goto do_nref;
4808
4809         case NREF:  /*  /\g{name}/   */
4810             type = REF;
4811             folder = NULL;
4812             fold_array = NULL;
4813             utf8_fold_flags = 0;
4814           do_nref:
4815
4816             /* For the named back references, find the corresponding buffer
4817              * number */
4818             n = reg_check_named_buff_matched(rex,scan);
4819
4820             if ( ! n ) {
4821                 sayNO;
4822             }
4823             goto do_nref_ref_common;
4824
4825         case REFFL:  /*  /\1/il  */
4826             RX_MATCH_TAINTED_on(reginfo->prog);
4827             folder = foldEQ_locale;
4828             fold_array = PL_fold_locale;
4829             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4830             goto do_ref;
4831
4832         case REFFA:  /*  /\1/iaa  */
4833             folder = foldEQ_latin1;
4834             fold_array = PL_fold_latin1;
4835             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4836             goto do_ref;
4837
4838         case REFFU:  /*  /\1/iu  */
4839             folder = foldEQ_latin1;
4840             fold_array = PL_fold_latin1;
4841             utf8_fold_flags = 0;
4842             goto do_ref;
4843
4844         case REFF:  /*  /\1/i  */
4845             folder = foldEQ;
4846             fold_array = PL_fold;
4847             utf8_fold_flags = 0;
4848             goto do_ref;
4849
4850         case REF:  /*  /\1/    */
4851             folder = NULL;
4852             fold_array = NULL;
4853             utf8_fold_flags = 0;
4854
4855           do_ref:
4856             type = OP(scan);
4857             n = ARG(scan);  /* which paren pair */
4858
4859           do_nref_ref_common:
4860             ln = rex->offs[n].start;
4861             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4862             if (rex->lastparen < n || ln == -1)
4863                 sayNO;                  /* Do not match unless seen CLOSEn. */
4864             if (ln == rex->offs[n].end)
4865                 break;
4866
4867             s = reginfo->strbeg + ln;
4868             if (type != REF     /* REF can do byte comparison */
4869                 && (utf8_target || type == REFFU))
4870             { /* XXX handle REFFL better */
4871                 char * limit = reginfo->strend;
4872
4873                 /* This call case insensitively compares the entire buffer
4874                     * at s, with the current input starting at locinput, but
4875                     * not going off the end given by reginfo->strend, and
4876                     * returns in <limit> upon success, how much of the
4877                     * current input was matched */
4878                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4879                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4880                 {
4881                     sayNO;
4882                 }
4883                 locinput = limit;
4884                 break;
4885             }
4886
4887             /* Not utf8:  Inline the first character, for speed. */
4888             if (!NEXTCHR_IS_EOS &&
4889                 UCHARAT(s) != nextchr &&
4890                 (type == REF ||
4891                  UCHARAT(s) != fold_array[nextchr]))
4892                 sayNO;
4893             ln = rex->offs[n].end - ln;
4894             if (locinput + ln > reginfo->strend)
4895                 sayNO;
4896             if (ln > 1 && (type == REF
4897                            ? memNE(s, locinput, ln)
4898                            : ! folder(s, locinput, ln)))
4899                 sayNO;
4900             locinput += ln;
4901             break;
4902         }
4903
4904         case NOTHING: /* null op; e.g. the 'nothing' following
4905                        * the '*' in m{(a+|b)*}' */
4906             break;
4907         case TAIL: /* placeholder while compiling (A|B|C) */
4908             break;
4909
4910         case BACK: /* ??? doesn't appear to be used ??? */
4911             break;
4912
4913 #undef  ST
4914 #define ST st->u.eval
4915         {
4916             SV *ret;
4917             REGEXP *re_sv;
4918             regexp *re;
4919             regexp_internal *rei;
4920             regnode *startpoint;
4921
4922         case GOSTART: /*  (?R)  */
4923         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4924             if (cur_eval && cur_eval->locinput==locinput) {
4925                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4926                     Perl_croak(aTHX_ "Infinite recursion in regex");
4927                 if ( ++nochange_depth > max_nochange_depth )
4928                     Perl_croak(aTHX_
4929                         "Pattern subroutine nesting without pos change"
4930                         " exceeded limit in regex");
4931             } else {
4932                 nochange_depth = 0;
4933             }
4934             re_sv = rex_sv;
4935             re = rex;
4936             rei = rexi;
4937             if (OP(scan)==GOSUB) {
4938                 startpoint = scan + ARG2L(scan);
4939                 ST.close_paren = ARG(scan);
4940             } else {
4941                 startpoint = rei->program+1;
4942                 ST.close_paren = 0;
4943             }
4944             goto eval_recurse_doit;
4945             assert(0); /* NOTREACHED */
4946
4947         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4948             if (cur_eval && cur_eval->locinput==locinput) {
4949                 if ( ++nochange_depth > max_nochange_depth )
4950                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4951             } else {
4952                 nochange_depth = 0;
4953             }
4954             {
4955                 /* execute the code in the {...} */
4956
4957                 dSP;
4958                 IV before;
4959                 OP * const oop = PL_op;
4960                 COP * const ocurcop = PL_curcop;
4961                 OP *nop;
4962                 CV *newcv;
4963
4964                 /* save *all* paren positions */
4965                 regcppush(rex, 0, maxopenparen);
4966                 REGCP_SET(runops_cp);
4967
4968                 if (!caller_cv)
4969                     caller_cv = find_runcv(NULL);
4970
4971                 n = ARG(scan);
4972
4973                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4974                     newcv = (ReANY(
4975                                                 (REGEXP*)(rexi->data->data[n])
4976                                             ))->qr_anoncv
4977                                         ;
4978                     nop = (OP*)rexi->data->data[n+1];
4979                 }
4980                 else if (rexi->data->what[n] == 'l') { /* literal code */
4981                     newcv = caller_cv;
4982                     nop = (OP*)rexi->data->data[n];
4983                     assert(CvDEPTH(newcv));
4984                 }
4985                 else {
4986                     /* literal with own CV */
4987                     assert(rexi->data->what[n] == 'L');
4988                     newcv = rex->qr_anoncv;
4989                     nop = (OP*)rexi->data->data[n];
4990                 }
4991
4992                 /* normally if we're about to execute code from the same
4993                  * CV that we used previously, we just use the existing
4994                  * CX stack entry. However, its possible that in the
4995                  * meantime we may have backtracked, popped from the save
4996                  * stack, and undone the SAVECOMPPAD(s) associated with
4997                  * PUSH_MULTICALL; in which case PL_comppad no longer
4998                  * points to newcv's pad. */
4999                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5000                 {
5001                     U8 flags = (CXp_SUB_RE |
5002                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5003                     if (last_pushed_cv) {
5004                         CHANGE_MULTICALL_FLAGS(newcv, flags);
5005                     }
5006                     else {
5007                         PUSH_MULTICALL_FLAGS(newcv, flags);
5008                     }
5009                     last_pushed_cv = newcv;
5010                 }
5011                 else {
5012                     /* these assignments are just to silence compiler
5013                      * warnings */
5014                     multicall_cop = NULL;
5015                     newsp = NULL;
5016                 }
5017                 last_pad = PL_comppad;
5018
5019                 /* the initial nextstate you would normally execute
5020                  * at the start of an eval (which would cause error
5021                  * messages to come from the eval), may be optimised
5022                  * away from the execution path in the regex code blocks;
5023                  * so manually set PL_curcop to it initially */
5024                 {
5025                     OP *o = cUNOPx(nop)->op_first;
5026                     assert(o->op_type == OP_NULL);
5027                     if (o->op_targ == OP_SCOPE) {
5028                         o = cUNOPo->op_first;
5029                     }
5030                     else {
5031                         assert(o->op_targ == OP_LEAVE);
5032                         o = cUNOPo->op_first;
5033                         assert(o->op_type == OP_ENTER);
5034                         o = o->op_sibling;
5035                     }
5036
5037                     if (o->op_type != OP_STUB) {
5038                         assert(    o->op_type == OP_NEXTSTATE
5039                                 || o->op_type == OP_DBSTATE
5040                                 || (o->op_type == OP_NULL
5041                                     &&  (  o->op_targ == OP_NEXTSTATE
5042                                         || o->op_targ == OP_DBSTATE
5043                                         )
5044                                     )
5045                         );
5046                         PL_curcop = (COP*)o;
5047                     }
5048                 }
5049                 nop = nop->op_next;
5050
5051                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5052                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5053
5054                 rex->offs[0].end = locinput - reginfo->strbeg;
5055                 if (reginfo->info_aux_eval->pos_magic)
5056                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5057                                   reginfo->sv, reginfo->strbeg,
5058                                   locinput - reginfo->strbeg);
5059
5060                 if (sv_yes_mark) {
5061                     SV *sv_mrk = get_sv("REGMARK", 1);
5062                     sv_setsv(sv_mrk, sv_yes_mark);
5063                 }
5064
5065                 /* we don't use MULTICALL here as we want to call the
5066                  * first op of the block of interest, rather than the
5067                  * first op of the sub */
5068                 before = (IV)(SP-PL_stack_base);
5069                 PL_op = nop;
5070                 CALLRUNOPS(aTHX);                       /* Scalar context. */
5071                 SPAGAIN;
5072                 if ((IV)(SP-PL_stack_base) == before)
5073                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5074                 else {
5075                     ret = POPs;
5076                     PUTBACK;
5077                 }
5078
5079                 /* before restoring everything, evaluate the returned
5080                  * value, so that 'uninit' warnings don't use the wrong
5081                  * PL_op or pad. Also need to process any magic vars
5082                  * (e.g. $1) *before* parentheses are restored */
5083
5084                 PL_op = NULL;
5085
5086                 re_sv = NULL;
5087                 if (logical == 0)        /*   (?{})/   */
5088                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5089                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5090                     sw = cBOOL(SvTRUE(ret));
5091                     logical = 0;
5092                 }
5093                 else {                   /*  /(??{})  */
5094                     /*  if its overloaded, let the regex compiler handle
5095                      *  it; otherwise extract regex, or stringify  */
5096                     if (!SvAMAGIC(ret)) {
5097                         SV *sv = ret;
5098                         if (SvROK(sv))
5099                             sv = SvRV(sv);
5100                         if (SvTYPE(sv) == SVt_REGEXP)
5101                             re_sv = (REGEXP*) sv;
5102                         else if (SvSMAGICAL(sv)) {
5103                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5104                             if (mg)
5105                                 re_sv = (REGEXP *) mg->mg_obj;
5106                         }
5107
5108                         /* force any magic, undef warnings here */
5109                         if (!re_sv) {
5110                             ret = sv_mortalcopy(ret);
5111                             (void) SvPV_force_nolen(ret);
5112                         }
5113                     }
5114
5115                 }
5116
5117                 /* *** Note that at this point we don't restore
5118                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5119                  * be used again soon. This is safe as long as nothing
5120                  * in the regexp code uses the pad ! */
5121                 PL_op = oop;
5122                 PL_curcop = ocurcop;
5123                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5124                 PL_curpm = PL_reg_curpm;
5125
5126                 if (logical != 2)
5127                     break;
5128             }
5129
5130                 /* only /(??{})/  from now on */
5131                 logical = 0;
5132                 {
5133                     /* extract RE object from returned value; compiling if
5134                      * necessary */
5135
5136                     if (re_sv) {
5137                         re_sv = reg_temp_copy(NULL, re_sv);
5138                     }
5139                     else {
5140                         U32 pm_flags = 0;
5141
5142                         if (SvUTF8(ret) && IN_BYTES) {
5143                             /* In use 'bytes': make a copy of the octet
5144                              * sequence, but without the flag on */
5145                             STRLEN len;
5146                             const char *const p = SvPV(ret, len);
5147                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5148                         }
5149                         if (rex->intflags & PREGf_USE_RE_EVAL)
5150                             pm_flags |= PMf_USE_RE_EVAL;
5151
5152                         /* if we got here, it should be an engine which
5153                          * supports compiling code blocks and stuff */
5154                         assert(rex->engine && rex->engine->op_comp);
5155                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5156                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5157                                     rex->engine, NULL, NULL,
5158                                     /* copy /msix etc to inner pattern */
5159                                     scan->flags,
5160                                     pm_flags);
5161
5162                         if (!(SvFLAGS(ret)
5163                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5164                                  | SVs_GMG))) {
5165                             /* This isn't a first class regexp. Instead, it's
5166                                caching a regexp onto an existing, Perl visible
5167                                scalar.  */
5168                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5169                         }
5170                         /* safe to do now that any $1 etc has been
5171                          * interpolated into the new pattern string and
5172                          * compiled */
5173                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5174                     }
5175                     SAVEFREESV(re_sv);
5176                     re = ReANY(re_sv);
5177                 }
5178                 RXp_MATCH_COPIED_off(re);
5179                 re->subbeg = rex->subbeg;
5180                 re->sublen = rex->sublen;
5181                 re->suboffset = rex->suboffset;
5182                 re->subcoffset = rex->subcoffset;
5183                 rei = RXi_GET(re);
5184                 DEBUG_EXECUTE_r(
5185                     debug_start_match(re_sv, utf8_target, locinput,
5186                                     reginfo->strend, "Matching embedded");
5187                 );
5188                 startpoint = rei->program + 1;
5189                 ST.close_paren = 0; /* only used for GOSUB */
5190
5191         eval_recurse_doit: /* Share code with GOSUB below this line */
5192                 /* run the pattern returned from (??{...}) */
5193
5194                 /* Save *all* the positions. */
5195                 ST.cp = regcppush(rex, 0, maxopenparen);
5196                 REGCP_SET(ST.lastcp);
5197
5198                 re->lastparen = 0;
5199                 re->lastcloseparen = 0;
5200
5201                 maxopenparen = 0;
5202
5203                 /* invalidate the S-L poscache. We're now executing a
5204                  * different set of WHILEM ops (and their associated
5205                  * indexes) against the same string, so the bits in the
5206                  * cache are meaningless. Setting maxiter to zero forces
5207                  * the cache to be invalidated and zeroed before reuse.
5208                  * XXX This is too dramatic a measure. Ideally we should
5209                  * save the old cache and restore when running the outer
5210                  * pattern again */
5211                 reginfo->poscache_maxiter = 0;
5212
5213                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5214
5215                 ST.prev_rex = rex_sv;
5216                 ST.prev_curlyx = cur_curlyx;
5217                 rex_sv = re_sv;
5218                 SET_reg_curpm(rex_sv);
5219                 rex = re;
5220                 rexi = rei;
5221                 cur_curlyx = NULL;
5222                 ST.B = next;
5223                 ST.prev_eval = cur_eval;
5224                 cur_eval = st;
5225                 /* now continue from first node in postoned RE */
5226                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5227                 assert(0); /* NOTREACHED */
5228         }
5229
5230         case EVAL_AB: /* cleanup after a successful (??{A})B */
5231             /* note: this is called twice; first after popping B, then A */
5232             rex_sv = ST.prev_rex;
5233             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5234             SET_reg_curpm(rex_sv);
5235             rex = ReANY(rex_sv);
5236             rexi = RXi_GET(rex);
5237             regcpblow(ST.cp);
5238             cur_eval = ST.prev_eval;
5239             cur_curlyx = ST.prev_curlyx;
5240
5241             /* Invalidate cache. See "invalidate" comment above. */
5242             reginfo->poscache_maxiter = 0;
5243             if ( nochange_depth )
5244                 nochange_depth--;
5245             sayYES;
5246
5247
5248         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5249             /* note: this is called twice; first after popping B, then A */
5250             rex_sv = ST.prev_rex;
5251             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5252             SET_reg_curpm(rex_sv);
5253             rex = ReANY(rex_sv);
5254             rexi = RXi_GET(rex);
5255
5256             REGCP_UNWIND(ST.lastcp);
5257             regcppop(rex, &maxopenparen);
5258             cur_eval = ST.prev_eval;
5259             cur_curlyx = ST.prev_curlyx;
5260             /* Invalidate cache. See "invalidate" comment above. */
5261             reginfo->poscache_maxiter = 0;
5262             if ( nochange_depth )
5263                 nochange_depth--;
5264             sayNO_SILENT;
5265 #undef ST
5266
5267         case OPEN: /*  (  */
5268             n = ARG(scan);  /* which paren pair */
5269             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5270             if (n > maxopenparen)
5271                 maxopenparen = n;
5272             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5273                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5274                 PTR2UV(rex),
5275                 PTR2UV(rex->offs),
5276                 (UV)n,
5277                 (IV)rex->offs[n].start_tmp,
5278                 (UV)maxopenparen
5279             ));
5280             lastopen = n;
5281             break;
5282
5283 /* XXX really need to log other places start/end are set too */
5284 #define CLOSE_CAPTURE \
5285     rex->offs[n].start = rex->offs[n].start_tmp; \
5286     rex->offs[n].end = locinput - reginfo->strbeg; \
5287     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5288         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5289         PTR2UV(rex), \
5290         PTR2UV(rex->offs), \
5291         (UV)n, \
5292         (IV)rex->offs[n].start, \
5293         (IV)rex->offs[n].end \
5294     ))
5295
5296         case CLOSE:  /*  )  */
5297             n = ARG(scan);  /* which paren pair */
5298             CLOSE_CAPTURE;
5299             if (n > rex->lastparen)
5300                 rex->lastparen = n;
5301             rex->lastcloseparen = n;
5302             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5303                 goto fake_end;
5304             }
5305             break;
5306
5307         case ACCEPT:  /*  (*ACCEPT)  */
5308             if (ARG(scan)){
5309                 regnode *cursor;
5310                 for (cursor=scan;
5311                      cursor && OP(cursor)!=END;
5312                      cursor=regnext(cursor))
5313                 {
5314                     if ( OP(cursor)==CLOSE ){
5315                         n = ARG(cursor);
5316                         if ( n <= lastopen ) {
5317                             CLOSE_CAPTURE;
5318                             if (n > rex->lastparen)
5319                                 rex->lastparen = n;
5320                             rex->lastcloseparen = n;
5321                             if ( n == ARG(scan) || (cur_eval &&
5322                                 cur_eval->u.eval.close_paren == n))
5323                                 break;
5324                         }
5325                     }
5326                 }
5327             }
5328             goto fake_end;
5329             /*NOTREACHED*/
5330
5331         case GROUPP:  /*  (?(1))  */
5332             n = ARG(scan);  /* which paren pair */
5333             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5334             break;
5335
5336         case NGROUPP:  /*  (?(<name>))  */
5337             /* reg_check_named_buff_matched returns 0 for no match */
5338             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5339             break;
5340
5341         case INSUBP:   /*  (?(R))  */
5342             n = ARG(scan);
5343             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5344             break;
5345
5346         case DEFINEP:  /*  (?(DEFINE))  */
5347             sw = 0;
5348             break;
5349
5350         case IFTHEN:   /*  (?(cond)A|B)  */
5351             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5352             if (sw)
5353                 next = NEXTOPER(NEXTOPER(scan));
5354             else {
5355                 next = scan + ARG(scan);
5356                 if (OP(next) == IFTHEN) /* Fake one. */
5357                     next = NEXTOPER(NEXTOPER(next));
5358             }
5359             break;
5360
5361         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5362             logical = scan->flags;
5363             break;
5364
5365 /*******************************************************************
5366
5367 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5368 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5369 STAR/PLUS/CURLY/CURLYN are used instead.)
5370
5371 A*B is compiled as <CURLYX><A><WHILEM><B>
5372
5373 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5374 state, which contains the current count, initialised to -1. It also sets
5375 cur_curlyx to point to this state, with any previous value saved in the
5376 state block.
5377
5378 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5379 since the pattern may possibly match zero times (i.e. it's a while {} loop
5380 rather than a do {} while loop).
5381
5382 Each entry to WHILEM represents a successful match of A. The count in the
5383 CURLYX block is incremented, another WHILEM state is pushed, and execution
5384 passes to A or B depending on greediness and the current count.
5385
5386 For example, if matching against the string a1a2a3b (where the aN are
5387 substrings that match /A/), then the match progresses as follows: (the
5388 pushed states are interspersed with the bits of strings matched so far):
5389
5390     <CURLYX cnt=-1>
5391     <CURLYX cnt=0><WHILEM>
5392     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5393     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5394     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5395     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5396
5397 (Contrast this with something like CURLYM, which maintains only a single
5398 backtrack state:
5399
5400     <CURLYM cnt=0> a1
5401     a1 <CURLYM cnt=1> a2
5402     a1 a2 <CURLYM cnt=2> a3
5403     a1 a2 a3 <CURLYM cnt=3> b
5404 )
5405
5406 Each WHILEM state block marks a point to backtrack to upon partial failure
5407 of A or B, and also contains some minor state data related to that
5408 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5409 overall state, such as the count, and pointers to the A and B ops.
5410
5411 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5412 must always point to the *current* CURLYX block, the rules are:
5413
5414 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5415 and set cur_curlyx to point the new block.
5416
5417 When popping the CURLYX block after a successful or unsuccessful match,
5418 restore the previous cur_curlyx.
5419
5420 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5421 to the outer one saved in the CURLYX block.
5422
5423 When popping the WHILEM block after a successful or unsuccessful B match,
5424 restore the previous cur_curlyx.
5425
5426 Here's an example for the pattern (AI* BI)*BO
5427 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5428
5429 cur_
5430 curlyx backtrack stack
5431 ------ ---------------
5432 NULL
5433 CO     <CO prev=NULL> <WO>
5434 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5435 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5436 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5437
5438 At this point the pattern succeeds, and we work back down the stack to
5439 clean up, restoring as we go:
5440
5441 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5442 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5443 CO     <CO prev=NULL> <WO>
5444 NULL
5445
5446 *******************************************************************/
5447
5448 #define ST st->u.curlyx
5449
5450         case CURLYX:    /* start of /A*B/  (for complex A) */
5451         {
5452             /* No need to save/restore up to this paren */
5453             I32 parenfloor = scan->flags;
5454
5455             assert(next); /* keep Coverity happy */
5456             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5457                 next += ARG(next);
5458
5459             /* XXXX Probably it is better to teach regpush to support
5460                parenfloor > maxopenparen ... */
5461             if (parenfloor > (I32)rex->lastparen)
5462                 parenfloor = rex->lastparen; /* Pessimization... */
5463
5464             ST.prev_curlyx= cur_curlyx;
5465             cur_curlyx = st;
5466             ST.cp = PL_savestack_ix;
5467
5468             /* these fields contain the state of the current curly.
5469              * they are accessed by subsequent WHILEMs */
5470             ST.parenfloor = parenfloor;
5471             ST.me = scan;
5472             ST.B = next;
5473             ST.minmod = minmod;
5474             minmod = 0;
5475             ST.count = -1;      /* this will be updated by WHILEM */
5476             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5477
5478             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5479             assert(0); /* NOTREACHED */
5480         }
5481
5482         case CURLYX_end: /* just finished matching all of A*B */
5483             cur_curlyx = ST.prev_curlyx;
5484             sayYES;
5485             assert(0); /* NOTREACHED */
5486
5487         case CURLYX_end_fail: /* just failed to match all of A*B */
5488             regcpblow(ST.cp);
5489             cur_curlyx = ST.prev_curlyx;
5490             sayNO;
5491             assert(0); /* NOTREACHED */
5492
5493
5494 #undef ST
5495 #define ST st->u.whilem
5496
5497         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5498         {
5499             /* see the discussion above about CURLYX/WHILEM */
5500             I32 n;
5501             int min = ARG1(cur_curlyx->u.curlyx.me);
5502             int max = ARG2(cur_curlyx->u.curlyx.me);
5503             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5504
5505             assert(cur_curlyx); /* keep Coverity happy */
5506             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5507             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5508             ST.cache_offset = 0;
5509             ST.cache_mask = 0;
5510
5511
5512             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5513                   "%*s  whilem: matched %ld out of %d..%d\n",
5514                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5515             );
5516
5517             /* First just match a string of min A's. */
5518
5519             if (n < min) {
5520                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5521                                     maxopenparen);
5522                 cur_curlyx->u.curlyx.lastloc = locinput;
5523                 REGCP_SET(ST.lastcp);
5524
5525                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5526                 assert(0); /* NOTREACHED */
5527             }
5528
5529             /* If degenerate A matches "", assume A done. */
5530
5531             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5532                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5533                    "%*s  whilem: empty match detected, trying continuation...\n",
5534                    REPORT_CODE_OFF+depth*2, "")
5535                 );
5536                 goto do_whilem_B_max;
5537             }
5538
5539             /* super-linear cache processing.
5540              *
5541              * The idea here is that for certain types of CURLYX/WHILEM -
5542              * principally those whose upper bound is infinity (and
5543              * excluding regexes that have things like \1 and other very
5544              * non-regular expresssiony things), then if a pattern like
5545              * /....A*.../ fails and we backtrack to the WHILEM, then we
5546              * make a note that this particular WHILEM op was at string
5547              * position 47 (say) when the rest of pattern failed. Then, if
5548              * we ever find ourselves back at that WHILEM, and at string
5549              * position 47 again, we can just fail immediately rather than
5550              * running the rest of the pattern again.
5551              *
5552              * This is very handy when patterns start to go
5553              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5554              * with a combinatorial explosion of backtracking.
5555              *
5556              * The cache is implemented as a bit array, with one bit per
5557              * string byte position per WHILEM op (up to 16) - so its
5558              * between 0.25 and 2x the string size.
5559              *
5560              * To avoid allocating a poscache buffer every time, we do an
5561              * initially countdown; only after we have  executed a WHILEM
5562              * op (string-length x #WHILEMs) times do we allocate the
5563              * cache.
5564              *
5565              * The top 4 bits of scan->flags byte say how many different
5566              * relevant CURLLYX/WHILEM op pairs there are, while the
5567              * bottom 4-bits is the identifying index number of this
5568              * WHILEM.
5569              */
5570
5571             if (scan->flags) {
5572
5573                 if (!reginfo->poscache_maxiter) {
5574                     /* start the countdown: Postpone detection until we
5575                      * know the match is not *that* much linear. */
5576                     reginfo->poscache_maxiter
5577                         =    (reginfo->strend - reginfo->strbeg + 1)
5578                            * (scan->flags>>4);
5579                     /* possible overflow for long strings and many CURLYX's */
5580                     if (reginfo->poscache_maxiter < 0)
5581                         reginfo->poscache_maxiter = I32_MAX;
5582                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5583                 }
5584
5585                 if (reginfo->poscache_iter-- == 0) {
5586                     /* initialise cache */
5587                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5588                     regmatch_info_aux *const aux = reginfo->info_aux;
5589                     if (aux->poscache) {
5590                         if ((SSize_t)reginfo->poscache_size < size) {
5591                             Renew(aux->poscache, size, char);
5592                             reginfo->poscache_size = size;
5593                         }
5594                         Zero(aux->poscache, size, char);
5595                     }
5596                     else {
5597                         reginfo->poscache_size = size;
5598                         Newxz(aux->poscache, size, char);
5599                     }
5600                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5601       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5602                               PL_colors[4], PL_colors[5])
5603                     );
5604                 }
5605
5606                 if (reginfo->poscache_iter < 0) {
5607                     /* have we already failed at this position? */
5608                     SSize_t offset, mask;
5609
5610                     reginfo->poscache_iter = -1; /* stop eventual underflow */
5611                     offset  = (scan->flags & 0xf) - 1
5612                                 +   (locinput - reginfo->strbeg)
5613                                   * (scan->flags>>4);
5614                     mask    = 1 << (offset % 8);
5615                     offset /= 8;
5616                     if (reginfo->info_aux->poscache[offset] & mask) {
5617                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5618                             "%*s  whilem: (cache) already tried at this position...\n",
5619                             REPORT_CODE_OFF+depth*2, "")
5620                         );
5621                         sayNO; /* cache records failure */
5622                     }
5623                     ST.cache_offset = offset;
5624                     ST.cache_mask   = mask;
5625                 }
5626             }
5627
5628             /* Prefer B over A for minimal matching. */
5629
5630             if (cur_curlyx->u.curlyx.minmod) {
5631                 ST.save_curlyx = cur_curlyx;
5632                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5633                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5634                             maxopenparen);
5635                 REGCP_SET(ST.lastcp);
5636                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5637                                     locinput);
5638                 assert(0); /* NOTREACHED */
5639             }
5640
5641             /* Prefer A over B for maximal matching. */
5642
5643             if (n < max) { /* More greed allowed? */
5644                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5645                             maxopenparen);
5646                 cur_curlyx->u.curlyx.lastloc = locinput;
5647                 REGCP_SET(ST.lastcp);
5648                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5649                 assert(0); /* NOTREACHED */
5650             }
5651             goto do_whilem_B_max;
5652         }
5653         assert(0); /* NOTREACHED */
5654
5655         case WHILEM_B_min: /* just matched B in a minimal match */
5656         case WHILEM_B_max: /* just matched B in a maximal match */
5657             cur_curlyx = ST.save_curlyx;
5658             sayYES;
5659             assert(0); /* NOTREACHED */
5660
5661         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5662             cur_curlyx = ST.save_curlyx;
5663             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5664             cur_curlyx->u.curlyx.count--;
5665             CACHEsayNO;
5666             assert(0); /* NOTREACHED */
5667
5668         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5669             /* FALL THROUGH */
5670         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5671             REGCP_UNWIND(ST.lastcp);
5672             regcppop(rex, &maxopenparen);
5673             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5674             cur_curlyx->u.curlyx.count--;
5675             CACHEsayNO;
5676             assert(0); /* NOTREACHED */
5677
5678         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5679             REGCP_UNWIND(ST.lastcp);
5680             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5681             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5682                 "%*s  whilem: failed, trying continuation...\n",
5683                 REPORT_CODE_OFF+depth*2, "")
5684             );
5685           do_whilem_B_max:
5686             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5687                 && ckWARN(WARN_REGEXP)
5688                 && !reginfo->warned)
5689             {
5690                 reginfo->warned = TRUE;
5691                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5692                      "Complex regular subexpression recursion limit (%d) "
5693                      "exceeded",
5694                      REG_INFTY - 1);
5695             }
5696
5697             /* now try B */
5698             ST.save_curlyx = cur_curlyx;
5699             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5700             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5701                                 locinput);
5702             assert(0); /* NOTREACHED */
5703
5704         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5705             cur_curlyx = ST.save_curlyx;
5706             REGCP_UNWIND(ST.lastcp);
5707             regcppop(rex, &maxopenparen);
5708
5709             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5710                 /* Maximum greed exceeded */
5711                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5712                     && ckWARN(WARN_REGEXP)
5713                     && !reginfo->warned)
5714                 {
5715                     reginfo->warned     = TRUE;
5716                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5717                         "Complex regular subexpression recursion "
5718                         "limit (%d) exceeded",
5719                         REG_INFTY - 1);
5720                 }
5721                 cur_curlyx->u.curlyx.count--;
5722                 CACHEsayNO;
5723             }
5724
5725             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5726                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5727             );
5728             /* Try grabbing another A and see if it helps. */
5729             cur_curlyx->u.curlyx.lastloc = locinput;
5730             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5731                             maxopenparen);
5732             REGCP_SET(ST.lastcp);
5733             PUSH_STATE_GOTO(WHILEM_A_min,
5734                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5735                 locinput);
5736             assert(0); /* NOTREACHED */
5737
5738 #undef  ST
5739 #define ST st->u.branch
5740
5741         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5742             next = scan + ARG(scan);
5743             if (next == scan)
5744                 next = NULL;
5745             scan = NEXTOPER(scan);
5746             /* FALL THROUGH */
5747
5748         case BRANCH:        /*  /(...|A|...)/ */
5749             scan = NEXTOPER(scan); /* scan now points to inner node */
5750             ST.lastparen = rex->lastparen;
5751             ST.lastcloseparen = rex->lastcloseparen;
5752             ST.next_branch = next;
5753             REGCP_SET(ST.cp);
5754
5755             /* Now go into the branch */
5756             if (has_cutgroup) {
5757                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5758             } else {
5759                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5760             }
5761             assert(0); /* NOTREACHED */
5762
5763         case CUTGROUP:  /*  /(*THEN)/  */
5764             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5765                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5766             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5767             assert(0); /* NOTREACHED */
5768
5769         case CUTGROUP_next_fail:
5770             do_cutgroup = 1;
5771             no_final = 1;
5772             if (st->u.mark.mark_name)
5773                 sv_commit = st->u.mark.mark_name;
5774             sayNO;
5775             assert(0); /* NOTREACHED */
5776
5777         case BRANCH_next:
5778             sayYES;
5779             assert(0); /* NOTREACHED */
5780
5781         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5782             if (do_cutgroup) {
5783                 do_cutgroup = 0;
5784                 no_final = 0;
5785             }
5786             REGCP_UNWIND(ST.cp);
5787             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5788             scan = ST.next_branch;
5789             /* no more branches? */
5790             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5791                 DEBUG_EXECUTE_r({
5792                     PerlIO_printf( Perl_debug_log,
5793                         "%*s  %sBRANCH failed...%s\n",
5794                         REPORT_CODE_OFF+depth*2, "",
5795                         PL_colors[4],
5796                         PL_colors[5] );
5797                 });
5798                 sayNO_SILENT;
5799             }
5800             continue; /* execute next BRANCH[J] op */
5801             assert(0); /* NOTREACHED */
5802
5803         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5804             minmod = 1;
5805             break;
5806
5807 #undef  ST
5808 #define ST st->u.curlym
5809
5810         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5811
5812             /* This is an optimisation of CURLYX that enables us to push
5813              * only a single backtracking state, no matter how many matches
5814              * there are in {m,n}. It relies on the pattern being constant
5815              * length, with no parens to influence future backrefs
5816              */
5817
5818             ST.me = scan;
5819             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5820
5821             ST.lastparen      = rex->lastparen;
5822             ST.lastcloseparen = rex->lastcloseparen;
5823
5824             /* if paren positive, emulate an OPEN/CLOSE around A */
5825             if (ST.me->flags) {
5826                 U32 paren = ST.me->flags;
5827                 if (paren > maxopenparen)
5828                     maxopenparen = paren;
5829                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5830             }
5831             ST.A = scan;
5832             ST.B = next;
5833             ST.alen = 0;
5834             ST.count = 0;
5835             ST.minmod = minmod;
5836             minmod = 0;
5837             ST.c1 = CHRTEST_UNINIT;
5838             REGCP_SET(ST.cp);
5839
5840             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5841                 goto curlym_do_B;
5842
5843           curlym_do_A: /* execute the A in /A{m,n}B/  */
5844             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5845             assert(0); /* NOTREACHED */
5846
5847         case CURLYM_A: /* we've just matched an A */
5848             ST.count++;
5849             /* after first match, determine A's length: u.curlym.alen */
5850             if (ST.count == 1) {
5851                 if (reginfo->is_utf8_target) {
5852                     char *s = st->locinput;
5853                     while (s < locinput) {
5854                         ST.alen++;
5855                         s += UTF8SKIP(s);
5856                     }
5857                 }
5858                 else {
5859                     ST.alen = locinput - st->locinput;
5860                 }
5861                 if (ST.alen == 0)
5862                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5863             }
5864             DEBUG_EXECUTE_r(
5865                 PerlIO_printf(Perl_debug_log,
5866                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5867                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5868                           (IV) ST.count, (IV)ST.alen)
5869             );
5870
5871             if (cur_eval && cur_eval->u.eval.close_paren &&
5872                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5873                 goto fake_end;
5874
5875             {
5876                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5877                 if ( max == REG_INFTY || ST.count < max )
5878                     goto curlym_do_A; /* try to match another A */
5879             }
5880             goto curlym_do_B; /* try to match B */
5881
5882         case CURLYM_A_fail: /* just failed to match an A */
5883             REGCP_UNWIND(ST.cp);
5884
5885             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5886                 || (cur_eval && cur_eval->u.eval.close_paren &&
5887                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5888                 sayNO;
5889
5890           curlym_do_B: /* execute the B in /A{m,n}B/  */
5891             if (ST.c1 == CHRTEST_UNINIT) {
5892                 /* calculate c1 and c2 for possible match of 1st char
5893                  * following curly */
5894                 ST.c1 = ST.c2 = CHRTEST_VOID;
5895                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5896                     regnode *text_node = ST.B;
5897                     if (! HAS_TEXT(text_node))
5898                         FIND_NEXT_IMPT(text_node);
5899                     /* this used to be
5900
5901                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5902
5903                         But the former is redundant in light of the latter.
5904
5905                         if this changes back then the macro for
5906                         IS_TEXT and friends need to change.
5907                      */
5908                     if (PL_regkind[OP(text_node)] == EXACT) {
5909                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5910                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5911                            reginfo))
5912                         {
5913                             sayNO;
5914                         }
5915                     }
5916                 }
5917             }
5918
5919             DEBUG_EXECUTE_r(
5920                 PerlIO_printf(Perl_debug_log,
5921                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5922                     (int)(REPORT_CODE_OFF+(depth*2)),
5923                     "", (IV)ST.count)
5924                 );
5925             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5926                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5927                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5928                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5929                     {
5930                         /* simulate B failing */
5931                         DEBUG_OPTIMISE_r(
5932                             PerlIO_printf(Perl_debug_log,
5933                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
5934                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5935                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5936                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5937                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5938                         );
5939                         state_num = CURLYM_B_fail;
5940                         goto reenter_switch;
5941                     }
5942                 }
5943                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5944                     /* simulate B failing */
5945                     DEBUG_OPTIMISE_r(
5946                         PerlIO_printf(Perl_debug_log,
5947                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5948                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5949                             (int) nextchr, ST.c1, ST.c2)
5950                     );
5951                     state_num = CURLYM_B_fail;
5952                     goto reenter_switch;
5953                 }
5954             }
5955
5956             if (ST.me->flags) {
5957                 /* emulate CLOSE: mark current A as captured */
5958                 I32 paren = ST.me->flags;
5959                 if (ST.count) {
5960                     rex->offs[paren].start
5961                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5962                     rex->offs[paren].end = locinput - reginfo->strbeg;
5963                     if ((U32)paren > rex->lastparen)
5964                         rex->lastparen = paren;
5965                     rex->lastcloseparen = paren;
5966                 }
5967                 else
5968                     rex->offs[paren].end = -1;
5969                 if (cur_eval && cur_eval->u.eval.close_paren &&
5970                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5971                 {
5972                     if (ST.count)
5973                         goto fake_end;
5974                     else
5975                         sayNO;
5976                 }
5977             }
5978
5979             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5980             assert(0); /* NOTREACHED */
5981
5982         case CURLYM_B_fail: /* just failed to match a B */
5983             REGCP_UNWIND(ST.cp);
5984             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5985             if (ST.minmod) {
5986                 I32 max = ARG2(ST.me);
5987                 if (max != REG_INFTY && ST.count == max)
5988                     sayNO;
5989                 goto curlym_do_A; /* try to match a further A */
5990             }
5991             /* backtrack one A */
5992             if (ST.count == ARG1(ST.me) /* min */)
5993                 sayNO;
5994             ST.count--;
5995             SET_locinput(HOPc(locinput, -ST.alen));
5996             goto curlym_do_B; /* try to match B */
5997
5998 #undef ST
5999 #define ST st->u.curly
6000
6001 #define CURLY_SETPAREN(paren, success) \
6002     if (paren) { \
6003         if (success) { \
6004             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6005             rex->offs[paren].end = locinput - reginfo->strbeg; \
6006             if (paren > rex->lastparen) \
6007                 rex->lastparen = paren; \
6008             rex->lastcloseparen = paren; \
6009         } \
6010         else { \
6011             rex->offs[paren].end = -1; \
6012             rex->lastparen      = ST.lastparen; \
6013             rex->lastcloseparen = ST.lastcloseparen; \
6014         } \
6015     }
6016
6017         case STAR:              /*  /A*B/ where A is width 1 char */
6018             ST.paren = 0;
6019             ST.min = 0;
6020             ST.max = REG_INFTY;
6021             scan = NEXTOPER(scan);
6022             goto repeat;
6023
6024         case PLUS:              /*  /A+B/ where A is width 1 char */
6025             ST.paren = 0;
6026             ST.min = 1;
6027             ST.max = REG_INFTY;
6028             scan = NEXTOPER(scan);
6029             goto repeat;
6030
6031         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
6032             ST.paren = scan->flags;     /* Which paren to set */
6033             ST.lastparen      = rex->lastparen;
6034             ST.lastcloseparen = rex->lastcloseparen;
6035             if (ST.paren > maxopenparen)
6036                 maxopenparen = ST.paren;
6037             ST.min = ARG1(scan);  /* min to match */
6038             ST.max = ARG2(scan);  /* max to match */
6039             if (cur_eval && cur_eval->u.eval.close_paren &&
6040                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6041                 ST.min=1;
6042                 ST.max=1;
6043             }
6044             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6045             goto repeat;
6046
6047         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
6048             ST.paren = 0;
6049             ST.min = ARG1(scan);  /* min to match */
6050             ST.max = ARG2(scan);  /* max to match */
6051             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6052           repeat:
6053             /*
6054             * Lookahead to avoid useless match attempts
6055             * when we know what character comes next.
6056             *
6057             * Used to only do .*x and .*?x, but now it allows
6058             * for )'s, ('s and (?{ ... })'s to be in the way
6059             * of the quantifier and the EXACT-like node.  -- japhy
6060             */
6061
6062             assert(ST.min <= ST.max);
6063             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6064                 ST.c1 = ST.c2 = CHRTEST_VOID;
6065             }
6066             else {
6067                 regnode *text_node = next;
6068
6069                 if (! HAS_TEXT(text_node))
6070                     FIND_NEXT_IMPT(text_node);
6071
6072                 if (! HAS_TEXT(text_node))
6073                     ST.c1 = ST.c2 = CHRTEST_VOID;
6074                 else {
6075                     if ( PL_regkind[OP(text_node)] != EXACT ) {
6076                         ST.c1 = ST.c2 = CHRTEST_VOID;
6077                     }
6078                     else {
6079
6080                     /*  Currently we only get here when
6081
6082                         PL_rekind[OP(text_node)] == EXACT
6083
6084                         if this changes back then the macro for IS_TEXT and
6085                         friends need to change. */
6086                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6087                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6088                            reginfo))
6089                         {
6090                             sayNO;
6091                         }
6092                     }
6093                 }
6094             }
6095
6096             ST.A = scan;
6097             ST.B = next;
6098             if (minmod) {
6099                 char *li = locinput;
6100                 minmod = 0;
6101                 if (ST.min &&
6102                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6103                             < ST.min)
6104                     sayNO;
6105                 SET_locinput(li);
6106                 ST.count = ST.min;
6107                 REGCP_SET(ST.cp);
6108                 if (ST.c1 == CHRTEST_VOID)
6109                     goto curly_try_B_min;
6110
6111                 ST.oldloc = locinput;
6112
6113                 /* set ST.maxpos to the furthest point along the
6114                  * string that could possibly match */
6115                 if  (ST.max == REG_INFTY) {
6116                     ST.maxpos = reginfo->strend - 1;
6117                     if (utf8_target)
6118                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6119                             ST.maxpos--;
6120                 }
6121                 else if (utf8_target) {
6122                     int m = ST.max - ST.min;
6123                     for (ST.maxpos = locinput;
6124                          m >0 && ST.maxpos < reginfo->strend; m--)
6125                         ST.maxpos += UTF8SKIP(ST.maxpos);
6126                 }
6127                 else {
6128                     ST.maxpos = locinput + ST.max - ST.min;
6129                     if (ST.maxpos >= reginfo->strend)
6130                         ST.maxpos = reginfo->strend - 1;
6131                 }
6132                 goto curly_try_B_min_known;
6133
6134             }
6135             else {
6136                 /* avoid taking address of locinput, so it can remain
6137                  * a register var */
6138                 char *li = locinput;
6139                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6140                 if (ST.count < ST.min)
6141                     sayNO;
6142                 SET_locinput(li);
6143                 if ((ST.count > ST.min)
6144                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6145                 {
6146                     /* A{m,n} must come at the end of the string, there's
6147                      * no point in backing off ... */
6148                     ST.min = ST.count;
6149                     /* ...except that $ and \Z can match before *and* after
6150                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6151                        We may back off by one in this case. */
6152                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6153                         ST.min--;
6154                 }
6155                 REGCP_SET(ST.cp);
6156                 goto curly_try_B_max;
6157             }
6158             assert(0); /* NOTREACHED */
6159
6160
6161         case CURLY_B_min_known_fail:
6162             /* failed to find B in a non-greedy match where c1,c2 valid */
6163
6164             REGCP_UNWIND(ST.cp);
6165             if (ST.paren) {
6166                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6167             }
6168             /* Couldn't or didn't -- move forward. */
6169             ST.oldloc = locinput;
6170             if (utf8_target)
6171                 locinput += UTF8SKIP(locinput);
6172             else
6173                 locinput++;
6174             ST.count++;
6175           curly_try_B_min_known:
6176              /* find the next place where 'B' could work, then call B */
6177             {
6178                 int n;
6179                 if (utf8_target) {
6180                     n = (ST.oldloc == locinput) ? 0 : 1;
6181                     if (ST.c1 == ST.c2) {
6182                         /* set n to utf8_distance(oldloc, locinput) */
6183                         while (locinput <= ST.maxpos
6184                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6185                         {
6186                             locinput += UTF8SKIP(locinput);
6187                             n++;
6188                         }
6189                     }
6190                     else {
6191                         /* set n to utf8_distance(oldloc, locinput) */
6192                         while (locinput <= ST.maxpos
6193                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6194                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6195                         {
6196                             locinput += UTF8SKIP(locinput);
6197                             n++;
6198                         }
6199                     }
6200                 }
6201                 else {  /* Not utf8_target */
6202                     if (ST.c1 == ST.c2) {
6203                         while (locinput <= ST.maxpos &&
6204                                UCHARAT(locinput) != ST.c1)
6205                             locinput++;
6206                     }
6207                     else {
6208                         while (locinput <= ST.maxpos
6209                                && UCHARAT(locinput) != ST.c1
6210                                && UCHARAT(locinput) != ST.c2)
6211                             locinput++;
6212                     }
6213                     n = locinput - ST.oldloc;
6214                 }
6215                 if (locinput > ST.maxpos)
6216                     sayNO;
6217                 if (n) {
6218                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6219                      * at b; check that everything between oldloc and
6220                      * locinput matches */
6221                     char *li = ST.oldloc;
6222                     ST.count += n;
6223                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6224                         sayNO;
6225                     assert(n == REG_INFTY || locinput == li);
6226                 }
6227                 CURLY_SETPAREN(ST.paren, ST.count);
6228                 if (cur_eval && cur_eval->u.eval.close_paren &&
6229                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6230                     goto fake_end;
6231                 }
6232                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6233             }
6234             assert(0); /* NOTREACHED */
6235
6236
6237         case CURLY_B_min_fail:
6238             /* failed to find B in a non-greedy match where c1,c2 invalid */
6239
6240             REGCP_UNWIND(ST.cp);
6241             if (ST.paren) {
6242                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6243             }
6244             /* failed -- move forward one */
6245             {
6246                 char *li = locinput;
6247                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6248                     sayNO;
6249                 }
6250                 locinput = li;
6251             }
6252             {
6253                 ST.count++;
6254                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6255                         ST.count > 0)) /* count overflow ? */
6256                 {
6257                   curly_try_B_min:
6258                     CURLY_SETPAREN(ST.paren, ST.count);
6259                     if (cur_eval && cur_eval->u.eval.close_paren &&
6260                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6261                         goto fake_end;
6262                     }
6263                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6264                 }
6265             }
6266             sayNO;
6267             assert(0); /* NOTREACHED */
6268
6269
6270         curly_try_B_max:
6271             /* a successful greedy match: now try to match B */
6272             if (cur_eval && cur_eval->u.eval.close_paren &&
6273                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6274                 goto fake_end;
6275             }
6276             {
6277                 bool could_match = locinput < reginfo->strend;
6278
6279                 /* If it could work, try it. */
6280                 if (ST.c1 != CHRTEST_VOID && could_match) {
6281                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6282                     {
6283                         could_match = memEQ(locinput,
6284                                             ST.c1_utf8,
6285                                             UTF8SKIP(locinput))
6286                                     || memEQ(locinput,
6287                                              ST.c2_utf8,
6288                                              UTF8SKIP(locinput));
6289                     }
6290                     else {
6291                         could_match = UCHARAT(locinput) == ST.c1
6292                                       || UCHARAT(locinput) == ST.c2;
6293                     }
6294                 }
6295                 if (ST.c1 == CHRTEST_VOID || could_match) {
6296                     CURLY_SETPAREN(ST.paren, ST.count);
6297                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6298                     assert(0); /* NOTREACHED */
6299                 }
6300             }
6301             /* FALL THROUGH */
6302
6303         case CURLY_B_max_fail:
6304             /* failed to find B in a greedy match */
6305
6306             REGCP_UNWIND(ST.cp);
6307             if (ST.paren) {
6308                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6309             }
6310             /*  back up. */
6311             if (--ST.count < ST.min)
6312                 sayNO;
6313             locinput = HOPc(locinput, -1);
6314             goto curly_try_B_max;
6315
6316 #undef ST
6317
6318         case END: /*  last op of main pattern  */
6319             fake_end:
6320             if (cur_eval) {
6321                 /* we've just finished A in /(??{A})B/; now continue with B */
6322
6323                 st->u.eval.prev_rex = rex_sv;           /* inner */
6324
6325                 /* Save *all* the positions. */
6326                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6327                 rex_sv = cur_eval->u.eval.prev_rex;
6328                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6329                 SET_reg_curpm(rex_sv);
6330                 rex = ReANY(rex_sv);
6331                 rexi = RXi_GET(rex);
6332                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6333
6334                 REGCP_SET(st->u.eval.lastcp);
6335
6336                 /* Restore parens of the outer rex without popping the
6337                  * savestack */
6338                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6339                                         &maxopenparen);
6340
6341                 st->u.eval.prev_eval = cur_eval;
6342                 cur_eval = cur_eval->u.eval.prev_eval;
6343                 DEBUG_EXECUTE_r(
6344                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6345                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6346                 if ( nochange_depth )
6347                     nochange_depth--;
6348
6349                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6350                                     locinput); /* match B */
6351             }
6352
6353             if (locinput < reginfo->till) {
6354                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6355                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6356                                       PL_colors[4],
6357                                       (long)(locinput - startpos),
6358                                       (long)(reginfo->till - startpos),
6359                                       PL_colors[5]));
6360
6361                 sayNO_SILENT;           /* Cannot match: too short. */
6362             }
6363             sayYES;                     /* Success! */
6364
6365         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6366             DEBUG_EXECUTE_r(
6367             PerlIO_printf(Perl_debug_log,
6368                 "%*s  %ssubpattern success...%s\n",
6369                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6370             sayYES;                     /* Success! */
6371
6372 #undef  ST
6373 #define ST st->u.ifmatch
6374
6375         {
6376             char *newstart;
6377
6378         case SUSPEND:   /* (?>A) */
6379             ST.wanted = 1;
6380             newstart = locinput;
6381             goto do_ifmatch;
6382
6383         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6384             ST.wanted = 0;
6385             goto ifmatch_trivial_fail_test;
6386
6387         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6388             ST.wanted = 1;
6389           ifmatch_trivial_fail_test:
6390             if (scan->flags) {
6391                 char * const s = HOPBACKc(locinput, scan->flags);
6392                 if (!s) {
6393                     /* trivial fail */
6394                     if (logical) {
6395                         logical = 0;
6396                         sw = 1 - cBOOL(ST.wanted);
6397                     }
6398                     else if (ST.wanted)
6399                         sayNO;
6400                     next = scan + ARG(scan);
6401                     if (next == scan)
6402                         next = NULL;
6403                     break;
6404                 }
6405                 newstart = s;
6406             }
6407             else
6408                 newstart = locinput;
6409
6410           do_ifmatch:
6411             ST.me = scan;
6412             ST.logical = logical;
6413             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6414
6415             /* execute body of (?...A) */
6416             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6417             assert(0); /* NOTREACHED */
6418         }
6419
6420         case IFMATCH_A_fail: /* body of (?...A) failed */
6421             ST.wanted = !ST.wanted;
6422             /* FALL THROUGH */
6423
6424         case IFMATCH_A: /* body of (?...A) succeeded */
6425             if (ST.logical) {
6426                 sw = cBOOL(ST.wanted);
6427             }
6428             else if (!ST.wanted)
6429                 sayNO;
6430
6431             if (OP(ST.me) != SUSPEND) {
6432                 /* restore old position except for (?>...) */
6433                 locinput = st->locinput;
6434             }
6435             scan = ST.me + ARG(ST.me);
6436             if (scan == ST.me)
6437                 scan = NULL;
6438             continue; /* execute B */
6439
6440 #undef ST
6441
6442         case LONGJMP: /*  alternative with many branches compiles to
6443                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6444             next = scan + ARG(scan);
6445             if (next == scan)
6446                 next = NULL;
6447             break;
6448
6449         case COMMIT:  /*  (*COMMIT)  */
6450             reginfo->cutpoint = reginfo->strend;
6451             /* FALLTHROUGH */
6452
6453         case PRUNE:   /*  (*PRUNE)   */
6454             if (!scan->flags)
6455                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6456             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6457             assert(0); /* NOTREACHED */
6458
6459         case COMMIT_next_fail:
6460             no_final = 1;
6461             /* FALLTHROUGH */
6462
6463         case OPFAIL:   /* (*FAIL)  */
6464             sayNO;
6465             assert(0); /* NOTREACHED */
6466
6467 #define ST st->u.mark
6468         case MARKPOINT: /*  (*MARK:foo)  */
6469             ST.prev_mark = mark_state;
6470             ST.mark_name = sv_commit = sv_yes_mark
6471                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6472             mark_state = st;
6473             ST.mark_loc = locinput;
6474             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6475             assert(0); /* NOTREACHED */
6476
6477         case MARKPOINT_next:
6478             mark_state = ST.prev_mark;
6479             sayYES;
6480             assert(0); /* NOTREACHED */
6481
6482         case MARKPOINT_next_fail:
6483             if (popmark && sv_eq(ST.mark_name,popmark))
6484             {
6485                 if (ST.mark_loc > startpoint)
6486                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6487                 popmark = NULL; /* we found our mark */
6488                 sv_commit = ST.mark_name;
6489
6490                 DEBUG_EXECUTE_r({
6491                         PerlIO_printf(Perl_debug_log,
6492                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6493                             REPORT_CODE_OFF+depth*2, "",
6494                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6495                 });
6496             }
6497             mark_state = ST.prev_mark;
6498             sv_yes_mark = mark_state ?
6499                 mark_state->u.mark.mark_name : NULL;
6500             sayNO;
6501             assert(0); /* NOTREACHED */
6502
6503         case SKIP:  /*  (*SKIP)  */
6504             if (scan->flags) {
6505                 /* (*SKIP) : if we fail we cut here*/
6506                 ST.mark_name = NULL;
6507                 ST.mark_loc = locinput;
6508                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6509             } else {
6510                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6511                    otherwise do nothing.  Meaning we need to scan
6512                  */
6513                 regmatch_state *cur = mark_state;
6514                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6515
6516                 while (cur) {
6517                     if ( sv_eq( cur->u.mark.mark_name,
6518                                 find ) )
6519                     {
6520                         ST.mark_name = find;
6521                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6522                     }
6523                     cur = cur->u.mark.prev_mark;
6524                 }
6525             }
6526             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6527             break;
6528
6529         case SKIP_next_fail:
6530             if (ST.mark_name) {
6531                 /* (*CUT:NAME) - Set up to search for the name as we
6532                    collapse the stack*/
6533                 popmark = ST.mark_name;
6534             } else {
6535                 /* (*CUT) - No name, we cut here.*/
6536                 if (ST.mark_loc > startpoint)
6537                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6538                 /* but we set sv_commit to latest mark_name if there
6539                    is one so they can test to see how things lead to this
6540                    cut */
6541                 if (mark_state)
6542                     sv_commit=mark_state->u.mark.mark_name;
6543             }
6544             no_final = 1;
6545             sayNO;
6546             assert(0); /* NOTREACHED */
6547 #undef ST
6548
6549         case LNBREAK: /* \R */
6550             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6551                 locinput += n;
6552             } else
6553                 sayNO;
6554             break;
6555
6556         default:
6557             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6558                           PTR2UV(scan), OP(scan));
6559             Perl_croak(aTHX_ "regexp memory corruption");
6560
6561         /* this is a point to jump to in order to increment
6562          * locinput by one character */
6563         increment_locinput:
6564             assert(!NEXTCHR_IS_EOS);
6565             if (utf8_target) {
6566                 locinput += PL_utf8skip[nextchr];
6567                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6568                 if (locinput > reginfo->strend)
6569                     sayNO;
6570             }
6571             else
6572                 locinput++;
6573             break;
6574
6575         } /* end switch */
6576
6577         /* switch break jumps here */
6578         scan = next; /* prepare to execute the next op and ... */
6579         continue;    /* ... jump back to the top, reusing st */
6580         assert(0); /* NOTREACHED */
6581
6582       push_yes_state:
6583         /* push a state that backtracks on success */
6584         st->u.yes.prev_yes_state = yes_state;
6585         yes_state = st;
6586         /* FALL THROUGH */
6587       push_state:
6588         /* push a new regex state, then continue at scan  */
6589         {
6590             regmatch_state *newst;
6591
6592             DEBUG_STACK_r({
6593                 regmatch_state *cur = st;
6594                 regmatch_state *curyes = yes_state;
6595                 int curd = depth;
6596                 regmatch_slab *slab = PL_regmatch_slab;
6597                 for (;curd > -1;cur--,curd--) {
6598                     if (cur < SLAB_FIRST(slab)) {
6599                         slab = slab->prev;
6600                         cur = SLAB_LAST(slab);
6601                     }
6602                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6603                         REPORT_CODE_OFF + 2 + depth * 2,"",
6604                         curd, PL_reg_name[cur->resume_state],
6605                         (curyes == cur) ? "yes" : ""
6606                     );
6607                     if (curyes == cur)
6608                         curyes = cur->u.yes.prev_yes_state;
6609                 }
6610             } else
6611                 DEBUG_STATE_pp("push")
6612             );
6613             depth++;
6614             st->locinput = locinput;
6615             newst = st+1;
6616             if (newst >  SLAB_LAST(PL_regmatch_slab))
6617                 newst = S_push_slab(aTHX);
6618             PL_regmatch_state = newst;
6619
6620             locinput = pushinput;
6621             st = newst;
6622             continue;
6623             assert(0); /* NOTREACHED */
6624         }
6625     }
6626
6627     /*
6628     * We get here only if there's trouble -- normally "case END" is
6629     * the terminating point.
6630     */
6631     Perl_croak(aTHX_ "corrupted regexp pointers");
6632     /*NOTREACHED*/
6633     sayNO;
6634
6635 yes:
6636     if (yes_state) {
6637         /* we have successfully completed a subexpression, but we must now
6638          * pop to the state marked by yes_state and continue from there */
6639         assert(st != yes_state);
6640 #ifdef DEBUGGING
6641         while (st != yes_state) {
6642             st--;
6643             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6644                 PL_regmatch_slab = PL_regmatch_slab->prev;
6645                 st = SLAB_LAST(PL_regmatch_slab);
6646             }
6647             DEBUG_STATE_r({
6648                 if (no_final) {
6649                     DEBUG_STATE_pp("pop (no final)");
6650                 } else {
6651                     DEBUG_STATE_pp("pop (yes)");
6652                 }
6653             });
6654             depth--;
6655         }
6656 #else
6657         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6658             || yes_state > SLAB_LAST(PL_regmatch_slab))
6659         {
6660             /* not in this slab, pop slab */
6661             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6662             PL_regmatch_slab = PL_regmatch_slab->prev;
6663             st = SLAB_LAST(PL_regmatch_slab);
6664         }
6665         depth -= (st - yes_state);
6666 #endif
6667         st = yes_state;
6668         yes_state = st->u.yes.prev_yes_state;
6669         PL_regmatch_state = st;
6670
6671         if (no_final)
6672             locinput= st->locinput;
6673         state_num = st->resume_state + no_final;
6674         goto reenter_switch;
6675     }
6676
6677     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6678                           PL_colors[4], PL_colors[5]));
6679
6680     if (reginfo->info_aux_eval) {
6681         /* each successfully executed (?{...}) block does the equivalent of
6682          *   local $^R = do {...}
6683          * When popping the save stack, all these locals would be undone;
6684          * bypass this by setting the outermost saved $^R to the latest
6685          * value */
6686         if (oreplsv != GvSV(PL_replgv))
6687             sv_setsv(oreplsv, GvSV(PL_replgv));
6688     }
6689     result = 1;
6690     goto final_exit;
6691
6692 no:
6693     DEBUG_EXECUTE_r(
6694         PerlIO_printf(Perl_debug_log,
6695             "%*s  %sfailed...%s\n",
6696             REPORT_CODE_OFF+depth*2, "",
6697             PL_colors[4], PL_colors[5])
6698         );
6699
6700 no_silent:
6701     if (no_final) {
6702         if (yes_state) {
6703             goto yes;
6704         } else {
6705             goto final_exit;
6706         }
6707     }
6708     if (depth) {
6709         /* there's a previous state to backtrack to */
6710         st--;
6711         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6712             PL_regmatch_slab = PL_regmatch_slab->prev;
6713             st = SLAB_LAST(PL_regmatch_slab);
6714         }
6715         PL_regmatch_state = st;
6716         locinput= st->locinput;
6717
6718         DEBUG_STATE_pp("pop");
6719         depth--;
6720         if (yes_state == st)
6721             yes_state = st->u.yes.prev_yes_state;
6722
6723         state_num = st->resume_state + 1; /* failure = success + 1 */
6724         goto reenter_switch;
6725     }
6726     result = 0;
6727
6728   final_exit:
6729     if (rex->intflags & PREGf_VERBARG_SEEN) {
6730         SV *sv_err = get_sv("REGERROR", 1);
6731         SV *sv_mrk = get_sv("REGMARK", 1);
6732         if (result) {
6733             sv_commit = &PL_sv_no;
6734             if (!sv_yes_mark)
6735                 sv_yes_mark = &PL_sv_yes;
6736         } else {
6737             if (!sv_commit)
6738                 sv_commit = &PL_sv_yes;
6739             sv_yes_mark = &PL_sv_no;
6740         }
6741         sv_setsv(sv_err, sv_commit);
6742         sv_setsv(sv_mrk, sv_yes_mark);
6743     }
6744
6745
6746     if (last_pushed_cv) {
6747         dSP;
6748         POP_MULTICALL;
6749         PERL_UNUSED_VAR(SP);
6750     }
6751
6752     assert(!result ||  locinput - reginfo->strbeg >= 0);
6753     return result ?  locinput - reginfo->strbeg : -1;
6754 }
6755
6756 /*
6757  - regrepeat - repeatedly match something simple, report how many
6758  *
6759  * What 'simple' means is a node which can be the operand of a quantifier like
6760  * '+', or {1,3}
6761  *
6762  * startposp - pointer a pointer to the start position.  This is updated
6763  *             to point to the byte following the highest successful
6764  *             match.
6765  * p         - the regnode to be repeatedly matched against.
6766  * reginfo   - struct holding match state, such as strend
6767  * max       - maximum number of things to match.
6768  * depth     - (for debugging) backtracking depth.
6769  */
6770 STATIC I32
6771 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6772             regmatch_info *const reginfo, I32 max, int depth)
6773 {
6774     dVAR;
6775     char *scan;     /* Pointer to current position in target string */
6776     I32 c;
6777     char *loceol = reginfo->strend;   /* local version */
6778     I32 hardcount = 0;  /* How many matches so far */
6779     bool utf8_target = reginfo->is_utf8_target;
6780     int to_complement = 0;  /* Invert the result? */
6781     UV utf8_flags;
6782     _char_class_number classnum;
6783 #ifndef DEBUGGING
6784     PERL_UNUSED_ARG(depth);
6785 #endif
6786
6787     PERL_ARGS_ASSERT_REGREPEAT;
6788
6789     scan = *startposp;
6790     if (max == REG_INFTY)
6791         max = I32_MAX;
6792     else if (! utf8_target && loceol - scan > max)
6793         loceol = scan + max;
6794
6795     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6796      * to the maximum of how far we should go in it (leaving it set to the real
6797      * end, if the maximum permissible would take us beyond that).  This allows
6798      * us to make the loop exit condition that we haven't gone past <loceol> to
6799      * also mean that we haven't exceeded the max permissible count, saving a
6800      * test each time through the loop.  But it assumes that the OP matches a
6801      * single byte, which is true for most of the OPs below when applied to a
6802      * non-UTF-8 target.  Those relatively few OPs that don't have this
6803      * characteristic will have to compensate.
6804      *
6805      * There is no adjustment for UTF-8 targets, as the number of bytes per
6806      * character varies.  OPs will have to test both that the count is less
6807      * than the max permissible (using <hardcount> to keep track), and that we
6808      * are still within the bounds of the string (using <loceol>.  A few OPs
6809      * match a single byte no matter what the encoding.  They can omit the max
6810      * test if, for the UTF-8 case, they do the adjustment that was skipped
6811      * above.
6812      *
6813      * Thus, the code above sets things up for the common case; and exceptional
6814      * cases need extra work; the common case is to make sure <scan> doesn't
6815      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6816      * count doesn't exceed the maximum permissible */
6817
6818     switch (OP(p)) {
6819     case REG_ANY:
6820         if (utf8_target) {
6821             while (scan < loceol && hardcount < max && *scan != '\n') {
6822                 scan += UTF8SKIP(scan);
6823                 hardcount++;
6824             }
6825         } else {
6826             while (scan < loceol && *scan != '\n')
6827                 scan++;
6828         }
6829         break;
6830     case SANY:
6831         if (utf8_target) {
6832             while (scan < loceol && hardcount < max) {
6833                 scan += UTF8SKIP(scan);
6834                 hardcount++;
6835             }
6836         }
6837         else
6838             scan = loceol;
6839         break;
6840     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6841         if (utf8_target && loceol - scan > max) {
6842
6843             /* <loceol> hadn't been adjusted in the UTF-8 case */
6844             scan +=  max;
6845         }
6846         else {
6847             scan = loceol;
6848         }
6849         break;
6850     case EXACT:
6851         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6852
6853         c = (U8)*STRING(p);
6854
6855         /* Can use a simple loop if the pattern char to match on is invariant
6856          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6857          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6858          * true iff it doesn't matter if the argument is in UTF-8 or not */
6859         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6860             if (utf8_target && loceol - scan > max) {
6861                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6862                  * since here, to match at all, 1 char == 1 byte */
6863                 loceol = scan + max;
6864             }
6865             while (scan < loceol && UCHARAT(scan) == c) {
6866                 scan++;
6867             }
6868         }
6869         else if (reginfo->is_utf8_pat) {
6870             if (utf8_target) {
6871                 STRLEN scan_char_len;
6872
6873                 /* When both target and pattern are UTF-8, we have to do
6874                  * string EQ */
6875                 while (hardcount < max
6876                        && scan < loceol
6877                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6878                        && memEQ(scan, STRING(p), scan_char_len))
6879                 {
6880                     scan += scan_char_len;
6881                     hardcount++;
6882                 }
6883             }
6884             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6885
6886                 /* Target isn't utf8; convert the character in the UTF-8
6887                  * pattern to non-UTF8, and do a simple loop */
6888                 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
6889                 while (scan < loceol && UCHARAT(scan) == c) {
6890                     scan++;
6891                 }
6892             } /* else pattern char is above Latin1, can't possibly match the
6893                  non-UTF-8 target */
6894         }
6895         else {
6896
6897             /* Here, the string must be utf8; pattern isn't, and <c> is
6898              * different in utf8 than not, so can't compare them directly.
6899              * Outside the loop, find the two utf8 bytes that represent c, and
6900              * then look for those in sequence in the utf8 string */
6901             U8 high = UTF8_TWO_BYTE_HI(c);
6902             U8 low = UTF8_TWO_BYTE_LO(c);
6903
6904             while (hardcount < max
6905                     && scan + 1 < loceol
6906                     && UCHARAT(scan) == high
6907                     && UCHARAT(scan + 1) == low)
6908             {
6909                 scan += 2;
6910                 hardcount++;
6911             }
6912         }
6913         break;
6914
6915     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
6916         assert(! reginfo->is_utf8_pat);
6917         /* FALL THROUGH */
6918     case EXACTFA:
6919         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6920         goto do_exactf;
6921
6922     case EXACTFL:
6923         RXp_MATCH_TAINTED_on(prog);
6924         utf8_flags = FOLDEQ_UTF8_LOCALE;
6925         goto do_exactf;
6926
6927     case EXACTF:   /* This node only generated for non-utf8 patterns */
6928         assert(! reginfo->is_utf8_pat);
6929         utf8_flags = 0;
6930         goto do_exactf;
6931
6932     case EXACTFU_SS:
6933     case EXACTFU:
6934         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6935
6936     do_exactf: {
6937         int c1, c2;
6938         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6939
6940         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6941
6942         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6943                                         reginfo))
6944         {
6945             if (c1 == CHRTEST_VOID) {
6946                 /* Use full Unicode fold matching */
6947                 char *tmpeol = reginfo->strend;
6948                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6949                 while (hardcount < max
6950                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6951                                              STRING(p), NULL, pat_len,
6952                                              reginfo->is_utf8_pat, utf8_flags))
6953                 {
6954                     scan = tmpeol;
6955                     tmpeol = reginfo->strend;
6956                     hardcount++;
6957                 }
6958             }
6959             else if (utf8_target) {
6960                 if (c1 == c2) {
6961                     while (scan < loceol
6962                            && hardcount < max
6963                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6964                     {
6965                         scan += UTF8SKIP(scan);
6966                         hardcount++;
6967                     }
6968                 }
6969                 else {
6970                     while (scan < loceol
6971                            && hardcount < max
6972                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6973                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6974                     {
6975                         scan += UTF8SKIP(scan);
6976                         hardcount++;
6977                     }
6978                 }
6979             }
6980             else if (c1 == c2) {
6981                 while (scan < loceol && UCHARAT(scan) == c1) {
6982                     scan++;
6983                 }
6984             }
6985             else {
6986                 while (scan < loceol &&
6987                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6988                 {
6989                     scan++;
6990                 }
6991             }
6992         }
6993         break;
6994     }
6995     case ANYOF:
6996     case ANYOF_WARN_SUPER:
6997         if (utf8_target) {
6998             while (hardcount < max
6999                    && scan < loceol
7000                    && reginclass(prog, p, (U8*)scan, utf8_target))
7001             {
7002                 scan += UTF8SKIP(scan);
7003                 hardcount++;
7004             }
7005         } else {
7006             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7007                 scan++;
7008         }
7009         break;
7010
7011     /* The argument (FLAGS) to all the POSIX node types is the class number */
7012
7013     case NPOSIXL:
7014         to_complement = 1;
7015         /* FALLTHROUGH */
7016
7017     case POSIXL:
7018         RXp_MATCH_TAINTED_on(prog);
7019         if (! utf8_target) {
7020             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7021                                                                    *scan)))
7022             {
7023                 scan++;
7024             }
7025         } else {
7026             while (hardcount < max && scan < loceol
7027                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7028                                                                   (U8 *) scan)))
7029             {
7030                 scan += UTF8SKIP(scan);
7031                 hardcount++;
7032             }
7033         }
7034         break;
7035
7036     case POSIXD:
7037         if (utf8_target) {
7038             goto utf8_posix;
7039         }
7040         /* FALLTHROUGH */
7041
7042     case POSIXA:
7043         if (utf8_target && loceol - scan > max) {
7044
7045             /* We didn't adjust <loceol> at the beginning of this routine
7046              * because is UTF-8, but it is actually ok to do so, since here, to
7047              * match, 1 char == 1 byte. */
7048             loceol = scan + max;
7049         }
7050         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7051             scan++;
7052         }
7053         break;
7054
7055     case NPOSIXD:
7056         if (utf8_target) {
7057             to_complement = 1;
7058             goto utf8_posix;
7059         }
7060         /* FALL THROUGH */
7061
7062     case NPOSIXA:
7063         if (! utf8_target) {
7064             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7065                 scan++;
7066             }
7067         }
7068         else {
7069
7070             /* The complement of something that matches only ASCII matches all
7071              * UTF-8 variant code points, plus everything in ASCII that isn't
7072              * in the class. */
7073             while (hardcount < max && scan < loceol
7074                    && (! UTF8_IS_INVARIANT(*scan)
7075                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7076             {
7077                 scan += UTF8SKIP(scan);
7078                 hardcount++;
7079             }
7080         }
7081         break;
7082
7083     case NPOSIXU:
7084         to_complement = 1;
7085         /* FALLTHROUGH */
7086
7087     case POSIXU:
7088         if (! utf8_target) {
7089             while (scan < loceol && to_complement
7090                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7091             {
7092                 scan++;
7093             }
7094         }
7095         else {
7096       utf8_posix:
7097             classnum = (_char_class_number) FLAGS(p);
7098             if (classnum < _FIRST_NON_SWASH_CC) {
7099
7100                 /* Here, a swash is needed for above-Latin1 code points.
7101                  * Process as many Latin1 code points using the built-in rules.
7102                  * Go to another loop to finish processing upon encountering
7103                  * the first Latin1 code point.  We could do that in this loop
7104                  * as well, but the other way saves having to test if the swash
7105                  * has been loaded every time through the loop: extra space to
7106                  * save a test. */
7107                 while (hardcount < max && scan < loceol) {
7108                     if (UTF8_IS_INVARIANT(*scan)) {
7109                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7110                                                                    classnum))))
7111                         {
7112                             break;
7113                         }
7114                         scan++;
7115                     }
7116                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7117                         if (! (to_complement
7118                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7119                                                                      *(scan + 1)),
7120                                                     classnum))))
7121                         {
7122                             break;
7123                         }
7124                         scan += 2;
7125                     }
7126                     else {
7127                         goto found_above_latin1;
7128                     }
7129
7130                     hardcount++;
7131                 }
7132             }
7133             else {
7134                 /* For these character classes, the knowledge of how to handle
7135                  * every code point is compiled in to Perl via a macro.  This
7136                  * code is written for making the loops as tight as possible.
7137                  * It could be refactored to save space instead */
7138                 switch (classnum) {
7139                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7140                                                if we revert the change of \v
7141                                                matching this */
7142                         /* FALL THROUGH */
7143                     case _CC_ENUM_PSXSPC:
7144                         while (hardcount < max
7145                                && scan < loceol
7146                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7147                         {
7148                             scan += UTF8SKIP(scan);
7149                             hardcount++;
7150                         }
7151                         break;
7152                     case _CC_ENUM_BLANK:
7153                         while (hardcount < max
7154                                && scan < loceol
7155                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7156                         {
7157                             scan += UTF8SKIP(scan);
7158                             hardcount++;
7159                         }
7160                         break;
7161                     case _CC_ENUM_XDIGIT:
7162                         while (hardcount < max
7163                                && scan < loceol
7164                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7165                         {
7166                             scan += UTF8SKIP(scan);
7167                             hardcount++;
7168                         }
7169                         break;
7170                     case _CC_ENUM_VERTSPACE:
7171                         while (hardcount < max
7172                                && scan < loceol
7173                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7174                         {
7175                             scan += UTF8SKIP(scan);
7176                             hardcount++;
7177                         }
7178                         break;
7179                     case _CC_ENUM_CNTRL:
7180                         while (hardcount < max
7181                                && scan < loceol
7182                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7183                         {
7184                             scan += UTF8SKIP(scan);
7185                             hardcount++;
7186                         }
7187                         break;
7188                     default:
7189                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7190                 }
7191             }
7192         }
7193         break;
7194
7195       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7196
7197         /* Load the swash if not already present */
7198         if (! PL_utf8_swash_ptrs[classnum]) {
7199             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7200             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7201                                         "utf8", swash_property_names[classnum],
7202                                         &PL_sv_undef, 1, 0, NULL, &flags);
7203         }
7204
7205         while (hardcount < max && scan < loceol
7206                && to_complement ^ cBOOL(_generic_utf8(
7207                                        classnum,
7208                                        scan,
7209                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7210                                                    (U8 *) scan,
7211                                                    TRUE))))
7212         {
7213             scan += UTF8SKIP(scan);
7214             hardcount++;
7215         }
7216         break;
7217
7218     case LNBREAK:
7219         if (utf8_target) {
7220             while (hardcount < max && scan < loceol &&
7221                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7222                 scan += c;
7223                 hardcount++;
7224             }
7225         } else {
7226             /* LNBREAK can match one or two latin chars, which is ok, but we
7227              * have to use hardcount in this situation, and throw away the
7228              * adjustment to <loceol> done before the switch statement */
7229             loceol = reginfo->strend;
7230             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7231                 scan+=c;
7232                 hardcount++;
7233             }
7234         }
7235         break;
7236
7237     case BOUND:
7238     case BOUNDA:
7239     case BOUNDL:
7240     case BOUNDU:
7241     case EOS:
7242     case GPOS:
7243     case KEEPS:
7244     case NBOUND:
7245     case NBOUNDA:
7246     case NBOUNDL:
7247     case NBOUNDU:
7248     case OPFAIL:
7249     case SBOL:
7250     case SEOL:
7251         /* These are all 0 width, so match right here or not at all. */
7252         break;
7253
7254     default:
7255         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7256         assert(0); /* NOTREACHED */
7257
7258     }
7259
7260     if (hardcount)
7261         c = hardcount;
7262     else
7263         c = scan - *startposp;
7264     *startposp = scan;
7265
7266     DEBUG_r({
7267         GET_RE_DEBUG_FLAGS_DECL;
7268         DEBUG_EXECUTE_r({
7269             SV * const prop = sv_newmortal();
7270             regprop(prog, prop, p);
7271             PerlIO_printf(Perl_debug_log,
7272                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7273                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7274         });
7275     });
7276
7277     return(c);
7278 }
7279
7280
7281 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7282 /*
7283 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7284 create a copy so that changes the caller makes won't change the shared one.
7285 If <altsvp> is non-null, will return NULL in it, for back-compat.
7286  */
7287 SV *
7288 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7289 {
7290     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7291
7292     if (altsvp) {
7293         *altsvp = NULL;
7294     }
7295
7296     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7297 }
7298 #endif
7299
7300 STATIC SV *
7301 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7302 {
7303     /* Returns the swash for the input 'node' in the regex 'prog'.
7304      * If <doinit> is 'true', will attempt to create the swash if not already
7305      *    done.
7306      * If <listsvp> is non-null, will return the printable contents of the
7307      *    swash.  This can be used to get debugging information even before the
7308      *    swash exists, by calling this function with 'doinit' set to false, in
7309      *    which case the components that will be used to eventually create the
7310      *    swash are returned  (in a printable form).
7311      * Tied intimately to how regcomp.c sets up the data structure */
7312
7313     dVAR;
7314     SV *sw  = NULL;
7315     SV *si  = NULL;         /* Input swash initialization string */
7316     SV*  invlist = NULL;
7317
7318     RXi_GET_DECL(prog,progi);
7319     const struct reg_data * const data = prog ? progi->data : NULL;
7320
7321     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7322
7323     assert(ANYOF_NONBITMAP(node));
7324
7325     if (data && data->count) {
7326         const U32 n = ARG(node);
7327
7328         if (data->what[n] == 's') {
7329             SV * const rv = MUTABLE_SV(data->data[n]);
7330             AV * const av = MUTABLE_AV(SvRV(rv));
7331             SV **const ary = AvARRAY(av);
7332             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7333
7334             si = *ary;  /* ary[0] = the string to initialize the swash with */
7335
7336             /* Elements 2 and 3 are either both present or both absent. [2] is
7337              * any inversion list generated at compile time; [3] indicates if
7338              * that inversion list has any user-defined properties in it. */
7339             if (av_len(av) >= 2) {
7340                 invlist = ary[2];
7341                 if (SvUV(ary[3])) {
7342                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7343                 }
7344             }
7345             else {
7346                 invlist = NULL;
7347             }
7348
7349             /* Element [1] is reserved for the set-up swash.  If already there,
7350              * return it; if not, create it and store it there */
7351             if (ary[1] && SvROK(ary[1])) {
7352                 sw = ary[1];
7353             }
7354             else if (si && doinit) {
7355
7356                 sw = _core_swash_init("utf8", /* the utf8 package */
7357                                       "", /* nameless */
7358                                       si,
7359                                       1, /* binary */
7360                                       0, /* not from tr/// */
7361                                       invlist,
7362                                       &swash_init_flags);
7363                 (void)av_store(av, 1, sw);
7364             }
7365         }
7366     }
7367
7368     /* If requested, return a printable version of what this swash matches */
7369     if (listsvp) {
7370         SV* matches_string = newSVpvn("", 0);
7371
7372         /* The swash should be used, if possible, to get the data, as it
7373          * contains the resolved data.  But this function can be called at
7374          * compile-time, before everything gets resolved, in which case we
7375          * return the currently best available information, which is the string
7376          * that will eventually be used to do that resolving, 'si' */
7377         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7378             && (si && si != &PL_sv_undef))
7379         {
7380             sv_catsv(matches_string, si);
7381         }
7382
7383         /* Add the inversion list to whatever we have.  This may have come from
7384          * the swash, or from an input parameter */
7385         if (invlist) {
7386             sv_catsv(matches_string, _invlist_contents(invlist));
7387         }
7388         *listsvp = matches_string;
7389     }
7390
7391     return sw;
7392 }
7393
7394 /*
7395  - reginclass - determine if a character falls into a character class
7396
7397   n is the ANYOF regnode
7398   p is the target string
7399   utf8_target tells whether p is in UTF-8.
7400
7401   Returns true if matched; false otherwise.
7402
7403   Note that this can be a synthetic start class, a combination of various
7404   nodes, so things you think might be mutually exclusive, such as locale,
7405   aren't.  It can match both locale and non-locale
7406
7407  */
7408
7409 STATIC bool
7410 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7411 {
7412     dVAR;
7413     const char flags = ANYOF_FLAGS(n);
7414     bool match = FALSE;
7415     UV c = *p;
7416
7417     PERL_ARGS_ASSERT_REGINCLASS;
7418
7419     /* If c is not already the code point, get it.  Note that
7420      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7421     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7422         STRLEN c_len = 0;
7423         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7424                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7425                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7426                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7427                  * UTF8_ALLOW_FFFF */
7428         if (c_len == (STRLEN)-1)
7429             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7430     }
7431
7432     /* If this character is potentially in the bitmap, check it */
7433     if (c < 256) {
7434         if (ANYOF_BITMAP_TEST(n, c))
7435             match = TRUE;
7436         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7437                 && ! utf8_target
7438                 && ! isASCII(c))
7439         {
7440             match = TRUE;
7441         }
7442         else if (flags & ANYOF_LOCALE) {
7443             RXp_MATCH_TAINTED_on(prog);
7444
7445             if ((flags & ANYOF_LOC_FOLD)
7446                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7447             {
7448                 match = TRUE;
7449             }
7450             else if (ANYOF_POSIXL_TEST_ANY_SET(n)) {
7451
7452                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7453                  * if the class includes the Posix character class given by
7454                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7455                  * complemented Posix class given by int(bit/2).  So we loop
7456                  * through the bits, each time changing whether we complement
7457                  * the result or not.  Suppose for the sake of illustration
7458                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7459                  * is set, it means there is a match for this ANYOF node if the
7460                  * character is in the class given by the expression (0 / 2 = 0
7461                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7462                  * and since 'to_complement' is 0, the result will stay TRUE,
7463                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7464                  * bit 1 is 1.  That means there is a match if the character
7465                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7466                  * but will on bit 1.  On the second iteration 'to_complement'
7467                  * will be 1, so the exclusive or will reverse things, so we
7468                  * are testing for \W.  On the third iteration, 'to_complement'
7469                  * will be 0, and we would be testing for \s; the fourth
7470                  * iteration would test for \S, etc.
7471                  *
7472                  * Note that this code assumes that all the classes are closed
7473                  * under folding.  For example, if a character matches \w, then
7474                  * its fold does too; and vice versa.  This should be true for
7475                  * any well-behaved locale for all the currently defined Posix
7476                  * classes, except for :lower: and :upper:, which are handled
7477                  * by the pseudo-class :cased: which matches if either of the
7478                  * other two does.  To get rid of this assumption, an outer
7479                  * loop could be used below to iterate over both the source
7480                  * character, and its fold (if different) */
7481
7482                 int count = 0;
7483                 int to_complement = 0;
7484                 while (count < ANYOF_MAX) {
7485                     if (ANYOF_POSIXL_TEST(n, count)
7486                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7487                     {
7488                         match = TRUE;
7489                         break;
7490                     }
7491                     count++;
7492                     to_complement ^= 1;
7493                 }
7494             }
7495         }
7496     }
7497
7498     /* If the bitmap didn't (or couldn't) match, and something outside the
7499      * bitmap could match, try that.  Locale nodes specify completely the
7500      * behavior of code points in the bit map (otherwise, a utf8 target would
7501      * cause them to be treated as Unicode and not locale), except in
7502      * the very unlikely event when this node is a synthetic start class, which
7503      * could be a combination of locale and non-locale nodes.  So allow locale
7504      * to match for the synthetic start class, which will give a false
7505      * positive that will be resolved when the match is done again as not part
7506      * of the synthetic start class */
7507     if (!match) {
7508         if (utf8_target && (flags & ANYOF_ABOVE_LATIN1_ALL) && c >= 256) {
7509             match = TRUE;       /* Everything above 255 matches */
7510         }
7511         else if (ANYOF_NONBITMAP(n)
7512                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7513                      || (utf8_target
7514                          && (c >=256
7515                              || (! (flags & ANYOF_LOCALE))
7516                              || OP(n) == ANYOF_SYNTHETIC))))
7517         {
7518             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7519             if (sw) {
7520                 U8 * utf8_p;
7521                 if (utf8_target) {
7522                     utf8_p = (U8 *) p;
7523                 } else { /* Convert to utf8 */
7524                     STRLEN len = 1;
7525                     utf8_p = bytes_to_utf8(p, &len);
7526                 }
7527
7528                 if (swash_fetch(sw, utf8_p, TRUE)) {
7529                     match = TRUE;
7530                 }
7531
7532                 /* If we allocated a string above, free it */
7533                 if (! utf8_target) Safefree(utf8_p);
7534             }
7535         }
7536
7537         if (UNICODE_IS_SUPER(c)
7538             && OP(n) == ANYOF_WARN_SUPER
7539             && ckWARN_d(WARN_NON_UNICODE))
7540         {
7541             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7542                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7543         }
7544     }
7545
7546     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7547     return cBOOL(flags & ANYOF_INVERT) ^ match;
7548 }
7549
7550 STATIC U8 *
7551 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7552 {
7553     /* return the position 'off' UTF-8 characters away from 's', forward if
7554      * 'off' >= 0, backwards if negative.  But don't go outside of position
7555      * 'lim', which better be < s  if off < 0 */
7556
7557     dVAR;
7558
7559     PERL_ARGS_ASSERT_REGHOP3;
7560
7561     if (off >= 0) {
7562         while (off-- && s < lim) {
7563             /* XXX could check well-formedness here */
7564             s += UTF8SKIP(s);
7565         }
7566     }
7567     else {
7568         while (off++ && s > lim) {
7569             s--;
7570             if (UTF8_IS_CONTINUED(*s)) {
7571                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7572                     s--;
7573             }
7574             /* XXX could check well-formedness here */
7575         }
7576     }
7577     return s;
7578 }
7579
7580 #ifdef XXX_dmq
7581 /* there are a bunch of places where we use two reghop3's that should
7582    be replaced with this routine. but since thats not done yet
7583    we ifdef it out - dmq
7584 */
7585 STATIC U8 *
7586 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7587 {
7588     dVAR;
7589
7590     PERL_ARGS_ASSERT_REGHOP4;
7591
7592     if (off >= 0) {
7593         while (off-- && s < rlim) {
7594             /* XXX could check well-formedness here */
7595             s += UTF8SKIP(s);
7596         }
7597     }
7598     else {
7599         while (off++ && s > llim) {
7600             s--;
7601             if (UTF8_IS_CONTINUED(*s)) {
7602                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7603                     s--;
7604             }
7605             /* XXX could check well-formedness here */
7606         }
7607     }
7608     return s;
7609 }
7610 #endif
7611
7612 STATIC U8 *
7613 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7614 {
7615     dVAR;
7616
7617     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7618
7619     if (off >= 0) {
7620         while (off-- && s < lim) {
7621             /* XXX could check well-formedness here */
7622             s += UTF8SKIP(s);
7623         }
7624         if (off >= 0)
7625             return NULL;
7626     }
7627     else {
7628         while (off++ && s > lim) {
7629             s--;
7630             if (UTF8_IS_CONTINUED(*s)) {
7631                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7632                     s--;
7633             }
7634             /* XXX could check well-formedness here */
7635         }
7636         if (off <= 0)
7637             return NULL;
7638     }
7639     return s;
7640 }
7641
7642
7643 /* when executing a regex that may have (?{}), extra stuff needs setting
7644    up that will be visible to the called code, even before the current
7645    match has finished. In particular:
7646
7647    * $_ is localised to the SV currently being matched;
7648    * pos($_) is created if necessary, ready to be updated on each call-out
7649      to code;
7650    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7651      isn't set until the current pattern is successfully finished), so that
7652      $1 etc of the match-so-far can be seen;
7653    * save the old values of subbeg etc of the current regex, and  set then
7654      to the current string (again, this is normally only done at the end
7655      of execution)
7656 */
7657
7658 static void
7659 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7660 {
7661     MAGIC *mg;
7662     regexp *const rex = ReANY(reginfo->prog);
7663     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7664
7665     eval_state->rex = rex;
7666
7667     if (reginfo->sv) {
7668         /* Make $_ available to executed code. */
7669         if (reginfo->sv != DEFSV) {
7670             SAVE_DEFSV;
7671             DEFSV_set(reginfo->sv);
7672         }
7673
7674         if (!(mg = mg_find_mglob(reginfo->sv))) {
7675             /* prepare for quick setting of pos */
7676             mg = sv_magicext_mglob(reginfo->sv);
7677             mg->mg_len = -1;
7678         }
7679         eval_state->pos_magic = mg;
7680         eval_state->pos       = mg->mg_len;
7681         eval_state->pos_flags = mg->mg_flags;
7682     }
7683     else
7684         eval_state->pos_magic = NULL;
7685
7686     if (!PL_reg_curpm) {
7687         /* PL_reg_curpm is a fake PMOP that we can attach the current
7688          * regex to and point PL_curpm at, so that $1 et al are visible
7689          * within a /(?{})/. It's just allocated once per interpreter the
7690          * first time its needed */
7691         Newxz(PL_reg_curpm, 1, PMOP);
7692 #ifdef USE_ITHREADS
7693         {
7694             SV* const repointer = &PL_sv_undef;
7695             /* this regexp is also owned by the new PL_reg_curpm, which
7696                will try to free it.  */
7697             av_push(PL_regex_padav, repointer);
7698             PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7699             PL_regex_pad = AvARRAY(PL_regex_padav);
7700         }
7701 #endif
7702     }
7703     SET_reg_curpm(reginfo->prog);
7704     eval_state->curpm = PL_curpm;
7705     PL_curpm = PL_reg_curpm;
7706     if (RXp_MATCH_COPIED(rex)) {
7707         /*  Here is a serious problem: we cannot rewrite subbeg,
7708             since it may be needed if this match fails.  Thus
7709             $` inside (?{}) could fail... */
7710         eval_state->subbeg     = rex->subbeg;
7711         eval_state->sublen     = rex->sublen;
7712         eval_state->suboffset  = rex->suboffset;
7713         eval_state->subcoffset = rex->subcoffset;
7714 #ifdef PERL_ANY_COW
7715         eval_state->saved_copy = rex->saved_copy;
7716 #endif
7717         RXp_MATCH_COPIED_off(rex);
7718     }
7719     else
7720         eval_state->subbeg = NULL;
7721     rex->subbeg = (char *)reginfo->strbeg;
7722     rex->suboffset = 0;
7723     rex->subcoffset = 0;
7724     rex->sublen = reginfo->strend - reginfo->strbeg;
7725 }
7726
7727
7728 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7729
7730 static void
7731 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7732 {
7733     dVAR;
7734     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7735     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7736     regmatch_slab *s;
7737
7738     Safefree(aux->poscache);
7739
7740     if (eval_state) {
7741
7742         /* undo the effects of S_setup_eval_state() */
7743
7744         if (eval_state->subbeg) {
7745             regexp * const rex = eval_state->rex;
7746             rex->subbeg     = eval_state->subbeg;
7747             rex->sublen     = eval_state->sublen;
7748             rex->suboffset  = eval_state->suboffset;
7749             rex->subcoffset = eval_state->subcoffset;
7750 #ifdef PERL_ANY_COW
7751             rex->saved_copy = eval_state->saved_copy;
7752 #endif
7753             RXp_MATCH_COPIED_on(rex);
7754         }
7755         if (eval_state->pos_magic)
7756         {
7757             eval_state->pos_magic->mg_len = eval_state->pos;
7758             eval_state->pos_magic->mg_flags =
7759                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
7760                | (eval_state->pos_flags & MGf_BYTES);
7761         }
7762
7763         PL_curpm = eval_state->curpm;
7764     }
7765
7766     PL_regmatch_state = aux->old_regmatch_state;
7767     PL_regmatch_slab  = aux->old_regmatch_slab;
7768
7769     /* free all slabs above current one - this must be the last action
7770      * of this function, as aux and eval_state are allocated within
7771      * slabs and may be freed here */
7772
7773     s = PL_regmatch_slab->next;
7774     if (s) {
7775         PL_regmatch_slab->next = NULL;
7776         while (s) {
7777             regmatch_slab * const osl = s;
7778             s = s->next;
7779             Safefree(osl);
7780         }
7781     }
7782 }
7783
7784
7785 STATIC void
7786 S_to_utf8_substr(pTHX_ regexp *prog)
7787 {
7788     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7789      * on the converted value */
7790
7791     int i = 1;
7792
7793     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7794
7795     do {
7796         if (prog->substrs->data[i].substr
7797             && !prog->substrs->data[i].utf8_substr) {
7798             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7799             prog->substrs->data[i].utf8_substr = sv;
7800             sv_utf8_upgrade(sv);
7801             if (SvVALID(prog->substrs->data[i].substr)) {
7802                 if (SvTAIL(prog->substrs->data[i].substr)) {
7803                     /* Trim the trailing \n that fbm_compile added last
7804                        time.  */
7805                     SvCUR_set(sv, SvCUR(sv) - 1);
7806                     /* Whilst this makes the SV technically "invalid" (as its
7807                        buffer is no longer followed by "\0") when fbm_compile()
7808                        adds the "\n" back, a "\0" is restored.  */
7809                     fbm_compile(sv, FBMcf_TAIL);
7810                 } else
7811                     fbm_compile(sv, 0);
7812             }
7813             if (prog->substrs->data[i].substr == prog->check_substr)
7814                 prog->check_utf8 = sv;
7815         }
7816     } while (i--);
7817 }
7818
7819 STATIC bool
7820 S_to_byte_substr(pTHX_ regexp *prog)
7821 {
7822     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7823      * on the converted value; returns FALSE if can't be converted. */
7824
7825     dVAR;
7826     int i = 1;
7827
7828     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7829
7830     do {
7831         if (prog->substrs->data[i].utf8_substr
7832             && !prog->substrs->data[i].substr) {
7833             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7834             if (! sv_utf8_downgrade(sv, TRUE)) {
7835                 return FALSE;
7836             }
7837             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7838                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7839                     /* Trim the trailing \n that fbm_compile added last
7840                         time.  */
7841                     SvCUR_set(sv, SvCUR(sv) - 1);
7842                     fbm_compile(sv, FBMcf_TAIL);
7843                 } else
7844                     fbm_compile(sv, 0);
7845             }
7846             prog->substrs->data[i].substr = sv;
7847             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7848                 prog->check_substr = sv;
7849         }
7850     } while (i--);
7851
7852     return TRUE;
7853 }
7854
7855 /*
7856  * Local variables:
7857  * c-indentation-style: bsd
7858  * c-basic-offset: 4
7859  * indent-tabs-mode: nil
7860  * End:
7861  *
7862  * ex: set ts=8 sts=4 sw=4 et:
7863  */