regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define RF_tainted      1       /* tainted information used? e.g. locale */
  97 #define RF_warned       2               /* warned about big count? */
  98
  99 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 100
 101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 102
 103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 104
 105 #ifndef STATIC
 106 #define STATIC  static
 107 #endif
 108
 109 /* Valid for non-utf8 strings: avoids the reginclass
 110  * call if there are no complications: i.e., if everything matchable is
 111  * straight forward in the bitmap */
 112 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 113                                               : ANYOF_BITMAP_TEST(p,*(c)))
 114
 115 /*
 116  * Forwards.
 117  */
 118
 119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 121
 122 #define HOPc(pos,off) \
 123         (char *)(PL_reg_match_utf8 \
 124             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 125             : (U8*)(pos + off))
 126 #define HOPBACKc(pos, off) \
 127         (char*)(PL_reg_match_utf8\
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 129             : (pos - off >= PL_bostr)           \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136
 137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 138 #define NEXTCHR_IS_EOS (nextchr < 0)
 139
 140 #define SET_nextchr \
 141     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 142
 143 #define SET_locinput(p) \
 144     locinput = (p);  \
 145     SET_nextchr
 146
 147
 148 /* these are unrolled below in the CCC_TRY_XXX defined */
 149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 150     if (!CAT2(PL_utf8_,class)) { \
 151         bool ok; \
 152         ENTER; save_re_context(); \
 153         ok=CAT2(is_utf8_,class)((const U8*)str); \
 154         PERL_UNUSED_VAR(ok); \
 155         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 156 /* Doesn't do an assert to verify that is correct */
 157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 158     if (!CAT2(PL_utf8_,class)) { \
 159         bool throw_away PERL_UNUSED_DECL; \
 160         ENTER; save_re_context(); \
 161         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 162         LEAVE; } } STMT_END
 163
 164 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 165 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 166 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 167
 168 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 169         /* No asserts are done for some of these, in case called on a   */  \
 170         /* Unicode version in which they map to nothing */                  \
 171         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 172         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 173
 174 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 175
 176 /* The actual code for CCC_TRY, which uses several variables from the routine
 177  * it's callable from.  It is designed to be the bulk of a case statement.
 178  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 179  *      nextchr matches the class.
 180  * UTF8_TEST is the whole test string to use for utf8 targets
 181  * LOAD is what to use to test, and if not present to load in the swash for the
 182  *      class
 183  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 184  *      UTF8_TEST test.
 185  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 186  * utf8 and a variant, load the swash if necessary and test using the utf8
 187  * test.  Advance to the next character if test is ok, otherwise fail; If not
 188  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 189  * fails, or advance to the next character */
 190
 191 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 192     if (NEXTCHR_IS_EOS) {                                              \
 193         sayNO;                                                                \
 194     }                                                                         \
 195     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 196         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 197         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 198             sayNO;                                                            \
 199         }                                                                     \
 200     }                                                                         \
 201     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 202             sayNO;                                                            \
 203     }                                                                         \
 204     goto increment_locinput;
 205
 206 /* Handle the non-locale cases for a character class and its complement.  It
 207  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 208  * This is because that code fails when the test succeeds, so we want to have
 209  * the test fail so that the code succeeds.  The swash is stored in a
 210  * predictable PL_ place */
 211 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 212                            CLASS, STR)                                        \
 213     case NAME:                                                                \
 214         _CCC_TRY_CODE( !, FUNC,                                               \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218     case NNAME:                                                               \
 219         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 220                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 221                                             (U8*)locinput, TRUE)),            \
 222                           CLASS, STR)                                         \
 223
 224 /* Generate the case statements for both locale and non-locale character
 225  * classes in regmatch for classes that don't have special unicode semantics.
 226  * Locales don't use an immediate swash, but an intermediary special locale
 227  * function that is called on the pointer to the current place in the input
 228  * string.  That function will resolve to needing the same swash.  One might
 229  * think that because we don't know what the locale will match, we shouldn't
 230  * check with the swash loading function that it loaded properly; ie, that we
 231  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 232  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 233  * irrelevant here */
 234 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 235                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 236                 NAMEA, NNAMEA, FUNCA,                                         \
 237                 CLASS, STR)                                                   \
 238     case NAMEL:                                                               \
 239         PL_reg_flags |= RF_tainted;                                           \
 240         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 241     case NNAMEL:                                                              \
 242         PL_reg_flags |= RF_tainted;                                           \
 243         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 244                        CLASS, STR)                                            \
 245     case NAMEA:                                                               \
 246         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 247             sayNO;                                                            \
 248         }                                                                     \
 249         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 250         locinput++;                                        \
 251         break;                                                                \
 252     case NNAMEA:                                                              \
 253         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 254             sayNO;                                                            \
 255         }                                                                     \
 256         goto increment_locinput;                                              \
 257     /* Generate the non-locale cases */                                       \
 258     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 259
 260 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 261  * statements to handle separate Unicode semantics nodes */
 262 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 263                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 264                   NAMEU, NNAMEU, FUNCU,                                        \
 265                   NAMEA, NNAMEA, FUNCA,                                        \
 266                   CLASS, STR)                                                  \
 267     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 268             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 269             NAMEA, NNAMEA, FUNCA,                                              \
 270             CLASS, STR)                                                        \
 271     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 272
 273 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 274
 275 /* for use after a quantifier and before an EXACT-like node -- japhy */
 276 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 277  *
 278  * NOTE that *nothing* that affects backtracking should be in here, specifically
 279  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 280  * node that is in between two EXACT like nodes when ascertaining what the required
 281  * "follow" character is. This should probably be moved to regex compile time
 282  * although it may be done at run time beause of the REF possibility - more
 283  * investigation required. -- demerphq
 284 */
 285 #define JUMPABLE(rn) (      \
 286     OP(rn) == OPEN ||       \
 287     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 288     OP(rn) == EVAL ||   \
 289     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 290     OP(rn) == PLUS || OP(rn) == MINMOD || \
 291     OP(rn) == KEEPS || \
 292     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 293 )
 294 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 295
 296 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 297
 298 #if 0
 299 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 300    we don't need this definition. */
 301 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 302 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 304
 305 #else
 306 /* ... so we use this as its faster. */
 307 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 308 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 309 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 310 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 311
 312 #endif
 313
 314 /*
 315   Search for mandatory following text node; for lookahead, the text must
 316   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 317 */
 318 #define FIND_NEXT_IMPT(rn) STMT_START { \
 319     while (JUMPABLE(rn)) { \
 320         const OPCODE type = OP(rn); \
 321         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 322             rn = NEXTOPER(NEXTOPER(rn)); \
 323         else if (type == PLUS) \
 324             rn = NEXTOPER(rn); \
 325         else if (type == IFMATCH) \
 326             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 327         else rn += NEXT_OFF(rn); \
 328     } \
 329 } STMT_END
 330
 331
 332 static void restore_pos(pTHX_ void *arg);
 333
 334 #define REGCP_PAREN_ELEMS 3
 335 #define REGCP_OTHER_ELEMS 3
 336 #define REGCP_FRAME_ELEMS 1
 337 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 338  * are needed for the regexp context stack bookkeeping. */
 339
 340 STATIC CHECKPOINT
 341 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 342 {
 343     dVAR;
 344     const int retval = PL_savestack_ix;
 345     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 346     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 347     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 348     I32 p;
 349     GET_RE_DEBUG_FLAGS_DECL;
 350
 351     PERL_ARGS_ASSERT_REGCPPUSH;
 352
 353     if (paren_elems_to_push < 0)
 354         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 355                    paren_elems_to_push);
 356
 357     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 358         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 359                    " out of range (%lu-%ld)",
 360                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 361
 362     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 363
 364     DEBUG_BUFFERS_r(
 365         if ((int)PL_regsize > (int)parenfloor)
 366             PerlIO_printf(Perl_debug_log,
 367                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 368                 PTR2UV(rex),
 369                 PTR2UV(rex->offs)
 370             );
 371     );
 372     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 373 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 374         SSPUSHINT(rex->offs[p].end);
 375         SSPUSHINT(rex->offs[p].start);
 376         SSPUSHINT(rex->offs[p].start_tmp);
 377         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 378             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 379             (UV)p,
 380             (IV)rex->offs[p].start,
 381             (IV)rex->offs[p].start_tmp,
 382             (IV)rex->offs[p].end
 383         ));
 384     }
 385 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 386     SSPUSHINT(PL_regsize);
 387     SSPUSHINT(rex->lastparen);
 388     SSPUSHINT(rex->lastcloseparen);
 389     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 390
 391     return retval;
 392 }
 393
 394 /* These are needed since we do not localize EVAL nodes: */
 395 #define REGCP_SET(cp)                                           \
 396     DEBUG_STATE_r(                                              \
 397             PerlIO_printf(Perl_debug_log,                       \
 398                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 399                 (IV)PL_savestack_ix));                          \
 400     cp = PL_savestack_ix
 401
 402 #define REGCP_UNWIND(cp)                                        \
 403     DEBUG_STATE_r(                                              \
 404         if (cp != PL_savestack_ix)                              \
 405             PerlIO_printf(Perl_debug_log,                       \
 406                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 407                 (IV)(cp), (IV)PL_savestack_ix));                \
 408     regcpblow(cp)
 409
 410 #define UNWIND_PAREN(lp, lcp)               \
 411     for (n = rex->lastparen; n > lp; n--)   \
 412         rex->offs[n].end = -1;              \
 413     rex->lastparen = n;                     \
 414     rex->lastcloseparen = lcp;
 415
 416
 417 STATIC void
 418 S_regcppop(pTHX_ regexp *rex)
 419 {
 420     dVAR;
 421     UV i;
 422     U32 paren;
 423     GET_RE_DEBUG_FLAGS_DECL;
 424
 425     PERL_ARGS_ASSERT_REGCPPOP;
 426
 427     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 428     i = SSPOPUV;
 429     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 430     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 431     rex->lastcloseparen = SSPOPINT;
 432     rex->lastparen = SSPOPINT;
 433     PL_regsize = SSPOPINT;
 434
 435     i -= REGCP_OTHER_ELEMS;
 436     /* Now restore the parentheses context. */
 437     DEBUG_BUFFERS_r(
 438         if (i || rex->lastparen + 1 <= rex->nparens)
 439             PerlIO_printf(Perl_debug_log,
 440                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 441                 PTR2UV(rex),
 442                 PTR2UV(rex->offs)
 443             );
 444     );
 445     paren = PL_regsize;
 446     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 447         I32 tmps;
 448         rex->offs[paren].start_tmp = SSPOPINT;
 449         rex->offs[paren].start = SSPOPINT;
 450         tmps = SSPOPINT;
 451         if (paren <= rex->lastparen)
 452             rex->offs[paren].end = tmps;
 453         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 454             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 455             (UV)paren,
 456             (IV)rex->offs[paren].start,
 457             (IV)rex->offs[paren].start_tmp,
 458             (IV)rex->offs[paren].end,
 459             (paren > rex->lastparen ? "(skipped)" : ""));
 460         );
 461         paren--;
 462     }
 463 #if 1
 464     /* It would seem that the similar code in regtry()
 465      * already takes care of this, and in fact it is in
 466      * a better location to since this code can #if 0-ed out
 467      * but the code in regtry() is needed or otherwise tests
 468      * requiring null fields (pat.t#187 and split.t#{13,14}
 469      * (as of patchlevel 7877)  will fail.  Then again,
 470      * this code seems to be necessary or otherwise
 471      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 472      * --jhi updated by dapm */
 473     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 474         if (i > PL_regsize)
 475             rex->offs[i].start = -1;
 476         rex->offs[i].end = -1;
 477         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 478             "    \\%"UVuf": %s   ..-1 undeffing\n",
 479             (UV)i,
 480             (i > PL_regsize) ? "-1" : "  "
 481         ));
 482     }
 483 #endif
 484 }
 485
 486 /* restore the parens and associated vars at savestack position ix,
 487  * but without popping the stack */
 488
 489 STATIC void
 490 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 491 {
 492     I32 tmpix = PL_savestack_ix;
 493     PL_savestack_ix = ix;
 494     regcppop(rex);
 495     PL_savestack_ix = tmpix;
 496 }
 497
 498 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 499
 500 /*
 501  * pregexec and friends
 502  */
 503
 504 #ifndef PERL_IN_XSUB_RE
 505 /*
 506  - pregexec - match a regexp against a string
 507  */
 508 I32
 509 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 510          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 511 /* stringarg: the point in the string at which to begin matching */
 512 /* strend:    pointer to null at end of string */
 513 /* strbeg:    real beginning of string */
 514 /* minend:    end of match must be >= minend bytes after stringarg. */
 515 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 516  *            itself is accessed via the pointers above */
 517 /* nosave:    For optimizations. */
 518 {
 519     PERL_ARGS_ASSERT_PREGEXEC;
 520
 521     return
 522         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 523                       nosave ? 0 : REXEC_COPY_STR);
 524 }
 525 #endif
 526
 527 /*
 528  * Need to implement the following flags for reg_anch:
 529  *
 530  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 531  * USE_INTUIT_ML
 532  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 533  * INTUIT_AUTORITATIVE_ML
 534  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 535  * INTUIT_ONCE_ML
 536  *
 537  * Another flag for this function: SECOND_TIME (so that float substrs
 538  * with giant delta may be not rechecked).
 539  */
 540
 541 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 542
 543 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 544    Otherwise, only SvCUR(sv) is used to get strbeg. */
 545
 546 /* XXXX We assume that strpos is strbeg unless sv. */
 547
 548 /* XXXX Some places assume that there is a fixed substring.
 549         An update may be needed if optimizer marks as "INTUITable"
 550         RExen without fixed substrings.  Similarly, it is assumed that
 551         lengths of all the strings are no more than minlen, thus they
 552         cannot come from lookahead.
 553         (Or minlen should take into account lookahead.)
 554   NOTE: Some of this comment is not correct. minlen does now take account
 555   of lookahead/behind. Further research is required. -- demerphq
 556
 557 */
 558
 559 /* A failure to find a constant substring means that there is no need to make
 560    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 561    finding a substring too deep into the string means that less calls to
 562    regtry() should be needed.
 563
 564    REx compiler's optimizer found 4 possible hints:
 565         a) Anchored substring;
 566         b) Fixed substring;
 567         c) Whether we are anchored (beginning-of-line or \G);
 568         d) First node (of those at offset 0) which may distinguish positions;
 569    We use a)b)d) and multiline-part of c), and try to find a position in the
 570    string which does not contradict any of them.
 571  */
 572
 573 /* Most of decisions we do here should have been done at compile time.
 574    The nodes of the REx which we used for the search should have been
 575    deleted from the finite automaton. */
 576
 577 char *
 578 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 579                      char *strend, const U32 flags, re_scream_pos_data *data)
 580 {
 581     dVAR;
 582     struct regexp *const prog = ReANY(rx);
 583     I32 start_shift = 0;
 584     /* Should be nonnegative! */
 585     I32 end_shift   = 0;
 586     char *s;
 587     SV *check;
 588     char *strbeg;
 589     char *t;
 590     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 591     I32 ml_anch;
 592     char *other_last = NULL;    /* other substr checked before this */
 593     char *check_at = NULL;              /* check substr found at this pos */
 594     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 595     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 596     RXi_GET_DECL(prog,progi);
 597 #ifdef DEBUGGING
 598     const char * const i_strpos = strpos;
 599 #endif
 600     GET_RE_DEBUG_FLAGS_DECL;
 601
 602     PERL_ARGS_ASSERT_RE_INTUIT_START;
 603     PERL_UNUSED_ARG(flags);
 604     PERL_UNUSED_ARG(data);
 605
 606     RX_MATCH_UTF8_set(rx,utf8_target);
 607
 608     if (RX_UTF8(rx)) {
 609         PL_reg_flags |= RF_utf8;
 610     }
 611     DEBUG_EXECUTE_r(
 612         debug_start_match(rx, utf8_target, strpos, strend,
 613             sv ? "Guessing start of match in sv for"
 614                : "Guessing start of match in string for");
 615               );
 616
 617     /* CHR_DIST() would be more correct here but it makes things slow. */
 618     if (prog->minlen > strend - strpos) {
 619         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 620                               "String too short... [re_intuit_start]\n"));
 621         goto fail;
 622     }
 623
 624     /* XXX we need to pass strbeg as a separate arg: the following is
 625      * guesswork and can be wrong... */
 626     if (sv && SvPOK(sv)) {
 627         char * p   = SvPVX(sv);
 628         STRLEN cur = SvCUR(sv);
 629         if (p <= strpos && strpos < p + cur) {
 630             strbeg = p;
 631             assert(p <= strend && strend <= p + cur);
 632         }
 633         else
 634             strbeg = strend - cur;
 635     }
 636     else
 637         strbeg = strpos;
 638
 639     PL_regeol = strend;
 640     if (utf8_target) {
 641         if (!prog->check_utf8 && prog->check_substr)
 642             to_utf8_substr(prog);
 643         check = prog->check_utf8;
 644     } else {
 645         if (!prog->check_substr && prog->check_utf8) {
 646             if (! to_byte_substr(prog)) {
 647                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 648             }
 649         }
 650         check = prog->check_substr;
 651     }
 652     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 653         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 654                      || ( (prog->extflags & RXf_ANCH_BOL)
 655                           && !multiline ) );    /* Check after \n? */
 656
 657         if (!ml_anch) {
 658           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 659                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 660                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 661                && sv && !SvROK(sv)
 662                && (strpos != strbeg)) {
 663               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 664               goto fail;
 665           }
 666           if (prog->check_offset_min == prog->check_offset_max
 667               && !(prog->extflags & RXf_CANY_SEEN)
 668               && ! multiline)   /* /m can cause \n's to match that aren't
 669                                    accounted for in the string max length.
 670                                    See [perl #115242] */
 671           {
 672             /* Substring at constant offset from beg-of-str... */
 673             I32 slen;
 674
 675             s = HOP3c(strpos, prog->check_offset_min, strend);
 676
 677             if (SvTAIL(check)) {
 678                 slen = SvCUR(check);    /* >= 1 */
 679
 680                 if ( strend - s > slen || strend - s < slen - 1
 681                      || (strend - s == slen && strend[-1] != '\n')) {
 682                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 683                     goto fail_finish;
 684                 }
 685                 /* Now should match s[0..slen-2] */
 686                 slen--;
 687                 if (slen && (*SvPVX_const(check) != *s
 688                              || (slen > 1
 689                                  && memNE(SvPVX_const(check), s, slen)))) {
 690                   report_neq:
 691                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 692                     goto fail_finish;
 693                 }
 694             }
 695             else if (*SvPVX_const(check) != *s
 696                      || ((slen = SvCUR(check)) > 1
 697                          && memNE(SvPVX_const(check), s, slen)))
 698                 goto report_neq;
 699             check_at = s;
 700             goto success_at_start;
 701           }
 702         }
 703         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 704         s = strpos;
 705         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 706         end_shift = prog->check_end_shift;
 707
 708         if (!ml_anch) {
 709             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 710                                          - (SvTAIL(check) != 0);
 711             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 712
 713             if (end_shift < eshift)
 714                 end_shift = eshift;
 715         }
 716     }
 717     else {                              /* Can match at random position */
 718         ml_anch = 0;
 719         s = strpos;
 720         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 721         end_shift = prog->check_end_shift;
 722
 723         /* end shift should be non negative here */
 724     }
 725
 726 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 727     if (end_shift < 0)
 728         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 729                    (IV)end_shift, RX_PRECOMP(prog));
 730 #endif
 731
 732   restart:
 733     /* Find a possible match in the region s..strend by looking for
 734        the "check" substring in the region corrected by start/end_shift. */
 735
 736     {
 737         I32 srch_start_shift = start_shift;
 738         I32 srch_end_shift = end_shift;
 739         U8* start_point;
 740         U8* end_point;
 741         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 742             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 743             srch_start_shift = strbeg - s;
 744         }
 745     DEBUG_OPTIMISE_MORE_r({
 746         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 747             (IV)prog->check_offset_min,
 748             (IV)srch_start_shift,
 749             (IV)srch_end_shift,
 750             (IV)prog->check_end_shift);
 751     });
 752
 753         if (prog->extflags & RXf_CANY_SEEN) {
 754             start_point= (U8*)(s + srch_start_shift);
 755             end_point= (U8*)(strend - srch_end_shift);
 756         } else {
 757             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 758             end_point= HOP3(strend, -srch_end_shift, strbeg);
 759         }
 760         DEBUG_OPTIMISE_MORE_r({
 761             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 762                 (int)(end_point - start_point),
 763                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 764                 start_point);
 765         });
 766
 767         s = fbm_instr( start_point, end_point,
 768                       check, multiline ? FBMrf_MULTILINE : 0);
 769     }
 770     /* Update the count-of-usability, remove useless subpatterns,
 771         unshift s.  */
 772
 773     DEBUG_EXECUTE_r({
 774         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 775             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 776         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 777                           (s ? "Found" : "Did not find"),
 778             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 779                 ? "anchored" : "floating"),
 780             quoted,
 781             RE_SV_TAIL(check),
 782             (s ? " at offset " : "...\n") );
 783     });
 784
 785     if (!s)
 786         goto fail_finish;
 787     /* Finish the diagnostic message */
 788     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 789
 790     /* XXX dmq: first branch is for positive lookbehind...
 791        Our check string is offset from the beginning of the pattern.
 792        So we need to do any stclass tests offset forward from that
 793        point. I think. :-(
 794      */
 795
 796
 797
 798     check_at=s;
 799
 800
 801     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 802        Start with the other substr.
 803        XXXX no SCREAM optimization yet - and a very coarse implementation
 804        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 805                 *always* match.  Probably should be marked during compile...
 806        Probably it is right to do no SCREAM here...
 807      */
 808
 809     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 810                 : (prog->float_substr && prog->anchored_substr))
 811     {
 812         /* Take into account the "other" substring. */
 813         /* XXXX May be hopelessly wrong for UTF... */
 814         if (!other_last)
 815             other_last = strpos;
 816         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 817           do_other_anchored:
 818             {
 819                 char * const last = HOP3c(s, -start_shift, strbeg);
 820                 char *last1, *last2;
 821                 char * const saved_s = s;
 822                 SV* must;
 823
 824                 t = s - prog->check_offset_max;
 825                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 826                     && (!utf8_target
 827                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 828                             && t > strpos)))
 829                     NOOP;
 830                 else
 831                     t = strpos;
 832                 t = HOP3c(t, prog->anchored_offset, strend);
 833                 if (t < other_last)     /* These positions already checked */
 834                     t = other_last;
 835                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 836                 if (last < last1)
 837                     last1 = last;
 838                 /* XXXX It is not documented what units *_offsets are in.
 839                    We assume bytes, but this is clearly wrong.
 840                    Meaning this code needs to be carefully reviewed for errors.
 841                    dmq.
 842                   */
 843
 844                 /* On end-of-str: see comment below. */
 845                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 846                 if (must == &PL_sv_undef) {
 847                     s = (char*)NULL;
 848                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 849                 }
 850                 else
 851                     s = fbm_instr(
 852                         (unsigned char*)t,
 853                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 854                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 855                         must,
 856                         multiline ? FBMrf_MULTILINE : 0
 857                     );
 858                 DEBUG_EXECUTE_r({
 859                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 860                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 861                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 862                         (s ? "Found" : "Contradicts"),
 863                         quoted, RE_SV_TAIL(must));
 864                 });
 865
 866
 867                 if (!s) {
 868                     if (last1 >= last2) {
 869                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 870                                                 ", giving up...\n"));
 871                         goto fail_finish;
 872                     }
 873                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 874                         ", trying floating at offset %ld...\n",
 875                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 876                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 877                     s = HOP3c(last, 1, strend);
 878                     goto restart;
 879                 }
 880                 else {
 881                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 882                           (long)(s - i_strpos)));
 883                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 884                     other_last = HOP3c(s, 1, strend);
 885                     s = saved_s;
 886                     if (t == strpos)
 887                         goto try_at_start;
 888                     goto try_at_offset;
 889                 }
 890             }
 891         }
 892         else {          /* Take into account the floating substring. */
 893             char *last, *last1;
 894             char * const saved_s = s;
 895             SV* must;
 896
 897             t = HOP3c(s, -start_shift, strbeg);
 898             last1 = last =
 899                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 900             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 901                 last = HOP3c(t, prog->float_max_offset, strend);
 902             s = HOP3c(t, prog->float_min_offset, strend);
 903             if (s < other_last)
 904                 s = other_last;
 905  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 906             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 907             /* fbm_instr() takes into account exact value of end-of-str
 908                if the check is SvTAIL(ed).  Since false positives are OK,
 909                and end-of-str is not later than strend we are OK. */
 910             if (must == &PL_sv_undef) {
 911                 s = (char*)NULL;
 912                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 913             }
 914             else
 915                 s = fbm_instr((unsigned char*)s,
 916                               (unsigned char*)last + SvCUR(must)
 917                                   - (SvTAIL(must)!=0),
 918                               must, multiline ? FBMrf_MULTILINE : 0);
 919             DEBUG_EXECUTE_r({
 920                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 921                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 922                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 923                     (s ? "Found" : "Contradicts"),
 924                     quoted, RE_SV_TAIL(must));
 925             });
 926             if (!s) {
 927                 if (last1 == last) {
 928                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 929                                             ", giving up...\n"));
 930                     goto fail_finish;
 931                 }
 932                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 933                     ", trying anchored starting at offset %ld...\n",
 934                     (long)(saved_s + 1 - i_strpos)));
 935                 other_last = last;
 936                 s = HOP3c(t, 1, strend);
 937                 goto restart;
 938             }
 939             else {
 940                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 941                       (long)(s - i_strpos)));
 942                 other_last = s; /* Fix this later. --Hugo */
 943                 s = saved_s;
 944                 if (t == strpos)
 945                     goto try_at_start;
 946                 goto try_at_offset;
 947             }
 948         }
 949     }
 950
 951
 952     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 953
 954     DEBUG_OPTIMISE_MORE_r(
 955         PerlIO_printf(Perl_debug_log,
 956             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 957             (IV)prog->check_offset_min,
 958             (IV)prog->check_offset_max,
 959             (IV)(s-strpos),
 960             (IV)(t-strpos),
 961             (IV)(t-s),
 962             (IV)(strend-strpos)
 963         )
 964     );
 965
 966     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 967         && (!utf8_target
 968             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 969                  && t > strpos)))
 970     {
 971         /* Fixed substring is found far enough so that the match
 972            cannot start at strpos. */
 973       try_at_offset:
 974         if (ml_anch && t[-1] != '\n') {
 975             /* Eventually fbm_*() should handle this, but often
 976                anchored_offset is not 0, so this check will not be wasted. */
 977             /* XXXX In the code below we prefer to look for "^" even in
 978                presence of anchored substrings.  And we search even
 979                beyond the found float position.  These pessimizations
 980                are historical artefacts only.  */
 981           find_anchor:
 982             while (t < strend - prog->minlen) {
 983                 if (*t == '\n') {
 984                     if (t < check_at - prog->check_offset_min) {
 985                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 986                             /* Since we moved from the found position,
 987                                we definitely contradict the found anchored
 988                                substr.  Due to the above check we do not
 989                                contradict "check" substr.
 990                                Thus we can arrive here only if check substr
 991                                is float.  Redo checking for "other"=="fixed".
 992                              */
 993                             strpos = t + 1;
 994                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 995                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 996                             goto do_other_anchored;
 997                         }
 998                         /* We don't contradict the found floating substring. */
 999                         /* XXXX Why not check for STCLASS? */
1000                         s = t + 1;
1001                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1002                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1003                         goto set_useful;
1004                     }
1005                     /* Position contradicts check-string */
1006                     /* XXXX probably better to look for check-string
1007                        than for "\n", so one should lower the limit for t? */
1008                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1009                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1010                     other_last = strpos = s = t + 1;
1011                     goto restart;
1012                 }
1013                 t++;
1014             }
1015             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1016                         PL_colors[0], PL_colors[1]));
1017             goto fail_finish;
1018         }
1019         else {
1020             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1021                         PL_colors[0], PL_colors[1]));
1022         }
1023         s = t;
1024       set_useful:
1025         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1026     }
1027     else {
1028         /* The found string does not prohibit matching at strpos,
1029            - no optimization of calling REx engine can be performed,
1030            unless it was an MBOL and we are not after MBOL,
1031            or a future STCLASS check will fail this. */
1032       try_at_start:
1033         /* Even in this situation we may use MBOL flag if strpos is offset
1034            wrt the start of the string. */
1035         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1036             && (strpos != strbeg) && strpos[-1] != '\n'
1037             /* May be due to an implicit anchor of m{.*foo}  */
1038             && !(prog->intflags & PREGf_IMPLICIT))
1039         {
1040             t = strpos;
1041             goto find_anchor;
1042         }
1043         DEBUG_EXECUTE_r( if (ml_anch)
1044             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1045                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1046         );
1047       success_at_start:
1048         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1049             && (utf8_target ? (
1050                 prog->check_utf8                /* Could be deleted already */
1051                 && --BmUSEFUL(prog->check_utf8) < 0
1052                 && (prog->check_utf8 == prog->float_utf8)
1053             ) : (
1054                 prog->check_substr              /* Could be deleted already */
1055                 && --BmUSEFUL(prog->check_substr) < 0
1056                 && (prog->check_substr == prog->float_substr)
1057             )))
1058         {
1059             /* If flags & SOMETHING - do not do it many times on the same match */
1060             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1061             /* XXX Does the destruction order has to change with utf8_target? */
1062             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1063             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1064             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1065             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1066             check = NULL;                       /* abort */
1067             s = strpos;
1068             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1069                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1070             if (prog->intflags & PREGf_IMPLICIT)
1071                 prog->extflags &= ~RXf_ANCH_MBOL;
1072             /* XXXX This is a remnant of the old implementation.  It
1073                     looks wasteful, since now INTUIT can use many
1074                     other heuristics. */
1075             prog->extflags &= ~RXf_USE_INTUIT;
1076             /* XXXX What other flags might need to be cleared in this branch? */
1077         }
1078         else
1079             s = strpos;
1080     }
1081
1082     /* Last resort... */
1083     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1084     /* trie stclasses are too expensive to use here, we are better off to
1085        leave it to regmatch itself */
1086     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1087         /* minlen == 0 is possible if regstclass is \b or \B,
1088            and the fixed substr is ''$.
1089            Since minlen is already taken into account, s+1 is before strend;
1090            accidentally, minlen >= 1 guaranties no false positives at s + 1
1091            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1092            regstclass does not come from lookahead...  */
1093         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1094            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1095         const U8* const str = (U8*)STRING(progi->regstclass);
1096         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1097                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1098                     : 1);
1099         char * endpos;
1100         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1101             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1102         else if (prog->float_substr || prog->float_utf8)
1103             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1104         else
1105             endpos= strend;
1106
1107         if (checked_upto < s)
1108            checked_upto = s;
1109         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1110                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1111
1112         t = s;
1113         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1114         if (s) {
1115             checked_upto = s;
1116         } else {
1117 #ifdef DEBUGGING
1118             const char *what = NULL;
1119 #endif
1120             if (endpos == strend) {
1121                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1122                                 "Could not match STCLASS...\n") );
1123                 goto fail;
1124             }
1125             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126                                    "This position contradicts STCLASS...\n") );
1127             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1128                 goto fail;
1129             checked_upto = HOPBACKc(endpos, start_shift);
1130             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1132             /* Contradict one of substrings */
1133             if (prog->anchored_substr || prog->anchored_utf8) {
1134                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1135                     DEBUG_EXECUTE_r( what = "anchored" );
1136                   hop_and_restart:
1137                     s = HOP3c(t, 1, strend);
1138                     if (s + start_shift + end_shift > strend) {
1139                         /* XXXX Should be taken into account earlier? */
1140                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141                                                "Could not match STCLASS...\n") );
1142                         goto fail;
1143                     }
1144                     if (!check)
1145                         goto giveup;
1146                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147                                 "Looking for %s substr starting at offset %ld...\n",
1148                                  what, (long)(s + start_shift - i_strpos)) );
1149                     goto restart;
1150                 }
1151                 /* Have both, check_string is floating */
1152                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1153                     goto retry_floating_check;
1154                 /* Recheck anchored substring, but not floating... */
1155                 s = check_at;
1156                 if (!check)
1157                     goto giveup;
1158                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1159                           "Looking for anchored substr starting at offset %ld...\n",
1160                           (long)(other_last - i_strpos)) );
1161                 goto do_other_anchored;
1162             }
1163             /* Another way we could have checked stclass at the
1164                current position only: */
1165             if (ml_anch) {
1166                 s = t = t + 1;
1167                 if (!check)
1168                     goto giveup;
1169                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1170                           "Looking for /%s^%s/m starting at offset %ld...\n",
1171                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1172                 goto try_at_offset;
1173             }
1174             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1175                 goto fail;
1176             /* Check is floating substring. */
1177           retry_floating_check:
1178             t = check_at - start_shift;
1179             DEBUG_EXECUTE_r( what = "floating" );
1180             goto hop_and_restart;
1181         }
1182         if (t != s) {
1183             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1184                         "By STCLASS: moving %ld --> %ld\n",
1185                                   (long)(t - i_strpos), (long)(s - i_strpos))
1186                    );
1187         }
1188         else {
1189             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1190                                   "Does not contradict STCLASS...\n");
1191                    );
1192         }
1193     }
1194   giveup:
1195     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1196                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1197                           PL_colors[5], (long)(s - i_strpos)) );
1198     return s;
1199
1200   fail_finish:                          /* Substring not found */
1201     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1202         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1203   fail:
1204     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1205                           PL_colors[4], PL_colors[5]));
1206     return NULL;
1207 }
1208
1209 #define DECL_TRIE_TYPE(scan) \
1210     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1211                     trie_type = ((scan->flags == EXACT) \
1212                               ? (utf8_target ? trie_utf8 : trie_plain) \
1213                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1214
1215 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1216 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1217     STRLEN skiplen;                                                                 \
1218     switch (trie_type) {                                                            \
1219     case trie_utf8_fold:                                                            \
1220         if ( foldlen>0 ) {                                                          \
1221             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1222             foldlen -= len;                                                         \
1223             uscan += len;                                                           \
1224             len=0;                                                                  \
1225         } else {                                                                    \
1226             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1227             len = UTF8SKIP(uc);                                                     \
1228             skiplen = UNISKIP( uvc );                                               \
1229             foldlen -= skiplen;                                                     \
1230             uscan = foldbuf + skiplen;                                              \
1231         }                                                                           \
1232         break;                                                                      \
1233     case trie_latin_utf8_fold:                                                      \
1234         if ( foldlen>0 ) {                                                          \
1235             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236             foldlen -= len;                                                         \
1237             uscan += len;                                                           \
1238             len=0;                                                                  \
1239         } else {                                                                    \
1240             len = 1;                                                                \
1241             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1242             skiplen = UNISKIP( uvc );                                               \
1243             foldlen -= skiplen;                                                     \
1244             uscan = foldbuf + skiplen;                                              \
1245         }                                                                           \
1246         break;                                                                      \
1247     case trie_utf8:                                                                 \
1248         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1249         break;                                                                      \
1250     case trie_plain:                                                                \
1251         uvc = (UV)*uc;                                                              \
1252         len = 1;                                                                    \
1253     }                                                                               \
1254     if (uvc < 256) {                                                                \
1255         charid = trie->charmap[ uvc ];                                              \
1256     }                                                                               \
1257     else {                                                                          \
1258         charid = 0;                                                                 \
1259         if (widecharmap) {                                                          \
1260             SV** const svpp = hv_fetch(widecharmap,                                 \
1261                         (char*)&uvc, sizeof(UV), 0);                                \
1262             if (svpp)                                                               \
1263                 charid = (U16)SvIV(*svpp);                                          \
1264         }                                                                           \
1265     }                                                                               \
1266 } STMT_END
1267
1268 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1269 STMT_START {                                              \
1270     while (s <= e) {                                      \
1271         if ( (CoNd)                                       \
1272              && (ln == 1 || folder(s, pat_string, ln))    \
1273              && (!reginfo || regtry(reginfo, &s)) )       \
1274             goto got_it;                                  \
1275         s++;                                              \
1276     }                                                     \
1277 } STMT_END
1278
1279 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1280 STMT_START {                                          \
1281     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1282         CoDe                                          \
1283         s += uskip;                                   \
1284     }                                                 \
1285 } STMT_END
1286
1287 #define REXEC_FBC_SCAN(CoDe)                          \
1288 STMT_START {                                          \
1289     while (s < strend) {                              \
1290         CoDe                                          \
1291         s++;                                          \
1292     }                                                 \
1293 } STMT_END
1294
1295 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1296 REXEC_FBC_UTF8_SCAN(                                  \
1297     if (CoNd) {                                       \
1298         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1299             goto got_it;                              \
1300         else                                          \
1301             tmp = doevery;                            \
1302     }                                                 \
1303     else                                              \
1304         tmp = 1;                                      \
1305 )
1306
1307 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1308 REXEC_FBC_SCAN(                                       \
1309     if (CoNd) {                                       \
1310         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1311             goto got_it;                              \
1312         else                                          \
1313             tmp = doevery;                            \
1314     }                                                 \
1315     else                                              \
1316         tmp = 1;                                      \
1317 )
1318
1319 #define REXEC_FBC_TRYIT               \
1320 if ((!reginfo || regtry(reginfo, &s))) \
1321     goto got_it
1322
1323 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1324     if (utf8_target) {                                             \
1325         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1326     }                                                          \
1327     else {                                                     \
1328         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1329     }
1330
1331 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1332     if (utf8_target) {                                             \
1333         UtFpReLoAd;                                            \
1334         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1335     }                                                          \
1336     else {                                                     \
1337         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1338     }
1339
1340 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1341     PL_reg_flags |= RF_tainted;                                \
1342     if (utf8_target) {                                             \
1343         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1344     }                                                          \
1345     else {                                                     \
1346         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1347     }
1348
1349 #define DUMP_EXEC_POS(li,s,doutf8) \
1350     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1351
1352
1353 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1354         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1355         tmp = TEST_NON_UTF8(tmp);                                              \
1356         REXEC_FBC_UTF8_SCAN(                                                   \
1357             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1358                 tmp = !tmp;                                                    \
1359                 IF_SUCCESS;                                                    \
1360             }                                                                  \
1361             else {                                                             \
1362                 IF_FAIL;                                                       \
1363             }                                                                  \
1364         );                                                                     \
1365
1366 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1367         if (s == PL_bostr) {                                                   \
1368             tmp = '\n';                                                        \
1369         }                                                                      \
1370         else {                                                                 \
1371             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1372             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1373         }                                                                      \
1374         tmp = TeSt1_UtF8;                                                      \
1375         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1376         REXEC_FBC_UTF8_SCAN(                                                   \
1377             if (tmp == ! (TeSt2_UtF8)) { \
1378                 tmp = !tmp;                                                    \
1379                 IF_SUCCESS;                                                    \
1380             }                                                                  \
1381             else {                                                             \
1382                 IF_FAIL;                                                       \
1383             }                                                                  \
1384         );                                                                     \
1385
1386 /* The only difference between the BOUND and NBOUND cases is that
1387  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1388  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1389  * with the other one being empty */
1390 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1392
1393 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1395
1396 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1397     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1398
1399 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1400     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1401
1402
1403 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1404  * be passed in completely with the variable name being tested, which isn't
1405  * such a clean interface, but this is easier to read than it was before.  We
1406  * are looking for the boundary (or non-boundary between a word and non-word
1407  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1408  * must be different.  Find the "wordness" of the character just prior to this
1409  * one, and compare it with the wordness of this one.  If they differ, we have
1410  * a boundary.  At the beginning of the string, pretend that the previous
1411  * character was a new-line */
1412 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1413     if (utf8_target) {                                                         \
1414                 UTF8_CODE \
1415     }                                                                          \
1416     else {  /* Not utf8 */                                                     \
1417         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1418         tmp = TEST_NON_UTF8(tmp);                                              \
1419         REXEC_FBC_SCAN(                                                        \
1420             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1421                 tmp = !tmp;                                                    \
1422                 IF_SUCCESS;                                                    \
1423             }                                                                  \
1424             else {                                                             \
1425                 IF_FAIL;                                                       \
1426             }                                                                  \
1427         );                                                                     \
1428     }                                                                          \
1429     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1430         goto got_it;
1431
1432 /* We know what class REx starts with.  Try to find this position... */
1433 /* if reginfo is NULL, its a dryrun */
1434 /* annoyingly all the vars in this routine have different names from their counterparts
1435    in regmatch. /grrr */
1436
1437 STATIC char *
1438 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1439     const char *strend, regmatch_info *reginfo)
1440 {
1441         dVAR;
1442         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1443         char *pat_string;   /* The pattern's exactish string */
1444         char *pat_end;      /* ptr to end char of pat_string */
1445         re_fold_t folder;       /* Function for computing non-utf8 folds */
1446         const U8 *fold_array;   /* array for folding ords < 256 */
1447         STRLEN ln;
1448         STRLEN lnc;
1449         STRLEN uskip;
1450         U8 c1;
1451         U8 c2;
1452         char *e;
1453         I32 tmp = 1;    /* Scratch variable? */
1454         const bool utf8_target = PL_reg_match_utf8;
1455         UV utf8_fold_flags = 0;
1456         RXi_GET_DECL(prog,progi);
1457
1458         PERL_ARGS_ASSERT_FIND_BYCLASS;
1459
1460         /* We know what class it must start with. */
1461         switch (OP(c)) {
1462         case ANYOF:
1463             if (utf8_target) {
1464                 REXEC_FBC_UTF8_CLASS_SCAN(
1465                           reginclass(prog, c, (U8*)s, utf8_target));
1466             }
1467             else {
1468                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1469             }
1470             break;
1471         case CANY:
1472             REXEC_FBC_SCAN(
1473                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1474                     goto got_it;
1475                 else
1476                     tmp = doevery;
1477             );
1478             break;
1479
1480         case EXACTFA:
1481             if (UTF_PATTERN || utf8_target) {
1482                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1483                 goto do_exactf_utf8;
1484             }
1485             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1486             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1487             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1488
1489         case EXACTF:
1490             if (utf8_target) {
1491
1492                 /* regcomp.c already folded this if pattern is in UTF-8 */
1493                 utf8_fold_flags = 0;
1494                 goto do_exactf_utf8;
1495             }
1496             fold_array = PL_fold;
1497             folder = foldEQ;
1498             goto do_exactf_non_utf8;
1499
1500         case EXACTFL:
1501             if (UTF_PATTERN || utf8_target) {
1502                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1503                 goto do_exactf_utf8;
1504             }
1505             fold_array = PL_fold_locale;
1506             folder = foldEQ_locale;
1507             goto do_exactf_non_utf8;
1508
1509         case EXACTFU_SS:
1510             if (UTF_PATTERN) {
1511                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1512             }
1513             goto do_exactf_utf8;
1514
1515         case EXACTFU_TRICKYFOLD:
1516         case EXACTFU:
1517             if (UTF_PATTERN || utf8_target) {
1518                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1519                 goto do_exactf_utf8;
1520             }
1521
1522             /* Any 'ss' in the pattern should have been replaced by regcomp,
1523              * so we don't have to worry here about this single special case
1524              * in the Latin1 range */
1525             fold_array = PL_fold_latin1;
1526             folder = foldEQ_latin1;
1527
1528             /* FALL THROUGH */
1529
1530         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1531                                are no glitches with fold-length differences
1532                                between the target string and pattern */
1533
1534             /* The idea in the non-utf8 EXACTF* cases is to first find the
1535              * first character of the EXACTF* node and then, if necessary,
1536              * case-insensitively compare the full text of the node.  c1 is the
1537              * first character.  c2 is its fold.  This logic will not work for
1538              * Unicode semantics and the german sharp ss, which hence should
1539              * not be compiled into a node that gets here. */
1540             pat_string = STRING(c);
1541             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1542
1543             /* We know that we have to match at least 'ln' bytes (which is the
1544              * same as characters, since not utf8).  If we have to match 3
1545              * characters, and there are only 2 availabe, we know without
1546              * trying that it will fail; so don't start a match past the
1547              * required minimum number from the far end */
1548             e = HOP3c(strend, -((I32)ln), s);
1549
1550             if (!reginfo && e < s) {
1551                 e = s;                  /* Due to minlen logic of intuit() */
1552             }
1553
1554             c1 = *pat_string;
1555             c2 = fold_array[c1];
1556             if (c1 == c2) { /* If char and fold are the same */
1557                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1558             }
1559             else {
1560                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1561             }
1562             break;
1563
1564         do_exactf_utf8:
1565         {
1566             unsigned expansion;
1567
1568
1569             /* If one of the operands is in utf8, we can't use the simpler
1570              * folding above, due to the fact that many different characters
1571              * can have the same fold, or portion of a fold, or different-
1572              * length fold */
1573             pat_string = STRING(c);
1574             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1575             pat_end = pat_string + ln;
1576             lnc = (UTF_PATTERN) /* length to match in characters */
1577                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578                     : ln;
1579
1580             /* We have 'lnc' characters to match in the pattern, but because of
1581              * multi-character folding, each character in the target can match
1582              * up to 3 characters (Unicode guarantees it will never exceed
1583              * this) if it is utf8-encoded; and up to 2 if not (based on the
1584              * fact that the Latin 1 folds are already determined, and the
1585              * only multi-char fold in that range is the sharp-s folding to
1586              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1587              * string character.  Adjust lnc accordingly, rounding up, so that
1588              * if we need to match at least 4+1/3 chars, that really is 5. */
1589             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590             lnc = (lnc + expansion - 1) / expansion;
1591
1592             /* As in the non-UTF8 case, if we have to match 3 characters, and
1593              * only 2 are left, it's guaranteed to fail, so don't start a
1594              * match that would require us to go beyond the end of the string
1595              */
1596             e = HOP3c(strend, -((I32)lnc), s);
1597
1598             if (!reginfo && e < s) {
1599                 e = s;                  /* Due to minlen logic of intuit() */
1600             }
1601
1602             /* XXX Note that we could recalculate e to stop the loop earlier,
1603              * as the worst case expansion above will rarely be met, and as we
1604              * go along we would usually find that e moves further to the left.
1605              * This would happen only after we reached the point in the loop
1606              * where if there were no expansion we should fail.  Unclear if
1607              * worth the expense */
1608
1609             while (s <= e) {
1610                 char *my_strend= (char *)strend;
1611                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1612                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1613                     && (!reginfo || regtry(reginfo, &s)) )
1614                 {
1615                     goto got_it;
1616                 }
1617                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1618             }
1619             break;
1620         }
1621         case BOUNDL:
1622             PL_reg_flags |= RF_tainted;
1623             FBC_BOUND(isALNUM_LC,
1624                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625                       isALNUM_LC_utf8((U8*)s));
1626             break;
1627         case NBOUNDL:
1628             PL_reg_flags |= RF_tainted;
1629             FBC_NBOUND(isALNUM_LC,
1630                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631                        isALNUM_LC_utf8((U8*)s));
1632             break;
1633         case BOUND:
1634             FBC_BOUND(isWORDCHAR,
1635                       isALNUM_uni(tmp),
1636                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637             break;
1638         case BOUNDA:
1639             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640                              isWORDCHAR_A(tmp),
1641                              isWORDCHAR_A((U8*)s));
1642             break;
1643         case NBOUND:
1644             FBC_NBOUND(isWORDCHAR,
1645                        isALNUM_uni(tmp),
1646                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647             break;
1648         case NBOUNDA:
1649             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650                               isWORDCHAR_A(tmp),
1651                               isWORDCHAR_A((U8*)s));
1652             break;
1653         case BOUNDU:
1654             FBC_BOUND(isWORDCHAR_L1,
1655                       isALNUM_uni(tmp),
1656                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657             break;
1658         case NBOUNDU:
1659             FBC_NBOUND(isWORDCHAR_L1,
1660                        isALNUM_uni(tmp),
1661                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1662             break;
1663         case ALNUML:
1664             REXEC_FBC_CSCAN_TAINT(
1665                 isALNUM_LC_utf8((U8*)s),
1666                 isALNUM_LC(*s)
1667             );
1668             break;
1669         case ALNUMU:
1670             REXEC_FBC_CSCAN_PRELOAD(
1671                 LOAD_UTF8_CHARCLASS_ALNUM(),
1672                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1673                 isWORDCHAR_L1((U8) *s)
1674             );
1675             break;
1676         case ALNUM:
1677             REXEC_FBC_CSCAN_PRELOAD(
1678                 LOAD_UTF8_CHARCLASS_ALNUM(),
1679                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1680                 isWORDCHAR((U8) *s)
1681             );
1682             break;
1683         case ALNUMA:
1684             /* Don't need to worry about utf8, as it can match only a single
1685              * byte invariant character */
1686             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1687             break;
1688         case NALNUMU:
1689             REXEC_FBC_CSCAN_PRELOAD(
1690                 LOAD_UTF8_CHARCLASS_ALNUM(),
1691                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1692                 ! isWORDCHAR_L1((U8) *s)
1693             );
1694             break;
1695         case NALNUM:
1696             REXEC_FBC_CSCAN_PRELOAD(
1697                 LOAD_UTF8_CHARCLASS_ALNUM(),
1698                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1699                 ! isALNUM(*s)
1700             );
1701             break;
1702         case NALNUMA:
1703             REXEC_FBC_CSCAN(
1704                 !isWORDCHAR_A(*s),
1705                 !isWORDCHAR_A(*s)
1706             );
1707             break;
1708         case NALNUML:
1709             REXEC_FBC_CSCAN_TAINT(
1710                 !isALNUM_LC_utf8((U8*)s),
1711                 !isALNUM_LC(*s)
1712             );
1713             break;
1714         case SPACEU:
1715             REXEC_FBC_CSCAN_PRELOAD(
1716                 LOAD_UTF8_CHARCLASS_SPACE(),
1717                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1718                 isSPACE_L1((U8) *s)
1719             );
1720             break;
1721         case SPACE:
1722             REXEC_FBC_CSCAN_PRELOAD(
1723                 LOAD_UTF8_CHARCLASS_SPACE(),
1724                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1725                 isSPACE((U8) *s)
1726             );
1727             break;
1728         case SPACEA:
1729             /* Don't need to worry about utf8, as it can match only a single
1730              * byte invariant character */
1731             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1732             break;
1733         case SPACEL:
1734             REXEC_FBC_CSCAN_TAINT(
1735                 isSPACE_LC_utf8((U8*)s),
1736                 isSPACE_LC(*s)
1737             );
1738             break;
1739         case NSPACEU:
1740             REXEC_FBC_CSCAN_PRELOAD(
1741                 LOAD_UTF8_CHARCLASS_SPACE(),
1742                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1743                 ! isSPACE_L1((U8) *s)
1744             );
1745             break;
1746         case NSPACE:
1747             REXEC_FBC_CSCAN_PRELOAD(
1748                 LOAD_UTF8_CHARCLASS_SPACE(),
1749                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1750                 ! isSPACE((U8) *s)
1751             );
1752             break;
1753         case NSPACEA:
1754             REXEC_FBC_CSCAN(
1755                 !isSPACE_A(*s),
1756                 !isSPACE_A(*s)
1757             );
1758             break;
1759         case NSPACEL:
1760             REXEC_FBC_CSCAN_TAINT(
1761                 !isSPACE_LC_utf8((U8*)s),
1762                 !isSPACE_LC(*s)
1763             );
1764             break;
1765         case DIGIT:
1766             REXEC_FBC_CSCAN_PRELOAD(
1767                 LOAD_UTF8_CHARCLASS_DIGIT(),
1768                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1769                 isDIGIT(*s)
1770             );
1771             break;
1772         case DIGITA:
1773             /* Don't need to worry about utf8, as it can match only a single
1774              * byte invariant character */
1775             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1776             break;
1777         case DIGITL:
1778             REXEC_FBC_CSCAN_TAINT(
1779                 isDIGIT_LC_utf8((U8*)s),
1780                 isDIGIT_LC(*s)
1781             );
1782             break;
1783         case NDIGIT:
1784             REXEC_FBC_CSCAN_PRELOAD(
1785                 LOAD_UTF8_CHARCLASS_DIGIT(),
1786                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1787                 !isDIGIT(*s)
1788             );
1789             break;
1790         case NDIGITA:
1791             REXEC_FBC_CSCAN(
1792                 !isDIGIT_A(*s),
1793                 !isDIGIT_A(*s)
1794             );
1795             break;
1796         case NDIGITL:
1797             REXEC_FBC_CSCAN_TAINT(
1798                 !isDIGIT_LC_utf8((U8*)s),
1799                 !isDIGIT_LC(*s)
1800             );
1801             break;
1802         case LNBREAK:
1803             REXEC_FBC_CSCAN(
1804                 is_LNBREAK_utf8_safe(s, strend),
1805                 is_LNBREAK_latin1_safe(s, strend)
1806             );
1807             break;
1808         case VERTWS:
1809             REXEC_FBC_CSCAN(
1810                 is_VERTWS_utf8_safe(s, strend),
1811                 is_VERTWS_latin1_safe(s, strend)
1812             );
1813             break;
1814         case NVERTWS:
1815             REXEC_FBC_CSCAN(
1816                 !is_VERTWS_utf8_safe(s, strend),
1817                 !is_VERTWS_latin1_safe(s, strend)
1818             );
1819             break;
1820         case HORIZWS:
1821             REXEC_FBC_CSCAN(
1822                 is_HORIZWS_utf8_safe(s, strend),
1823                 is_HORIZWS_latin1_safe(s, strend)
1824             );
1825             break;
1826         case NHORIZWS:
1827             REXEC_FBC_CSCAN(
1828                 !is_HORIZWS_utf8_safe(s, strend),
1829                 !is_HORIZWS_latin1_safe(s, strend)
1830             );
1831             break;
1832         case POSIXA:
1833             /* Don't need to worry about utf8, as it can match only a single
1834             * byte invariant character.  The flag in this node type is the
1835             * class number to pass to _generic_isCC() to build a mask for
1836             * searching in PL_charclass[] */
1837             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1838             break;
1839         case NPOSIXA:
1840             REXEC_FBC_CSCAN(
1841                 !_generic_isCC_A(*s, FLAGS(c)),
1842                 !_generic_isCC_A(*s, FLAGS(c))
1843             );
1844             break;
1845
1846         case AHOCORASICKC:
1847         case AHOCORASICK:
1848             {
1849                 DECL_TRIE_TYPE(c);
1850                 /* what trie are we using right now */
1851                 reg_ac_data *aho
1852                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1853                 reg_trie_data *trie
1854                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1855                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1856
1857                 const char *last_start = strend - trie->minlen;
1858 #ifdef DEBUGGING
1859                 const char *real_start = s;
1860 #endif
1861                 STRLEN maxlen = trie->maxlen;
1862                 SV *sv_points;
1863                 U8 **points; /* map of where we were in the input string
1864                                 when reading a given char. For ASCII this
1865                                 is unnecessary overhead as the relationship
1866                                 is always 1:1, but for Unicode, especially
1867                                 case folded Unicode this is not true. */
1868                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1869                 U8 *bitmap=NULL;
1870
1871
1872                 GET_RE_DEBUG_FLAGS_DECL;
1873
1874                 /* We can't just allocate points here. We need to wrap it in
1875                  * an SV so it gets freed properly if there is a croak while
1876                  * running the match */
1877                 ENTER;
1878                 SAVETMPS;
1879                 sv_points=newSV(maxlen * sizeof(U8 *));
1880                 SvCUR_set(sv_points,
1881                     maxlen * sizeof(U8 *));
1882                 SvPOK_on(sv_points);
1883                 sv_2mortal(sv_points);
1884                 points=(U8**)SvPV_nolen(sv_points );
1885                 if ( trie_type != trie_utf8_fold
1886                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1887                 {
1888                     if (trie->bitmap)
1889                         bitmap=(U8*)trie->bitmap;
1890                     else
1891                         bitmap=(U8*)ANYOF_BITMAP(c);
1892                 }
1893                 /* this is the Aho-Corasick algorithm modified a touch
1894                    to include special handling for long "unknown char"
1895                    sequences. The basic idea being that we use AC as long
1896                    as we are dealing with a possible matching char, when
1897                    we encounter an unknown char (and we have not encountered
1898                    an accepting state) we scan forward until we find a legal
1899                    starting char.
1900                    AC matching is basically that of trie matching, except
1901                    that when we encounter a failing transition, we fall back
1902                    to the current states "fail state", and try the current char
1903                    again, a process we repeat until we reach the root state,
1904                    state 1, or a legal transition. If we fail on the root state
1905                    then we can either terminate if we have reached an accepting
1906                    state previously, or restart the entire process from the beginning
1907                    if we have not.
1908
1909                  */
1910                 while (s <= last_start) {
1911                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1912                     U8 *uc = (U8*)s;
1913                     U16 charid = 0;
1914                     U32 base = 1;
1915                     U32 state = 1;
1916                     UV uvc = 0;
1917                     STRLEN len = 0;
1918                     STRLEN foldlen = 0;
1919                     U8 *uscan = (U8*)NULL;
1920                     U8 *leftmost = NULL;
1921 #ifdef DEBUGGING
1922                     U32 accepted_word= 0;
1923 #endif
1924                     U32 pointpos = 0;
1925
1926                     while ( state && uc <= (U8*)strend ) {
1927                         int failed=0;
1928                         U32 word = aho->states[ state ].wordnum;
1929
1930                         if( state==1 ) {
1931                             if ( bitmap ) {
1932                                 DEBUG_TRIE_EXECUTE_r(
1933                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1934                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1935                                             (char *)uc, utf8_target );
1936                                         PerlIO_printf( Perl_debug_log,
1937                                             " Scanning for legal start char...\n");
1938                                     }
1939                                 );
1940                                 if (utf8_target) {
1941                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1942                                         uc += UTF8SKIP(uc);
1943                                     }
1944                                 } else {
1945                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1946                                         uc++;
1947                                     }
1948                                 }
1949                                 s= (char *)uc;
1950                             }
1951                             if (uc >(U8*)last_start) break;
1952                         }
1953
1954                         if ( word ) {
1955                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1956                             if (!leftmost || lpos < leftmost) {
1957                                 DEBUG_r(accepted_word=word);
1958                                 leftmost= lpos;
1959                             }
1960                             if (base==0) break;
1961
1962                         }
1963                         points[pointpos++ % maxlen]= uc;
1964                         if (foldlen || uc < (U8*)strend) {
1965                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1966                                              widecharmap, uc,
1967                                              uscan, len, uvc, charid, foldlen,
1968                                              foldbuf, uniflags);
1969                             DEBUG_TRIE_EXECUTE_r({
1970                                 dump_exec_pos( (char *)uc, c, strend,
1971                                             real_start, s, utf8_target);
1972                                 PerlIO_printf(Perl_debug_log,
1973                                     " Charid:%3u CP:%4"UVxf" ",
1974                                      charid, uvc);
1975                             });
1976                         }
1977                         else {
1978                             len = 0;
1979                             charid = 0;
1980                         }
1981
1982
1983                         do {
1984 #ifdef DEBUGGING
1985                             word = aho->states[ state ].wordnum;
1986 #endif
1987                             base = aho->states[ state ].trans.base;
1988
1989                             DEBUG_TRIE_EXECUTE_r({
1990                                 if (failed)
1991                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1992                                         s,   utf8_target );
1993                                 PerlIO_printf( Perl_debug_log,
1994                                     "%sState: %4"UVxf", word=%"UVxf,
1995                                     failed ? " Fail transition to " : "",
1996                                     (UV)state, (UV)word);
1997                             });
1998                             if ( base ) {
1999                                 U32 tmp;
2000                                 I32 offset;
2001                                 if (charid &&
2002                                      ( ((offset = base + charid
2003                                         - 1 - trie->uniquecharcount)) >= 0)
2004                                      && ((U32)offset < trie->lasttrans)
2005                                      && trie->trans[offset].check == state
2006                                      && (tmp=trie->trans[offset].next))
2007                                 {
2008                                     DEBUG_TRIE_EXECUTE_r(
2009                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2010                                     state = tmp;
2011                                     break;
2012                                 }
2013                                 else {
2014                                     DEBUG_TRIE_EXECUTE_r(
2015                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2016                                     failed = 1;
2017                                     state = aho->fail[state];
2018                                 }
2019                             }
2020                             else {
2021                                 /* we must be accepting here */
2022                                 DEBUG_TRIE_EXECUTE_r(
2023                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2024                                 failed = 1;
2025                                 break;
2026                             }
2027                         } while(state);
2028                         uc += len;
2029                         if (failed) {
2030                             if (leftmost)
2031                                 break;
2032                             if (!state) state = 1;
2033                         }
2034                     }
2035                     if ( aho->states[ state ].wordnum ) {
2036                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2037                         if (!leftmost || lpos < leftmost) {
2038                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2039                             leftmost = lpos;
2040                         }
2041                     }
2042                     if (leftmost) {
2043                         s = (char*)leftmost;
2044                         DEBUG_TRIE_EXECUTE_r({
2045                             PerlIO_printf(
2046                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2047                                 (UV)accepted_word, (IV)(s - real_start)
2048                             );
2049                         });
2050                         if (!reginfo || regtry(reginfo, &s)) {
2051                             FREETMPS;
2052                             LEAVE;
2053                             goto got_it;
2054                         }
2055                         s = HOPc(s,1);
2056                         DEBUG_TRIE_EXECUTE_r({
2057                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2058                         });
2059                     } else {
2060                         DEBUG_TRIE_EXECUTE_r(
2061                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2062                         break;
2063                     }
2064                 }
2065                 FREETMPS;
2066                 LEAVE;
2067             }
2068             break;
2069         default:
2070             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2071             break;
2072         }
2073         return 0;
2074       got_it:
2075         return s;
2076 }
2077
2078
2079 /*
2080  - regexec_flags - match a regexp against a string
2081  */
2082 I32
2083 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2084               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2085 /* stringarg: the point in the string at which to begin matching */
2086 /* strend:    pointer to null at end of string */
2087 /* strbeg:    real beginning of string */
2088 /* minend:    end of match must be >= minend bytes after stringarg. */
2089 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2090  *            itself is accessed via the pointers above */
2091 /* data:      May be used for some additional optimizations.
2092               Currently its only used, with a U32 cast, for transmitting
2093               the ganch offset when doing a /g match. This will change */
2094 /* nosave:    For optimizations. */
2095
2096 {
2097     dVAR;
2098     struct regexp *const prog = ReANY(rx);
2099     /*register*/ char *s;
2100     regnode *c;
2101     /*register*/ char *startpos = stringarg;
2102     I32 minlen;         /* must match at least this many chars */
2103     I32 dontbother = 0; /* how many characters not to try at end */
2104     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2105     I32 scream_pos = -1;                /* Internal iterator of scream. */
2106     char *scream_olds = NULL;
2107     const bool utf8_target = cBOOL(DO_UTF8(sv));
2108     I32 multiline;
2109     RXi_GET_DECL(prog,progi);
2110     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2111     regexp_paren_pair *swap = NULL;
2112     GET_RE_DEBUG_FLAGS_DECL;
2113
2114     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2115     PERL_UNUSED_ARG(data);
2116
2117     /* Be paranoid... */
2118     if (prog == NULL || startpos == NULL) {
2119         Perl_croak(aTHX_ "NULL regexp parameter");
2120         return 0;
2121     }
2122
2123     multiline = prog->extflags & RXf_PMf_MULTILINE;
2124     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2125
2126     RX_MATCH_UTF8_set(rx, utf8_target);
2127     DEBUG_EXECUTE_r(
2128         debug_start_match(rx, utf8_target, startpos, strend,
2129         "Matching");
2130     );
2131
2132     minlen = prog->minlen;
2133
2134     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2135         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2136                               "String too short [regexec_flags]...\n"));
2137         goto phooey;
2138     }
2139
2140
2141     /* Check validity of program. */
2142     if (UCHARAT(progi->program) != REG_MAGIC) {
2143         Perl_croak(aTHX_ "corrupted regexp program");
2144     }
2145
2146     PL_reg_flags = 0;
2147     PL_reg_state.re_state_eval_setup_done = FALSE;
2148     PL_reg_maxiter = 0;
2149
2150     if (RX_UTF8(rx))
2151         PL_reg_flags |= RF_utf8;
2152
2153     /* Mark beginning of line for ^ and lookbehind. */
2154     reginfo.bol = startpos; /* XXX not used ??? */
2155     PL_bostr  = strbeg;
2156     reginfo.sv = sv;
2157
2158     /* Mark end of line for $ (and such) */
2159     PL_regeol = strend;
2160
2161     /* see how far we have to get to not match where we matched before */
2162     reginfo.till = startpos+minend;
2163
2164     /* If there is a "must appear" string, look for it. */
2165     s = startpos;
2166
2167     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2168         MAGIC *mg;
2169         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2170             reginfo.ganch = startpos + prog->gofs;
2171             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2172               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2173         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2174                   && SvMAGIC(sv)
2175                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2176                   && mg->mg_len >= 0) {
2177             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2178             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2179                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2180
2181             if (prog->extflags & RXf_ANCH_GPOS) {
2182                 if (s > reginfo.ganch)
2183                     goto phooey;
2184                 s = reginfo.ganch - prog->gofs;
2185                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2186                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2187                 if (s < strbeg)
2188                     goto phooey;
2189             }
2190         }
2191         else if (data) {
2192             reginfo.ganch = strbeg + PTR2UV(data);
2193             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2195
2196         } else {                                /* pos() not defined */
2197             reginfo.ganch = strbeg;
2198             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2199                  "GPOS: reginfo.ganch = strbeg\n"));
2200         }
2201     }
2202     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2203         /* We have to be careful. If the previous successful match
2204            was from this regex we don't want a subsequent partially
2205            successful match to clobber the old results.
2206            So when we detect this possibility we add a swap buffer
2207            to the re, and switch the buffer each match. If we fail
2208            we switch it back, otherwise we leave it swapped.
2209         */
2210         swap = prog->offs;
2211         /* do we need a save destructor here for eval dies? */
2212         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2213         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2214             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2215             PTR2UV(prog),
2216             PTR2UV(swap),
2217             PTR2UV(prog->offs)
2218         ));
2219     }
2220     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2221         re_scream_pos_data d;
2222
2223         d.scream_olds = &scream_olds;
2224         d.scream_pos = &scream_pos;
2225         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2226         if (!s) {
2227             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2228             goto phooey;        /* not present */
2229         }
2230     }
2231
2232
2233
2234     /* Simplest case:  anchored match need be tried only once. */
2235     /*  [unless only anchor is BOL and multiline is set] */
2236     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2237         if (s == startpos && regtry(&reginfo, &startpos))
2238             goto got_it;
2239         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2240                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2241         {
2242             char *end;
2243
2244             if (minlen)
2245                 dontbother = minlen - 1;
2246             end = HOP3c(strend, -dontbother, strbeg) - 1;
2247             /* for multiline we only have to try after newlines */
2248             if (prog->check_substr || prog->check_utf8) {
2249                 /* because of the goto we can not easily reuse the macros for bifurcating the
2250                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2251                 if (utf8_target) {
2252                     if (s == startpos)
2253                         goto after_try_utf8;
2254                     while (1) {
2255                         if (regtry(&reginfo, &s)) {
2256                             goto got_it;
2257                         }
2258                       after_try_utf8:
2259                         if (s > end) {
2260                             goto phooey;
2261                         }
2262                         if (prog->extflags & RXf_USE_INTUIT) {
2263                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2264                             if (!s) {
2265                                 goto phooey;
2266                             }
2267                         }
2268                         else {
2269                             s += UTF8SKIP(s);
2270                         }
2271                     }
2272                 } /* end search for check string in unicode */
2273                 else {
2274                     if (s == startpos) {
2275                         goto after_try_latin;
2276                     }
2277                     while (1) {
2278                         if (regtry(&reginfo, &s)) {
2279                             goto got_it;
2280                         }
2281                       after_try_latin:
2282                         if (s > end) {
2283                             goto phooey;
2284                         }
2285                         if (prog->extflags & RXf_USE_INTUIT) {
2286                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2287                             if (!s) {
2288                                 goto phooey;
2289                             }
2290                         }
2291                         else {
2292                             s++;
2293                         }
2294                     }
2295                 } /* end search for check string in latin*/
2296             } /* end search for check string */
2297             else { /* search for newline */
2298                 if (s > startpos) {
2299                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2300                     s--;
2301                 }
2302                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2303                 while (s <= end) { /* note it could be possible to match at the end of the string */
2304                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2305                         if (regtry(&reginfo, &s))
2306                             goto got_it;
2307                     }
2308                 }
2309             } /* end search for newline */
2310         } /* end anchored/multiline check string search */
2311         goto phooey;
2312     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2313     {
2314         /* the warning about reginfo.ganch being used without initialization
2315            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2316            and we only enter this block when the same bit is set. */
2317         char *tmp_s = reginfo.ganch - prog->gofs;
2318
2319         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2320             goto got_it;
2321         goto phooey;
2322     }
2323
2324     /* Messy cases:  unanchored match. */
2325     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2326         /* we have /x+whatever/ */
2327         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2328         char ch;
2329 #ifdef DEBUGGING
2330         int did_match = 0;
2331 #endif
2332         if (utf8_target) {
2333             if (! prog->anchored_utf8) {
2334                 to_utf8_substr(prog);
2335             }
2336             ch = SvPVX_const(prog->anchored_utf8)[0];
2337             REXEC_FBC_SCAN(
2338                 if (*s == ch) {
2339                     DEBUG_EXECUTE_r( did_match = 1 );
2340                     if (regtry(&reginfo, &s)) goto got_it;
2341                     s += UTF8SKIP(s);
2342                     while (s < strend && *s == ch)
2343                         s += UTF8SKIP(s);
2344                 }
2345             );
2346
2347         }
2348         else {
2349             if (! prog->anchored_substr) {
2350                 if (! to_byte_substr(prog)) {
2351                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2352                 }
2353             }
2354             ch = SvPVX_const(prog->anchored_substr)[0];
2355             REXEC_FBC_SCAN(
2356                 if (*s == ch) {
2357                     DEBUG_EXECUTE_r( did_match = 1 );
2358                     if (regtry(&reginfo, &s)) goto got_it;
2359                     s++;
2360                     while (s < strend && *s == ch)
2361                         s++;
2362                 }
2363             );
2364         }
2365         DEBUG_EXECUTE_r(if (!did_match)
2366                 PerlIO_printf(Perl_debug_log,
2367                                   "Did not find anchored character...\n")
2368                );
2369     }
2370     else if (prog->anchored_substr != NULL
2371               || prog->anchored_utf8 != NULL
2372               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2373                   && prog->float_max_offset < strend - s)) {
2374         SV *must;
2375         I32 back_max;
2376         I32 back_min;
2377         char *last;
2378         char *last1;            /* Last position checked before */
2379 #ifdef DEBUGGING
2380         int did_match = 0;
2381 #endif
2382         if (prog->anchored_substr || prog->anchored_utf8) {
2383             if (utf8_target) {
2384                 if (! prog->anchored_utf8) {
2385                     to_utf8_substr(prog);
2386                 }
2387                 must = prog->anchored_utf8;
2388             }
2389             else {
2390                 if (! prog->anchored_substr) {
2391                     if (! to_byte_substr(prog)) {
2392                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2393                     }
2394                 }
2395                 must = prog->anchored_substr;
2396             }
2397             back_max = back_min = prog->anchored_offset;
2398         } else {
2399             if (utf8_target) {
2400                 if (! prog->float_utf8) {
2401                     to_utf8_substr(prog);
2402                 }
2403                 must = prog->float_utf8;
2404             }
2405             else {
2406                 if (! prog->float_substr) {
2407                     if (! to_byte_substr(prog)) {
2408                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2409                     }
2410                 }
2411                 must = prog->float_substr;
2412             }
2413             back_max = prog->float_max_offset;
2414             back_min = prog->float_min_offset;
2415         }
2416
2417         if (back_min<0) {
2418             last = strend;
2419         } else {
2420             last = HOP3c(strend,        /* Cannot start after this */
2421                   -(I32)(CHR_SVLEN(must)
2422                          - (SvTAIL(must) != 0) + back_min), strbeg);
2423         }
2424         if (s > PL_bostr)
2425             last1 = HOPc(s, -1);
2426         else
2427             last1 = s - 1;      /* bogus */
2428
2429         /* XXXX check_substr already used to find "s", can optimize if
2430            check_substr==must. */
2431         scream_pos = -1;
2432         dontbother = end_shift;
2433         strend = HOPc(strend, -dontbother);
2434         while ( (s <= last) &&
2435                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2436                                   (unsigned char*)strend, must,
2437                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2438             DEBUG_EXECUTE_r( did_match = 1 );
2439             if (HOPc(s, -back_max) > last1) {
2440                 last1 = HOPc(s, -back_min);
2441                 s = HOPc(s, -back_max);
2442             }
2443             else {
2444                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2445
2446                 last1 = HOPc(s, -back_min);
2447                 s = t;
2448             }
2449             if (utf8_target) {
2450                 while (s <= last1) {
2451                     if (regtry(&reginfo, &s))
2452                         goto got_it;
2453                     if (s >= last1) {
2454                         s++; /* to break out of outer loop */
2455                         break;
2456                     }
2457                     s += UTF8SKIP(s);
2458                 }
2459             }
2460             else {
2461                 while (s <= last1) {
2462                     if (regtry(&reginfo, &s))
2463                         goto got_it;
2464                     s++;
2465                 }
2466             }
2467         }
2468         DEBUG_EXECUTE_r(if (!did_match) {
2469             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2470                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2472                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2473                                ? "anchored" : "floating"),
2474                 quoted, RE_SV_TAIL(must));
2475         });
2476         goto phooey;
2477     }
2478     else if ( (c = progi->regstclass) ) {
2479         if (minlen) {
2480             const OPCODE op = OP(progi->regstclass);
2481             /* don't bother with what can't match */
2482             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2483                 strend = HOPc(strend, -(minlen - 1));
2484         }
2485         DEBUG_EXECUTE_r({
2486             SV * const prop = sv_newmortal();
2487             regprop(prog, prop, c);
2488             {
2489                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2490                     s,strend-s,60);
2491                 PerlIO_printf(Perl_debug_log,
2492                     "Matching stclass %.*s against %s (%d bytes)\n",
2493                     (int)SvCUR(prop), SvPVX_const(prop),
2494                      quoted, (int)(strend - s));
2495             }
2496         });
2497         if (find_byclass(prog, c, s, strend, &reginfo))
2498             goto got_it;
2499         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2500     }
2501     else {
2502         dontbother = 0;
2503         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2504             /* Trim the end. */
2505             char *last= NULL;
2506             SV* float_real;
2507             STRLEN len;
2508             const char *little;
2509
2510             if (utf8_target) {
2511                 if (! prog->float_utf8) {
2512                     to_utf8_substr(prog);
2513                 }
2514                 float_real = prog->float_utf8;
2515             }
2516             else {
2517                 if (! prog->float_substr) {
2518                     if (! to_byte_substr(prog)) {
2519                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2520                     }
2521                 }
2522                 float_real = prog->float_substr;
2523             }
2524
2525             little = SvPV_const(float_real, len);
2526             if (SvTAIL(float_real)) {
2527                     /* This means that float_real contains an artificial \n on
2528                      * the end due to the presence of something like this:
2529                      * /foo$/ where we can match both "foo" and "foo\n" at the
2530                      * end of the string.  So we have to compare the end of the
2531                      * string first against the float_real without the \n and
2532                      * then against the full float_real with the string.  We
2533                      * have to watch out for cases where the string might be
2534                      * smaller than the float_real or the float_real without
2535                      * the \n. */
2536                     char *checkpos= strend - len;
2537                     DEBUG_OPTIMISE_r(
2538                         PerlIO_printf(Perl_debug_log,
2539                             "%sChecking for float_real.%s\n",
2540                             PL_colors[4], PL_colors[5]));
2541                     if (checkpos + 1 < strbeg) {
2542                         /* can't match, even if we remove the trailing \n
2543                          * string is too short to match */
2544                         DEBUG_EXECUTE_r(
2545                             PerlIO_printf(Perl_debug_log,
2546                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2547                                 PL_colors[4], PL_colors[5]));
2548                         goto phooey;
2549                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2550                         /* can match, the end of the string matches without the
2551                          * "\n" */
2552                         last = checkpos + 1;
2553                     } else if (checkpos < strbeg) {
2554                         /* cant match, string is too short when the "\n" is
2555                          * included */
2556                         DEBUG_EXECUTE_r(
2557                             PerlIO_printf(Perl_debug_log,
2558                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2559                                 PL_colors[4], PL_colors[5]));
2560                         goto phooey;
2561                     } else if (!multiline) {
2562                         /* non multiline match, so compare with the "\n" at the
2563                          * end of the string */
2564                         if (memEQ(checkpos, little, len)) {
2565                             last= checkpos;
2566                         } else {
2567                             DEBUG_EXECUTE_r(
2568                                 PerlIO_printf(Perl_debug_log,
2569                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2570                                     PL_colors[4], PL_colors[5]));
2571                             goto phooey;
2572                         }
2573                     } else {
2574                         /* multiline match, so we have to search for a place
2575                          * where the full string is located */
2576                         goto find_last;
2577                     }
2578             } else {
2579                   find_last:
2580                     if (len)
2581                         last = rninstr(s, strend, little, little + len);
2582                     else
2583                         last = strend;  /* matching "$" */
2584             }
2585             if (!last) {
2586                 /* at one point this block contained a comment which was
2587                  * probably incorrect, which said that this was a "should not
2588                  * happen" case.  Even if it was true when it was written I am
2589                  * pretty sure it is not anymore, so I have removed the comment
2590                  * and replaced it with this one. Yves */
2591                 DEBUG_EXECUTE_r(
2592                     PerlIO_printf(Perl_debug_log,
2593                         "String does not contain required substring, cannot match.\n"
2594                     ));
2595                 goto phooey;
2596             }
2597             dontbother = strend - last + prog->float_min_offset;
2598         }
2599         if (minlen && (dontbother < minlen))
2600             dontbother = minlen - 1;
2601         strend -= dontbother;              /* this one's always in bytes! */
2602         /* We don't know much -- general case. */
2603         if (utf8_target) {
2604             for (;;) {
2605                 if (regtry(&reginfo, &s))
2606                     goto got_it;
2607                 if (s >= strend)
2608                     break;
2609                 s += UTF8SKIP(s);
2610             };
2611         }
2612         else {
2613             do {
2614                 if (regtry(&reginfo, &s))
2615                     goto got_it;
2616             } while (s++ < strend);
2617         }
2618     }
2619
2620     /* Failure. */
2621     goto phooey;
2622
2623 got_it:
2624     DEBUG_BUFFERS_r(
2625         if (swap)
2626             PerlIO_printf(Perl_debug_log,
2627                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2628                 PTR2UV(prog),
2629                 PTR2UV(swap)
2630             );
2631     );
2632     Safefree(swap);
2633     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2634
2635     if (PL_reg_state.re_state_eval_setup_done)
2636         restore_pos(aTHX_ prog);
2637     if (RXp_PAREN_NAMES(prog))
2638         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2639
2640     /* make sure $`, $&, $', and $digit will work later */
2641     if ( !(flags & REXEC_NOT_FIRST) ) {
2642         if (flags & REXEC_COPY_STR) {
2643 #ifdef PERL_OLD_COPY_ON_WRITE
2644             if ((SvIsCOW(sv)
2645                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2646                 if (DEBUG_C_TEST) {
2647                     PerlIO_printf(Perl_debug_log,
2648                                   "Copy on write: regexp capture, type %d\n",
2649                                   (int) SvTYPE(sv));
2650                 }
2651                 RX_MATCH_COPY_FREE(rx);
2652                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2653                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2654                 assert (SvPOKp(prog->saved_copy));
2655                 prog->sublen  = PL_regeol - strbeg;
2656                 prog->suboffset = 0;
2657                 prog->subcoffset = 0;
2658             } else
2659 #endif
2660             {
2661                 I32 min = 0;
2662                 I32 max = PL_regeol - strbeg;
2663                 I32 sublen;
2664
2665                 if (    (flags & REXEC_COPY_SKIP_POST)
2666                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2667                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2668                 ) { /* don't copy $' part of string */
2669                     U32 n = 0;
2670                     max = -1;
2671                     /* calculate the right-most part of the string covered
2672                      * by a capture. Due to look-ahead, this may be to
2673                      * the right of $&, so we have to scan all captures */
2674                     while (n <= prog->lastparen) {
2675                         if (prog->offs[n].end > max)
2676                             max = prog->offs[n].end;
2677                         n++;
2678                     }
2679                     if (max == -1)
2680                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2681                                 ? prog->offs[0].start
2682                                 : 0;
2683                     assert(max >= 0 && max <= PL_regeol - strbeg);
2684                 }
2685
2686                 if (    (flags & REXEC_COPY_SKIP_PRE)
2687                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2688                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2689                 ) { /* don't copy $` part of string */
2690                     U32 n = 0;
2691                     min = max;
2692                     /* calculate the left-most part of the string covered
2693                      * by a capture. Due to look-behind, this may be to
2694                      * the left of $&, so we have to scan all captures */
2695                     while (min && n <= prog->lastparen) {
2696                         if (   prog->offs[n].start != -1
2697                             && prog->offs[n].start < min)
2698                         {
2699                             min = prog->offs[n].start;
2700                         }
2701                         n++;
2702                     }
2703                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2704                         && min >  prog->offs[0].end
2705                     )
2706                         min = prog->offs[0].end;
2707
2708                 }
2709
2710                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2711                 sublen = max - min;
2712
2713                 if (RX_MATCH_COPIED(rx)) {
2714                     if (sublen > prog->sublen)
2715                         prog->subbeg =
2716                                 (char*)saferealloc(prog->subbeg, sublen+1);
2717                 }
2718                 else
2719                     prog->subbeg = (char*)safemalloc(sublen+1);
2720                 Copy(strbeg + min, prog->subbeg, sublen, char);
2721                 prog->subbeg[sublen] = '\0';
2722                 prog->suboffset = min;
2723                 prog->sublen = sublen;
2724                 RX_MATCH_COPIED_on(rx);
2725             }
2726             prog->subcoffset = prog->suboffset;
2727             if (prog->suboffset && utf8_target) {
2728                 /* Convert byte offset to chars.
2729                  * XXX ideally should only compute this if @-/@+
2730                  * has been seen, a la PL_sawampersand ??? */
2731
2732                 /* If there's a direct correspondence between the
2733                  * string which we're matching and the original SV,
2734                  * then we can use the utf8 len cache associated with
2735                  * the SV. In particular, it means that under //g,
2736                  * sv_pos_b2u() will use the previously cached
2737                  * position to speed up working out the new length of
2738                  * subcoffset, rather than counting from the start of
2739                  * the string each time. This stops
2740                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2741                  * from going quadratic */
2742                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2743                     sv_pos_b2u(sv, &(prog->subcoffset));
2744                 else
2745                     prog->subcoffset = utf8_length((U8*)strbeg,
2746                                         (U8*)(strbeg+prog->suboffset));
2747             }
2748         }
2749         else {
2750             RX_MATCH_COPY_FREE(rx);
2751             prog->subbeg = strbeg;
2752             prog->suboffset = 0;
2753             prog->subcoffset = 0;
2754             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2755         }
2756     }
2757
2758     return 1;
2759
2760 phooey:
2761     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2762                           PL_colors[4], PL_colors[5]));
2763     if (PL_reg_state.re_state_eval_setup_done)
2764         restore_pos(aTHX_ prog);
2765     if (swap) {
2766         /* we failed :-( roll it back */
2767         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2768             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2769             PTR2UV(prog),
2770             PTR2UV(prog->offs),
2771             PTR2UV(swap)
2772         ));
2773         Safefree(prog->offs);
2774         prog->offs = swap;
2775     }
2776     return 0;
2777 }
2778
2779
2780 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2781  * Do inc before dec, in case old and new rex are the same */
2782 #define SET_reg_curpm(Re2) \
2783     if (PL_reg_state.re_state_eval_setup_done) {    \
2784         (void)ReREFCNT_inc(Re2);                    \
2785         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2786         PM_SETRE((PL_reg_curpm), (Re2));            \
2787     }
2788
2789
2790 /*
2791  - regtry - try match at specific point
2792  */
2793 STATIC I32                      /* 0 failure, 1 success */
2794 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2795 {
2796     dVAR;
2797     CHECKPOINT lastcp;
2798     REGEXP *const rx = reginfo->prog;
2799     regexp *const prog = ReANY(rx);
2800     I32 result;
2801     RXi_GET_DECL(prog,progi);
2802     GET_RE_DEBUG_FLAGS_DECL;
2803
2804     PERL_ARGS_ASSERT_REGTRY;
2805
2806     reginfo->cutpoint=NULL;
2807
2808     if ((prog->extflags & RXf_EVAL_SEEN)
2809         && !PL_reg_state.re_state_eval_setup_done)
2810     {
2811         MAGIC *mg;
2812
2813         PL_reg_state.re_state_eval_setup_done = TRUE;
2814         if (reginfo->sv) {
2815             /* Make $_ available to executed code. */
2816             if (reginfo->sv != DEFSV) {
2817                 SAVE_DEFSV;
2818                 DEFSV_set(reginfo->sv);
2819             }
2820
2821             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2822                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2823                 /* prepare for quick setting of pos */
2824 #ifdef PERL_OLD_COPY_ON_WRITE
2825                 if (SvIsCOW(reginfo->sv))
2826                     sv_force_normal_flags(reginfo->sv, 0);
2827 #endif
2828                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2829                                  &PL_vtbl_mglob, NULL, 0);
2830                 mg->mg_len = -1;
2831             }
2832             PL_reg_magic    = mg;
2833             PL_reg_oldpos   = mg->mg_len;
2834             SAVEDESTRUCTOR_X(restore_pos, prog);
2835         }
2836         if (!PL_reg_curpm) {
2837             Newxz(PL_reg_curpm, 1, PMOP);
2838 #ifdef USE_ITHREADS
2839             {
2840                 SV* const repointer = &PL_sv_undef;
2841                 /* this regexp is also owned by the new PL_reg_curpm, which
2842                    will try to free it.  */
2843                 av_push(PL_regex_padav, repointer);
2844                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2845                 PL_regex_pad = AvARRAY(PL_regex_padav);
2846             }
2847 #endif
2848         }
2849         SET_reg_curpm(rx);
2850         PL_reg_oldcurpm = PL_curpm;
2851         PL_curpm = PL_reg_curpm;
2852         if (RXp_MATCH_COPIED(prog)) {
2853             /*  Here is a serious problem: we cannot rewrite subbeg,
2854                 since it may be needed if this match fails.  Thus
2855                 $` inside (?{}) could fail... */
2856             PL_reg_oldsaved = prog->subbeg;
2857             PL_reg_oldsavedlen = prog->sublen;
2858             PL_reg_oldsavedoffset = prog->suboffset;
2859             PL_reg_oldsavedcoffset = prog->suboffset;
2860 #ifdef PERL_OLD_COPY_ON_WRITE
2861             PL_nrs = prog->saved_copy;
2862 #endif
2863             RXp_MATCH_COPIED_off(prog);
2864         }
2865         else
2866             PL_reg_oldsaved = NULL;
2867         prog->subbeg = PL_bostr;
2868         prog->suboffset = 0;
2869         prog->subcoffset = 0;
2870         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2871     }
2872 #ifdef DEBUGGING
2873     PL_reg_starttry = *startposp;
2874 #endif
2875     prog->offs[0].start = *startposp - PL_bostr;
2876     prog->lastparen = 0;
2877     prog->lastcloseparen = 0;
2878     PL_regsize = 0;
2879
2880     /* XXXX What this code is doing here?!!!  There should be no need
2881        to do this again and again, prog->lastparen should take care of
2882        this!  --ilya*/
2883
2884     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2885      * Actually, the code in regcppop() (which Ilya may be meaning by
2886      * prog->lastparen), is not needed at all by the test suite
2887      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2888      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2889      * Meanwhile, this code *is* needed for the
2890      * above-mentioned test suite tests to succeed.  The common theme
2891      * on those tests seems to be returning null fields from matches.
2892      * --jhi updated by dapm */
2893 #if 1
2894     if (prog->nparens) {
2895         regexp_paren_pair *pp = prog->offs;
2896         I32 i;
2897         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2898             ++pp;
2899             pp->start = -1;
2900             pp->end = -1;
2901         }
2902     }
2903 #endif
2904     REGCP_SET(lastcp);
2905     result = regmatch(reginfo, *startposp, progi->program + 1);
2906     if (result != -1) {
2907         prog->offs[0].end = result;
2908         return 1;
2909     }
2910     if (reginfo->cutpoint)
2911         *startposp= reginfo->cutpoint;
2912     REGCP_UNWIND(lastcp);
2913     return 0;
2914 }
2915
2916
2917 #define sayYES goto yes
2918 #define sayNO goto no
2919 #define sayNO_SILENT goto no_silent
2920
2921 /* we dont use STMT_START/END here because it leads to
2922    "unreachable code" warnings, which are bogus, but distracting. */
2923 #define CACHEsayNO \
2924     if (ST.cache_mask) \
2925        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2926     sayNO
2927
2928 /* this is used to determine how far from the left messages like
2929    'failed...' are printed. It should be set such that messages
2930    are inline with the regop output that created them.
2931 */
2932 #define REPORT_CODE_OFF 32
2933
2934
2935 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2936 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2937 #define CHRTEST_NOT_A_CP_1 -999
2938 #define CHRTEST_NOT_A_CP_2 -998
2939
2940 #define SLAB_FIRST(s) (&(s)->states[0])
2941 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2942
2943 /* grab a new slab and return the first slot in it */
2944
2945 STATIC regmatch_state *
2946 S_push_slab(pTHX)
2947 {
2948 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2949     dMY_CXT;
2950 #endif
2951     regmatch_slab *s = PL_regmatch_slab->next;
2952     if (!s) {
2953         Newx(s, 1, regmatch_slab);
2954         s->prev = PL_regmatch_slab;
2955         s->next = NULL;
2956         PL_regmatch_slab->next = s;
2957     }
2958     PL_regmatch_slab = s;
2959     return SLAB_FIRST(s);
2960 }
2961
2962
2963 /* push a new state then goto it */
2964
2965 #define PUSH_STATE_GOTO(state, node, input) \
2966     pushinput = input; \
2967     scan = node; \
2968     st->resume_state = state; \
2969     goto push_state;
2970
2971 /* push a new state with success backtracking, then goto it */
2972
2973 #define PUSH_YES_STATE_GOTO(state, node, input) \
2974     pushinput = input; \
2975     scan = node; \
2976     st->resume_state = state; \
2977     goto push_yes_state;
2978
2979
2980
2981
2982 /*
2983
2984 regmatch() - main matching routine
2985
2986 This is basically one big switch statement in a loop. We execute an op,
2987 set 'next' to point the next op, and continue. If we come to a point which
2988 we may need to backtrack to on failure such as (A|B|C), we push a
2989 backtrack state onto the backtrack stack. On failure, we pop the top
2990 state, and re-enter the loop at the state indicated. If there are no more
2991 states to pop, we return failure.
2992
2993 Sometimes we also need to backtrack on success; for example /A+/, where
2994 after successfully matching one A, we need to go back and try to
2995 match another one; similarly for lookahead assertions: if the assertion
2996 completes successfully, we backtrack to the state just before the assertion
2997 and then carry on.  In these cases, the pushed state is marked as
2998 'backtrack on success too'. This marking is in fact done by a chain of
2999 pointers, each pointing to the previous 'yes' state. On success, we pop to
3000 the nearest yes state, discarding any intermediate failure-only states.
3001 Sometimes a yes state is pushed just to force some cleanup code to be
3002 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3003 it to free the inner regex.
3004
3005 Note that failure backtracking rewinds the cursor position, while
3006 success backtracking leaves it alone.
3007
3008 A pattern is complete when the END op is executed, while a subpattern
3009 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3010 ops trigger the "pop to last yes state if any, otherwise return true"
3011 behaviour.
3012
3013 A common convention in this function is to use A and B to refer to the two
3014 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3015 the subpattern to be matched possibly multiple times, while B is the entire
3016 rest of the pattern. Variable and state names reflect this convention.
3017
3018 The states in the main switch are the union of ops and failure/success of
3019 substates associated with with that op.  For example, IFMATCH is the op
3020 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3021 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3022 successfully matched A and IFMATCH_A_fail is a state saying that we have
3023 just failed to match A. Resume states always come in pairs. The backtrack
3024 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3025 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3026 on success or failure.
3027
3028 The struct that holds a backtracking state is actually a big union, with
3029 one variant for each major type of op. The variable st points to the
3030 top-most backtrack struct. To make the code clearer, within each
3031 block of code we #define ST to alias the relevant union.
3032
3033 Here's a concrete example of a (vastly oversimplified) IFMATCH
3034 implementation:
3035
3036     switch (state) {
3037     ....
3038
3039 #define ST st->u.ifmatch
3040
3041     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3042         ST.foo = ...; // some state we wish to save
3043         ...
3044         // push a yes backtrack state with a resume value of
3045         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3046         // first node of A:
3047         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3048         // NOTREACHED
3049
3050     case IFMATCH_A: // we have successfully executed A; now continue with B
3051         next = B;
3052         bar = ST.foo; // do something with the preserved value
3053         break;
3054
3055     case IFMATCH_A_fail: // A failed, so the assertion failed
3056         ...;   // do some housekeeping, then ...
3057         sayNO; // propagate the failure
3058
3059 #undef ST
3060
3061     ...
3062     }
3063
3064 For any old-timers reading this who are familiar with the old recursive
3065 approach, the code above is equivalent to:
3066
3067     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3068     {
3069         int foo = ...
3070         ...
3071         if (regmatch(A)) {
3072             next = B;
3073             bar = foo;
3074             break;
3075         }
3076         ...;   // do some housekeeping, then ...
3077         sayNO; // propagate the failure
3078     }
3079
3080 The topmost backtrack state, pointed to by st, is usually free. If you
3081 want to claim it, populate any ST.foo fields in it with values you wish to
3082 save, then do one of
3083
3084         PUSH_STATE_GOTO(resume_state, node, newinput);
3085         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3086
3087 which sets that backtrack state's resume value to 'resume_state', pushes a
3088 new free entry to the top of the backtrack stack, then goes to 'node'.
3089 On backtracking, the free slot is popped, and the saved state becomes the
3090 new free state. An ST.foo field in this new top state can be temporarily
3091 accessed to retrieve values, but once the main loop is re-entered, it
3092 becomes available for reuse.
3093
3094 Note that the depth of the backtrack stack constantly increases during the
3095 left-to-right execution of the pattern, rather than going up and down with
3096 the pattern nesting. For example the stack is at its maximum at Z at the
3097 end of the pattern, rather than at X in the following:
3098
3099     /(((X)+)+)+....(Y)+....Z/
3100
3101 The only exceptions to this are lookahead/behind assertions and the cut,
3102 (?>A), which pop all the backtrack states associated with A before
3103 continuing.
3104
3105 Backtrack state structs are allocated in slabs of about 4K in size.
3106 PL_regmatch_state and st always point to the currently active state,
3107 and PL_regmatch_slab points to the slab currently containing
3108 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3109 allocated, and is never freed until interpreter destruction. When the slab
3110 is full, a new one is allocated and chained to the end. At exit from
3111 regmatch(), slabs allocated since entry are freed.
3112
3113 */
3114
3115
3116 #define DEBUG_STATE_pp(pp)                                  \
3117     DEBUG_STATE_r({                                         \
3118         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3119         PerlIO_printf(Perl_debug_log,                       \
3120             "    %*s"pp" %s%s%s%s%s\n",                     \
3121             depth*2, "",                                    \
3122             PL_reg_name[st->resume_state],                     \
3123             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3124             ((st==yes_state) ? "Y" : ""),                   \
3125             ((st==mark_state) ? "M" : ""),                  \
3126             ((st==yes_state||st==mark_state) ? "]" : "")    \
3127         );                                                  \
3128     });
3129
3130
3131 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3132
3133 #ifdef DEBUGGING
3134
3135 STATIC void
3136 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3137     const char *start, const char *end, const char *blurb)
3138 {
3139     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3140
3141     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3142
3143     if (!PL_colorset)
3144             reginitcolors();
3145     {
3146         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3147             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3148
3149         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3150             start, end - start, 60);
3151
3152         PerlIO_printf(Perl_debug_log,
3153             "%s%s REx%s %s against %s\n",
3154                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3155
3156         if (utf8_target||utf8_pat)
3157             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3158                 utf8_pat ? "pattern" : "",
3159                 utf8_pat && utf8_target ? " and " : "",
3160                 utf8_target ? "string" : ""
3161             );
3162     }
3163 }
3164
3165 STATIC void
3166 S_dump_exec_pos(pTHX_ const char *locinput,
3167                       const regnode *scan,
3168                       const char *loc_regeol,
3169                       const char *loc_bostr,
3170                       const char *loc_reg_starttry,
3171                       const bool utf8_target)
3172 {
3173     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3174     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3175     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3176     /* The part of the string before starttry has one color
3177        (pref0_len chars), between starttry and current
3178        position another one (pref_len - pref0_len chars),
3179        after the current position the third one.
3180        We assume that pref0_len <= pref_len, otherwise we
3181        decrease pref0_len.  */
3182     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3183         ? (5 + taill) - l : locinput - loc_bostr;
3184     int pref0_len;
3185
3186     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3187
3188     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3189         pref_len++;
3190     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3191     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3192         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3193               ? (5 + taill) - pref_len : loc_regeol - locinput);
3194     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3195         l--;
3196     if (pref0_len < 0)
3197         pref0_len = 0;
3198     if (pref0_len > pref_len)
3199         pref0_len = pref_len;
3200     {
3201         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3202
3203         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3204             (locinput - pref_len),pref0_len, 60, 4, 5);
3205
3206         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3207                     (locinput - pref_len + pref0_len),
3208                     pref_len - pref0_len, 60, 2, 3);
3209
3210         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3211                     locinput, loc_regeol - locinput, 10, 0, 1);
3212
3213         const STRLEN tlen=len0+len1+len2;
3214         PerlIO_printf(Perl_debug_log,
3215                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3216                     (IV)(locinput - loc_bostr),
3217                     len0, s0,
3218                     len1, s1,
3219                     (docolor ? "" : "> <"),
3220                     len2, s2,
3221                     (int)(tlen > 19 ? 0 :  19 - tlen),
3222                     "");
3223     }
3224 }
3225
3226 #endif
3227
3228 /* reg_check_named_buff_matched()
3229  * Checks to see if a named buffer has matched. The data array of
3230  * buffer numbers corresponding to the buffer is expected to reside
3231  * in the regexp->data->data array in the slot stored in the ARG() of
3232  * node involved. Note that this routine doesn't actually care about the
3233  * name, that information is not preserved from compilation to execution.
3234  * Returns the index of the leftmost defined buffer with the given name
3235  * or 0 if non of the buffers matched.
3236  */
3237 STATIC I32
3238 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3239 {
3240     I32 n;
3241     RXi_GET_DECL(rex,rexi);
3242     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3243     I32 *nums=(I32*)SvPVX(sv_dat);
3244
3245     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3246
3247     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3248         if ((I32)rex->lastparen >= nums[n] &&
3249             rex->offs[nums[n]].end != -1)
3250         {
3251             return nums[n];
3252         }
3253     }
3254     return 0;
3255 }
3256
3257
3258 /* free all slabs above current one  - called during LEAVE_SCOPE */
3259
3260 STATIC void
3261 S_clear_backtrack_stack(pTHX_ void *p)
3262 {
3263     regmatch_slab *s = PL_regmatch_slab->next;
3264     PERL_UNUSED_ARG(p);
3265
3266     if (!s)
3267         return;
3268     PL_regmatch_slab->next = NULL;
3269     while (s) {
3270         regmatch_slab * const osl = s;
3271         s = s->next;
3272         Safefree(osl);
3273     }
3274 }
3275 static bool
3276 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3277 {
3278     /* This function determines if there are one or two characters that match
3279      * the first character of the passed-in EXACTish node <text_node>, and if
3280      * so, returns them in the passed-in pointers.
3281      *
3282      * If it determines that no possible character in the target string can
3283      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3284      * the first character in <text_node> requires UTF-8 to represent, and the
3285      * target string isn't in UTF-8.)
3286      *
3287      * If there are more than two characters that could match the beginning of
3288      * <text_node>, or if more context is required to determine a match or not,
3289      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3290      *
3291      * The motiviation behind this function is to allow the caller to set up
3292      * tight loops for matching.  If <text_node> is of type EXACT, there is
3293      * only one possible character that can match its first character, and so
3294      * the situation is quite simple.  But things get much more complicated if
3295      * folding is involved.  It may be that the first character of an EXACTFish
3296      * node doesn't participate in any possible fold, e.g., punctuation, so it
3297      * can be matched only by itself.  The vast majority of characters that are
3298      * in folds match just two things, their lower and upper-case equivalents.
3299      * But not all are like that; some have multiple possible matches, or match
3300      * sequences of more than one character.  This function sorts all that out.
3301      *
3302      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3303      * loop of trying to match A*, we know we can't exit where the thing
3304      * following it isn't a B.  And something can't be a B unless it is the
3305      * beginning of B.  By putting a quick test for that beginning in a tight
3306      * loop, we can rule out things that can't possibly be B without having to
3307      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3308      * character, we can make a tight loop matching A*, using the outputs of
3309      * this function.
3310      *
3311      * If the target string to match isn't in UTF-8, and there aren't
3312      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3313      * the one or two possible octets (which are characters in this situation)
3314      * that can match.  In all cases, if there is only one character that can
3315      * match, *<c1p> and *<c2p> will be identical.
3316      *
3317      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3318      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3319      * can match the beginning of <text_node>.  They should be declared with at
3320      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3321      * undefined what these contain.)  If one or both of the buffers are
3322      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3323      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3324      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3325      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3326      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3327
3328     const bool utf8_target = PL_reg_match_utf8;
3329
3330     UV c1 = CHRTEST_NOT_A_CP_1;
3331     UV c2 = CHRTEST_NOT_A_CP_2;
3332     bool use_chrtest_void = FALSE;
3333
3334     /* Used when we have both utf8 input and utf8 output, to avoid converting
3335      * to/from code points */
3336     bool utf8_has_been_setup = FALSE;
3337
3338     dVAR;
3339
3340     U8 *pat = (U8*)STRING(text_node);
3341
3342     if (OP(text_node) == EXACT) {
3343
3344         /* In an exact node, only one thing can be matched, that first
3345          * character.  If both the pat and the target are UTF-8, we can just
3346          * copy the input to the output, avoiding finding the code point of
3347          * that character */
3348         if (! UTF_PATTERN) {
3349             c2 = c1 = *pat;
3350         }
3351         else if (utf8_target) {
3352             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3353             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3354             utf8_has_been_setup = TRUE;
3355         }
3356         else {
3357             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3358         }
3359     }
3360     else /* an EXACTFish node */
3361          if ((UTF_PATTERN
3362                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3363                                                     pat + STR_LEN(text_node)))
3364              || (! UTF_PATTERN
3365                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3366                                                     pat + STR_LEN(text_node))))
3367     {
3368         /* Multi-character folds require more context to sort out.  Also
3369          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3370          * handled outside this routine */
3371         use_chrtest_void = TRUE;
3372     }
3373     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3374         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3375         if (c1 > 256) {
3376             /* Load the folds hash, if not already done */
3377             SV** listp;
3378             if (! PL_utf8_foldclosures) {
3379                 if (! PL_utf8_tofold) {
3380                     U8 dummy[UTF8_MAXBYTES+1];
3381
3382                     /* Force loading this by folding an above-Latin1 char */
3383                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3384                     assert(PL_utf8_tofold); /* Verify that worked */
3385                 }
3386                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3387             }
3388
3389             /* The fold closures data structure is a hash with the keys being
3390              * the UTF-8 of every character that is folded to, like 'k', and
3391              * the values each an array of all code points that fold to its
3392              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3393              * not included */
3394             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3395                                      (char *) pat,
3396                                      UTF8SKIP(pat),
3397                                      FALSE))))
3398             {
3399                 /* Not found in the hash, therefore there are no folds
3400                  * containing it, so there is only a single character that
3401                  * could match */
3402                 c2 = c1;
3403             }
3404             else {  /* Does participate in folds */
3405                 AV* list = (AV*) *listp;
3406                 if (av_len(list) != 1) {
3407
3408                     /* If there aren't exactly two folds to this, it is outside
3409                      * the scope of this function */
3410                     use_chrtest_void = TRUE;
3411                 }
3412                 else {  /* There are two.  Get them */
3413                     SV** c_p = av_fetch(list, 0, FALSE);
3414                     if (c_p == NULL) {
3415                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3416                     }
3417                     c1 = SvUV(*c_p);
3418
3419                     c_p = av_fetch(list, 1, FALSE);
3420                     if (c_p == NULL) {
3421                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3422                     }
3423                     c2 = SvUV(*c_p);
3424
3425                     /* Folds that cross the 255/256 boundary are forbidden if
3426                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3427                      * pattern character is above 256, and its only other match
3428                      * is below 256, the only legal match will be to itself.
3429                      * We have thrown away the original, so have to compute
3430                      * which is the one above 255 */
3431                     if ((c1 < 256) != (c2 < 256)) {
3432                         if (OP(text_node) == EXACTFL
3433                             || (OP(text_node) == EXACTFA
3434                                 && (isASCII(c1) || isASCII(c2))))
3435                         {
3436                             if (c1 < 256) {
3437                                 c1 = c2;
3438                             }
3439                             else {
3440                                 c2 = c1;
3441                             }
3442                         }
3443                     }
3444                 }
3445             }
3446         }
3447         else /* Here, c1 is < 255 */
3448              if (utf8_target
3449                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3450                  && OP(text_node) != EXACTFL
3451                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3452         {
3453             /* Here, there could be something above Latin1 in the target which
3454              * folds to this character in the pattern.  All such cases except
3455              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3456              * involved in their folds, so are outside the scope of this
3457              * function */
3458             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3459                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3460             }
3461             else {
3462                 use_chrtest_void = TRUE;
3463             }
3464         }
3465         else { /* Here nothing above Latin1 can fold to the pattern character */
3466             switch (OP(text_node)) {
3467
3468                 case EXACTFL:   /* /l rules */
3469                     c2 = PL_fold_locale[c1];
3470                     break;
3471
3472                 case EXACTF:
3473                     if (! utf8_target) {    /* /d rules */
3474                         c2 = PL_fold[c1];
3475                         break;
3476                     }
3477                     /* FALLTHROUGH */
3478                     /* /u rules for all these.  This happens to work for
3479                      * EXACTFA as nothing in Latin1 folds to ASCII */
3480                 case EXACTFA:
3481                 case EXACTFU_TRICKYFOLD:
3482                 case EXACTFU_SS:
3483                 case EXACTFU:
3484                     c2 = PL_fold_latin1[c1];
3485                     break;
3486
3487                 default:
3488                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3489                     assert(0); /* NOTREACHED */
3490             }
3491         }
3492     }
3493
3494     /* Here have figured things out.  Set up the returns */
3495     if (use_chrtest_void) {
3496         *c2p = *c1p = CHRTEST_VOID;
3497     }
3498     else if (utf8_target) {
3499         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3500             uvchr_to_utf8(c1_utf8, c1);
3501             uvchr_to_utf8(c2_utf8, c2);
3502         }
3503
3504         /* Invariants are stored in both the utf8 and byte outputs; Use
3505          * negative numbers otherwise for the byte ones.  Make sure that the
3506          * byte ones are the same iff the utf8 ones are the same */
3507         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3508         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3509                 ? *c2_utf8
3510                 : (c1 == c2)
3511                   ? CHRTEST_NOT_A_CP_1
3512                   : CHRTEST_NOT_A_CP_2;
3513     }
3514     else if (c1 > 255) {
3515        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3516                            can represent */
3517            return FALSE;
3518        }
3519
3520        *c1p = *c2p = c2;    /* c2 is the only representable value */
3521     }
3522     else {  /* c1 is representable; see about c2 */
3523        *c1p = c1;
3524        *c2p = (c2 < 256) ? c2 : c1;
3525     }
3526
3527     return TRUE;
3528 }
3529
3530 /* returns -1 on failure, $+[0] on success */
3531 STATIC I32
3532 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3533 {
3534 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3535     dMY_CXT;
3536 #endif
3537     dVAR;
3538     const bool utf8_target = PL_reg_match_utf8;
3539     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3540     REGEXP *rex_sv = reginfo->prog;
3541     regexp *rex = ReANY(rex_sv);
3542     RXi_GET_DECL(rex,rexi);
3543     I32 oldsave;
3544     /* the current state. This is a cached copy of PL_regmatch_state */
3545     regmatch_state *st;
3546     /* cache heavy used fields of st in registers */
3547     regnode *scan;
3548     regnode *next;
3549     U32 n = 0;  /* general value; init to avoid compiler warning */
3550     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3551     char *locinput = startpos;
3552     char *pushinput; /* where to continue after a PUSH */
3553     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3554
3555     bool result = 0;        /* return value of S_regmatch */
3556     int depth = 0;          /* depth of backtrack stack */
3557     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3558     const U32 max_nochange_depth =
3559         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3560         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3561     regmatch_state *yes_state = NULL; /* state to pop to on success of
3562                                                             subpattern */
3563     /* mark_state piggy backs on the yes_state logic so that when we unwind
3564        the stack on success we can update the mark_state as we go */
3565     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3566     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3567     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3568     U32 state_num;
3569     bool no_final = 0;      /* prevent failure from backtracking? */
3570     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3571     char *startpoint = locinput;
3572     SV *popmark = NULL;     /* are we looking for a mark? */
3573     SV *sv_commit = NULL;   /* last mark name seen in failure */
3574     SV *sv_yes_mark = NULL; /* last mark name we have seen
3575                                during a successful match */
3576     U32 lastopen = 0;       /* last open we saw */
3577     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3578     SV* const oreplsv = GvSV(PL_replgv);
3579     /* these three flags are set by various ops to signal information to
3580      * the very next op. They have a useful lifetime of exactly one loop
3581      * iteration, and are not preserved or restored by state pushes/pops
3582      */
3583     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3584     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3585     int logical = 0;        /* the following EVAL is:
3586                                 0: (?{...})
3587                                 1: (?(?{...})X|Y)
3588                                 2: (??{...})
3589                                or the following IFMATCH/UNLESSM is:
3590                                 false: plain (?=foo)
3591                                 true:  used as a condition: (?(?=foo))
3592                             */
3593     PAD* last_pad = NULL;
3594     dMULTICALL;
3595     I32 gimme = G_SCALAR;
3596     CV *caller_cv = NULL;       /* who called us */
3597     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3598     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3599
3600 #ifdef DEBUGGING
3601     GET_RE_DEBUG_FLAGS_DECL;
3602 #endif
3603
3604     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3605     multicall_oldcatch = 0;
3606     multicall_cv = NULL;
3607     cx = NULL;
3608     PERL_UNUSED_VAR(multicall_cop);
3609     PERL_UNUSED_VAR(newsp);
3610
3611
3612     PERL_ARGS_ASSERT_REGMATCH;
3613
3614     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3615             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3616     }));
3617     /* on first ever call to regmatch, allocate first slab */
3618     if (!PL_regmatch_slab) {
3619         Newx(PL_regmatch_slab, 1, regmatch_slab);
3620         PL_regmatch_slab->prev = NULL;
3621         PL_regmatch_slab->next = NULL;
3622         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3623     }
3624
3625     oldsave = PL_savestack_ix;
3626     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3627     SAVEVPTR(PL_regmatch_slab);
3628     SAVEVPTR(PL_regmatch_state);
3629
3630     /* grab next free state slot */
3631     st = ++PL_regmatch_state;
3632     if (st >  SLAB_LAST(PL_regmatch_slab))
3633         st = PL_regmatch_state = S_push_slab(aTHX);
3634
3635     /* Note that nextchr is a byte even in UTF */
3636     SET_nextchr;
3637     scan = prog;
3638     while (scan != NULL) {
3639
3640         DEBUG_EXECUTE_r( {
3641             SV * const prop = sv_newmortal();
3642             regnode *rnext=regnext(scan);
3643             DUMP_EXEC_POS( locinput, scan, utf8_target );
3644             regprop(rex, prop, scan);
3645
3646             PerlIO_printf(Perl_debug_log,
3647                     "%3"IVdf":%*s%s(%"IVdf")\n",
3648                     (IV)(scan - rexi->program), depth*2, "",
3649                     SvPVX_const(prop),
3650                     (PL_regkind[OP(scan)] == END || !rnext) ?
3651                         0 : (IV)(rnext - rexi->program));
3652         });
3653
3654         next = scan + NEXT_OFF(scan);
3655         if (next == scan)
3656             next = NULL;
3657         state_num = OP(scan);
3658
3659       reenter_switch:
3660
3661         SET_nextchr;
3662         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3663
3664         switch (state_num) {
3665         case BOL: /*  /^../  */
3666             if (locinput == PL_bostr)
3667             {
3668                 /* reginfo->till = reginfo->bol; */
3669                 break;
3670             }
3671             sayNO;
3672
3673         case MBOL: /*  /^../m  */
3674             if (locinput == PL_bostr ||
3675                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3676             {
3677                 break;
3678             }
3679             sayNO;
3680
3681         case SBOL: /*  /^../s  */
3682             if (locinput == PL_bostr)
3683                 break;
3684             sayNO;
3685
3686         case GPOS: /*  \G  */
3687             if (locinput == reginfo->ganch)
3688                 break;
3689             sayNO;
3690
3691         case KEEPS: /*   \K  */
3692             /* update the startpoint */
3693             st->u.keeper.val = rex->offs[0].start;
3694             rex->offs[0].start = locinput - PL_bostr;
3695             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3696             assert(0); /*NOTREACHED*/
3697         case KEEPS_next_fail:
3698             /* rollback the start point change */
3699             rex->offs[0].start = st->u.keeper.val;
3700             sayNO_SILENT;
3701             assert(0); /*NOTREACHED*/
3702
3703         case EOL: /* /..$/  */
3704                 goto seol;
3705
3706         case MEOL: /* /..$/m  */
3707             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3708                 sayNO;
3709             break;
3710
3711         case SEOL: /* /..$/s  */
3712           seol:
3713             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3714                 sayNO;
3715             if (PL_regeol - locinput > 1)
3716                 sayNO;
3717             break;
3718
3719         case EOS: /*  \z  */
3720             if (!NEXTCHR_IS_EOS)
3721                 sayNO;
3722             break;
3723
3724         case SANY: /*  /./s  */
3725             if (NEXTCHR_IS_EOS)
3726                 sayNO;
3727             goto increment_locinput;
3728
3729         case CANY: /*  \C  */
3730             if (NEXTCHR_IS_EOS)
3731                 sayNO;
3732             locinput++;
3733             break;
3734
3735         case REG_ANY: /*  /./  */
3736             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3737                 sayNO;
3738             goto increment_locinput;
3739
3740
3741 #undef  ST
3742 #define ST st->u.trie
3743         case TRIEC: /* (ab|cd) with known charclass */
3744             /* In this case the charclass data is available inline so
3745                we can fail fast without a lot of extra overhead.
3746              */
3747             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3748                 DEBUG_EXECUTE_r(
3749                     PerlIO_printf(Perl_debug_log,
3750                               "%*s  %sfailed to match trie start class...%s\n",
3751                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3752                 );
3753                 sayNO_SILENT;
3754                 assert(0); /* NOTREACHED */
3755             }
3756             /* FALL THROUGH */
3757         case TRIE:  /* (ab|cd)  */
3758             /* the basic plan of execution of the trie is:
3759              * At the beginning, run though all the states, and
3760              * find the longest-matching word. Also remember the position
3761              * of the shortest matching word. For example, this pattern:
3762              *    1  2 3 4    5
3763              *    ab|a|x|abcd|abc
3764              * when matched against the string "abcde", will generate
3765              * accept states for all words except 3, with the longest
3766              * matching word being 4, and the shortest being 2 (with
3767              * the position being after char 1 of the string).
3768              *
3769              * Then for each matching word, in word order (i.e. 1,2,4,5),
3770              * we run the remainder of the pattern; on each try setting
3771              * the current position to the character following the word,
3772              * returning to try the next word on failure.
3773              *
3774              * We avoid having to build a list of words at runtime by
3775              * using a compile-time structure, wordinfo[].prev, which
3776              * gives, for each word, the previous accepting word (if any).
3777              * In the case above it would contain the mappings 1->2, 2->0,
3778              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3779              * the longest word (4 above), a list of all words, by
3780              * following the list of prev pointers; this gives us the
3781              * unordered list 4,5,1,2. Then given the current word we have
3782              * just tried, we can go through the list and find the
3783              * next-biggest word to try (so if we just failed on word 2,
3784              * the next in the list is 4).
3785              *
3786              * Since at runtime we don't record the matching position in
3787              * the string for each word, we have to work that out for
3788              * each word we're about to process. The wordinfo table holds
3789              * the character length of each word; given that we recorded
3790              * at the start: the position of the shortest word and its
3791              * length in chars, we just need to move the pointer the
3792              * difference between the two char lengths. Depending on
3793              * Unicode status and folding, that's cheap or expensive.
3794              *
3795              * This algorithm is optimised for the case where are only a
3796              * small number of accept states, i.e. 0,1, or maybe 2.
3797              * With lots of accepts states, and having to try all of them,
3798              * it becomes quadratic on number of accept states to find all
3799              * the next words.
3800              */
3801
3802             {
3803                 /* what type of TRIE am I? (utf8 makes this contextual) */
3804                 DECL_TRIE_TYPE(scan);
3805
3806                 /* what trie are we using right now */
3807                 reg_trie_data * const trie
3808                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3809                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3810                 U32 state = trie->startstate;
3811
3812                 if (   trie->bitmap
3813                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3814                 {
3815                     if (trie->states[ state ].wordnum) {
3816                          DEBUG_EXECUTE_r(
3817                             PerlIO_printf(Perl_debug_log,
3818                                           "%*s  %smatched empty string...%s\n",
3819                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3820                         );
3821                         if (!trie->jump)
3822                             break;
3823                     } else {
3824                         DEBUG_EXECUTE_r(
3825                             PerlIO_printf(Perl_debug_log,
3826                                           "%*s  %sfailed to match trie start class...%s\n",
3827                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3828                         );
3829                         sayNO_SILENT;
3830                    }
3831                 }
3832
3833             {
3834                 U8 *uc = ( U8* )locinput;
3835
3836                 STRLEN len = 0;
3837                 STRLEN foldlen = 0;
3838                 U8 *uscan = (U8*)NULL;
3839                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3840                 U32 charcount = 0; /* how many input chars we have matched */
3841                 U32 accepted = 0; /* have we seen any accepting states? */
3842
3843                 ST.jump = trie->jump;
3844                 ST.me = scan;
3845                 ST.firstpos = NULL;
3846                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3847                 ST.nextword = 0;
3848
3849                 /* fully traverse the TRIE; note the position of the
3850                    shortest accept state and the wordnum of the longest
3851                    accept state */
3852
3853                 while ( state && uc <= (U8*)PL_regeol ) {
3854                     U32 base = trie->states[ state ].trans.base;
3855                     UV uvc = 0;
3856                     U16 charid = 0;
3857                     U16 wordnum;
3858                     wordnum = trie->states[ state ].wordnum;
3859
3860                     if (wordnum) { /* it's an accept state */
3861                         if (!accepted) {
3862                             accepted = 1;
3863                             /* record first match position */
3864                             if (ST.longfold) {
3865                                 ST.firstpos = (U8*)locinput;
3866                                 ST.firstchars = 0;
3867                             }
3868                             else {
3869                                 ST.firstpos = uc;
3870                                 ST.firstchars = charcount;
3871                             }
3872                         }
3873                         if (!ST.nextword || wordnum < ST.nextword)
3874                             ST.nextword = wordnum;
3875                         ST.topword = wordnum;
3876                     }
3877
3878                     DEBUG_TRIE_EXECUTE_r({
3879                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3880                                 PerlIO_printf( Perl_debug_log,
3881                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3882                                     2+depth * 2, "", PL_colors[4],
3883                                     (UV)state, (accepted ? 'Y' : 'N'));
3884                     });
3885
3886                     /* read a char and goto next state */
3887                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3888                         I32 offset;
3889                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3890                                              uscan, len, uvc, charid, foldlen,
3891                                              foldbuf, uniflags);
3892                         charcount++;
3893                         if (foldlen>0)
3894                             ST.longfold = TRUE;
3895                         if (charid &&
3896                              ( ((offset =
3897                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3898
3899                              && ((U32)offset < trie->lasttrans)
3900                              && trie->trans[offset].check == state)
3901                         {
3902                             state = trie->trans[offset].next;
3903                         }
3904                         else {
3905                             state = 0;
3906                         }
3907                         uc += len;
3908
3909                     }
3910                     else {
3911                         state = 0;
3912                     }
3913                     DEBUG_TRIE_EXECUTE_r(
3914                         PerlIO_printf( Perl_debug_log,
3915                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3916                             charid, uvc, (UV)state, PL_colors[5] );
3917                     );
3918                 }
3919                 if (!accepted)
3920                    sayNO;
3921
3922                 /* calculate total number of accept states */
3923                 {
3924                     U16 w = ST.topword;
3925                     accepted = 0;
3926                     while (w) {
3927                         w = trie->wordinfo[w].prev;
3928                         accepted++;
3929                     }
3930                     ST.accepted = accepted;
3931                 }
3932
3933                 DEBUG_EXECUTE_r(
3934                     PerlIO_printf( Perl_debug_log,
3935                         "%*s  %sgot %"IVdf" possible matches%s\n",
3936                         REPORT_CODE_OFF + depth * 2, "",
3937                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3938                 );
3939                 goto trie_first_try; /* jump into the fail handler */
3940             }}
3941             assert(0); /* NOTREACHED */
3942
3943         case TRIE_next_fail: /* we failed - try next alternative */
3944         {
3945             U8 *uc;
3946             if ( ST.jump) {
3947                 REGCP_UNWIND(ST.cp);
3948                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3949             }
3950             if (!--ST.accepted) {
3951                 DEBUG_EXECUTE_r({
3952                     PerlIO_printf( Perl_debug_log,
3953                         "%*s  %sTRIE failed...%s\n",
3954                         REPORT_CODE_OFF+depth*2, "",
3955                         PL_colors[4],
3956                         PL_colors[5] );
3957                 });
3958                 sayNO_SILENT;
3959             }
3960             {
3961                 /* Find next-highest word to process.  Note that this code
3962                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3963                 U16 min = 0;
3964                 U16 word;
3965                 U16 const nextword = ST.nextword;
3966                 reg_trie_wordinfo * const wordinfo
3967                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3968                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3969                     if (word > nextword && (!min || word < min))
3970                         min = word;
3971                 }
3972                 ST.nextword = min;
3973             }
3974
3975           trie_first_try:
3976             if (do_cutgroup) {
3977                 do_cutgroup = 0;
3978                 no_final = 0;
3979             }
3980
3981             if ( ST.jump) {
3982                 ST.lastparen = rex->lastparen;
3983                 ST.lastcloseparen = rex->lastcloseparen;
3984                 REGCP_SET(ST.cp);
3985             }
3986
3987             /* find start char of end of current word */
3988             {
3989                 U32 chars; /* how many chars to skip */
3990                 reg_trie_data * const trie
3991                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3992
3993                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3994                             >=  ST.firstchars);
3995                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3996                             - ST.firstchars;
3997                 uc = ST.firstpos;
3998
3999                 if (ST.longfold) {
4000                     /* the hard option - fold each char in turn and find
4001                      * its folded length (which may be different */
4002                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4003                     STRLEN foldlen;
4004                     STRLEN len;
4005                     UV uvc;
4006                     U8 *uscan;
4007
4008                     while (chars) {
4009                         if (utf8_target) {
4010                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4011                                                     uniflags);
4012                             uc += len;
4013                         }
4014                         else {
4015                             uvc = *uc;
4016                             uc++;
4017                         }
4018                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4019                         uscan = foldbuf;
4020                         while (foldlen) {
4021                             if (!--chars)
4022                                 break;
4023                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4024                                             uniflags);
4025                             uscan += len;
4026                             foldlen -= len;
4027                         }
4028                     }
4029                 }
4030                 else {
4031                     if (utf8_target)
4032                         while (chars--)
4033                             uc += UTF8SKIP(uc);
4034                     else
4035                         uc += chars;
4036                 }
4037             }
4038
4039             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4040                             ? ST.jump[ST.nextword]
4041                             : NEXT_OFF(ST.me));
4042
4043             DEBUG_EXECUTE_r({
4044                 PerlIO_printf( Perl_debug_log,
4045                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4046                     REPORT_CODE_OFF+depth*2, "",
4047                     PL_colors[4],
4048                     ST.nextword,
4049                     PL_colors[5]
4050                     );
4051             });
4052
4053             if (ST.accepted > 1 || has_cutgroup) {
4054                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4055                 assert(0); /* NOTREACHED */
4056             }
4057             /* only one choice left - just continue */
4058             DEBUG_EXECUTE_r({
4059                 AV *const trie_words
4060                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4061                 SV ** const tmp = av_fetch( trie_words,
4062                     ST.nextword-1, 0 );
4063                 SV *sv= tmp ? sv_newmortal() : NULL;
4064
4065                 PerlIO_printf( Perl_debug_log,
4066                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4067                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4068                     ST.nextword,
4069                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4070                             PL_colors[0], PL_colors[1],
4071                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4072                         )
4073                     : "not compiled under -Dr",
4074                     PL_colors[5] );
4075             });
4076
4077             locinput = (char*)uc;
4078             continue; /* execute rest of RE */
4079             assert(0); /* NOTREACHED */
4080         }
4081 #undef  ST
4082
4083         case EXACT: {            /*  /abc/        */
4084             char *s = STRING(scan);
4085             ln = STR_LEN(scan);
4086             if (utf8_target != UTF_PATTERN) {
4087                 /* The target and the pattern have differing utf8ness. */
4088                 char *l = locinput;
4089                 const char * const e = s + ln;
4090
4091                 if (utf8_target) {
4092                     /* The target is utf8, the pattern is not utf8.
4093                      * Above-Latin1 code points can't match the pattern;
4094                      * invariants match exactly, and the other Latin1 ones need
4095                      * to be downgraded to a single byte in order to do the
4096                      * comparison.  (If we could be confident that the target
4097                      * is not malformed, this could be refactored to have fewer
4098                      * tests by just assuming that if the first bytes match, it
4099                      * is an invariant, but there are tests in the test suite
4100                      * dealing with (??{...}) which violate this) */
4101                     while (s < e) {
4102                         if (l >= PL_regeol)
4103                              sayNO;
4104                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4105                             sayNO;
4106                         }
4107                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4108                             if (*l != *s) {
4109                                 sayNO;
4110                             }
4111                             l++;
4112                         }
4113                         else {
4114                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4115                                 sayNO;
4116                             }
4117                             l += 2;
4118                         }
4119                         s++;
4120                     }
4121                 }
4122                 else {
4123                     /* The target is not utf8, the pattern is utf8. */
4124                     while (s < e) {
4125                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4126                         {
4127                             sayNO;
4128                         }
4129                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4130                             if (*s != *l) {
4131                                 sayNO;
4132                             }
4133                             s++;
4134                         }
4135                         else {
4136                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4137                                 sayNO;
4138                             }
4139                             s += 2;
4140                         }
4141                         l++;
4142                     }
4143                 }
4144                 locinput = l;
4145                 break;
4146             }
4147             /* The target and the pattern have the same utf8ness. */
4148             /* Inline the first character, for speed. */
4149             if (UCHARAT(s) != nextchr)
4150                 sayNO;
4151             if (PL_regeol - locinput < ln)
4152                 sayNO;
4153             if (ln > 1 && memNE(s, locinput, ln))
4154                 sayNO;
4155             locinput += ln;
4156             break;
4157             }
4158
4159         case EXACTFL: {          /*  /abc/il      */
4160             re_fold_t folder;
4161             const U8 * fold_array;
4162             const char * s;
4163             U32 fold_utf8_flags;
4164
4165             PL_reg_flags |= RF_tainted;
4166             folder = foldEQ_locale;
4167             fold_array = PL_fold_locale;
4168             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4169             goto do_exactf;
4170
4171         case EXACTFU_SS:         /*  /\x{df}/iu   */
4172         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4173         case EXACTFU:            /*  /abc/iu      */
4174             folder = foldEQ_latin1;
4175             fold_array = PL_fold_latin1;
4176             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4177             goto do_exactf;
4178
4179         case EXACTFA:            /*  /abc/iaa     */
4180             folder = foldEQ_latin1;
4181             fold_array = PL_fold_latin1;
4182             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4183             goto do_exactf;
4184
4185         case EXACTF:             /*  /abc/i       */
4186             folder = foldEQ;
4187             fold_array = PL_fold;
4188             fold_utf8_flags = 0;
4189
4190           do_exactf:
4191             s = STRING(scan);
4192             ln = STR_LEN(scan);
4193
4194             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4195               /* Either target or the pattern are utf8, or has the issue where
4196                * the fold lengths may differ. */
4197                 const char * const l = locinput;
4198                 char *e = PL_regeol;
4199
4200                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4201                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4202                 {
4203                     sayNO;
4204                 }
4205                 locinput = e;
4206                 break;
4207             }
4208
4209             /* Neither the target nor the pattern are utf8 */
4210             if (UCHARAT(s) != nextchr
4211                 && !NEXTCHR_IS_EOS
4212                 && UCHARAT(s) != fold_array[nextchr])
4213             {
4214                 sayNO;
4215             }
4216             if (PL_regeol - locinput < ln)
4217                 sayNO;
4218             if (ln > 1 && ! folder(s, locinput, ln))
4219                 sayNO;
4220             locinput += ln;
4221             break;
4222         }
4223
4224         /* XXX Could improve efficiency by separating these all out using a
4225          * macro or in-line function.  At that point regcomp.c would no longer
4226          * have to set the FLAGS fields of these */
4227         case BOUNDL:  /*  /\b/l  */
4228         case NBOUNDL: /*  /\B/l  */
4229             PL_reg_flags |= RF_tainted;
4230             /* FALL THROUGH */
4231         case BOUND:   /*  /\b/   */
4232         case BOUNDU:  /*  /\b/u  */
4233         case BOUNDA:  /*  /\b/a  */
4234         case NBOUND:  /*  /\B/   */
4235         case NBOUNDU: /*  /\B/u  */
4236         case NBOUNDA: /*  /\B/a  */
4237             /* was last char in word? */
4238             if (utf8_target
4239                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4240                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4241             {
4242                 if (locinput == PL_bostr)
4243                     ln = '\n';
4244                 else {
4245                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4246
4247                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4248                 }
4249                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4250                     ln = isALNUM_uni(ln);
4251                     if (NEXTCHR_IS_EOS)
4252                         n = 0;
4253                     else {
4254                         LOAD_UTF8_CHARCLASS_ALNUM();
4255                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4256                                                                 utf8_target);
4257                     }
4258                 }
4259                 else {
4260                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4261                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4262                 }
4263             }
4264             else {
4265
4266                 /* Here the string isn't utf8, or is utf8 and only ascii
4267                  * characters are to match \w.  In the latter case looking at
4268                  * the byte just prior to the current one may be just the final
4269                  * byte of a multi-byte character.  This is ok.  There are two
4270                  * cases:
4271                  * 1) it is a single byte character, and then the test is doing
4272                  *      just what it's supposed to.
4273                  * 2) it is a multi-byte character, in which case the final
4274                  *      byte is never mistakable for ASCII, and so the test
4275                  *      will say it is not a word character, which is the
4276                  *      correct answer. */
4277                 ln = (locinput != PL_bostr) ?
4278                     UCHARAT(locinput - 1) : '\n';
4279                 switch (FLAGS(scan)) {
4280                     case REGEX_UNICODE_CHARSET:
4281                         ln = isWORDCHAR_L1(ln);
4282                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4283                         break;
4284                     case REGEX_LOCALE_CHARSET:
4285                         ln = isALNUM_LC(ln);
4286                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4287                         break;
4288                     case REGEX_DEPENDS_CHARSET:
4289                         ln = isALNUM(ln);
4290                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4291                         break;
4292                     case REGEX_ASCII_RESTRICTED_CHARSET:
4293                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4294                         ln = isWORDCHAR_A(ln);
4295                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4296                         break;
4297                     default:
4298                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4299                         break;
4300                 }
4301             }
4302             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4303              * regcomp.sym */
4304             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4305                     sayNO;
4306             break;
4307
4308         case ANYOF:  /*  /[abc]/       */
4309             if (NEXTCHR_IS_EOS)
4310                 sayNO;
4311             if (utf8_target) {
4312                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4313                     sayNO;
4314                 locinput += UTF8SKIP(locinput);
4315                 break;
4316             }
4317             else {
4318                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4319                     sayNO;
4320                 locinput++;
4321                 break;
4322             }
4323             break;
4324
4325         /* Special char classes: \d, \w etc.
4326          * The defines start on line 166 or so */
4327         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4328                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4329                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4330                   ALNUMA, NALNUMA, isWORDCHAR_A,
4331                   alnum, "a");
4332
4333         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4334                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4335                   SPACEU, NSPACEU, isSPACE_L1,
4336                   SPACEA, NSPACEA, isSPACE_A,
4337                   space, " ");
4338
4339         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4340                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4341                 DIGITA, NDIGITA, isDIGIT_A,
4342                 digit, "0");
4343
4344         case POSIXA: /* /[[:ascii:]]/ etc */
4345             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4346                 sayNO;
4347             }
4348             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4349             locinput++;
4350             break;
4351
4352         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4353             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4354                 sayNO;
4355             }
4356             goto increment_locinput;
4357
4358         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4359                        a Unicode extended Grapheme Cluster */
4360             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4361               extended Grapheme Cluster is:
4362
4363                CR LF
4364                | Prepend* Begin Extend*
4365                | .
4366
4367                Begin is:           ( Special_Begin | ! Control )
4368                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4369                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4370                Control is:         [ GCB_Control  CR  LF ]
4371                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4372
4373                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4374                we can rewrite
4375
4376                    Begin is ( Regular_Begin + Special Begin )
4377
4378                It turns out that 98.4% of all Unicode code points match
4379                Regular_Begin.  Doing it this way eliminates a table match in
4380                the previous implementation for almost all Unicode code points.
4381
4382                There is a subtlety with Prepend* which showed up in testing.
4383                Note that the Begin, and only the Begin is required in:
4384                 | Prepend* Begin Extend*
4385                Also, Begin contains '! Control'.  A Prepend must be a
4386                '!  Control', which means it must also be a Begin.  What it
4387                comes down to is that if we match Prepend* and then find no
4388                suitable Begin afterwards, that if we backtrack the last
4389                Prepend, that one will be a suitable Begin.
4390             */
4391
4392             if (NEXTCHR_IS_EOS)
4393                 sayNO;
4394             if  (! utf8_target) {
4395
4396                 /* Match either CR LF  or '.', as all the other possibilities
4397                  * require utf8 */
4398                 locinput++;         /* Match the . or CR */
4399                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4400                                        match the LF */
4401                     && locinput < PL_regeol
4402                     && UCHARAT(locinput) == '\n') locinput++;
4403             }
4404             else {
4405
4406                 /* Utf8: See if is ( CR LF ); already know that locinput <
4407                  * PL_regeol, so locinput+1 is in bounds */
4408                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4409                         && UCHARAT(locinput + 1) == '\n')
4410                 {
4411                     locinput += 2;
4412                 }
4413                 else {
4414                     STRLEN len;
4415
4416                     /* In case have to backtrack to beginning, then match '.' */
4417                     char *starting = locinput;
4418
4419                     /* In case have to backtrack the last prepend */
4420                     char *previous_prepend = 0;
4421
4422                     LOAD_UTF8_CHARCLASS_GCB();
4423
4424                     /* Match (prepend)*   */
4425                     while (locinput < PL_regeol
4426                            && (len = is_GCB_Prepend_utf8(locinput)))
4427                     {
4428                         previous_prepend = locinput;
4429                         locinput += len;
4430                     }
4431
4432                     /* As noted above, if we matched a prepend character, but
4433                      * the next thing won't match, back off the last prepend we
4434                      * matched, as it is guaranteed to match the begin */
4435                     if (previous_prepend
4436                         && (locinput >=  PL_regeol
4437                             || (! swash_fetch(PL_utf8_X_regular_begin,
4438                                              (U8*)locinput, utf8_target)
4439                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4440                         )
4441                     {
4442                         locinput = previous_prepend;
4443                     }
4444
4445                     /* Note that here we know PL_regeol > locinput, as we
4446                      * tested that upon input to this switch case, and if we
4447                      * moved locinput forward, we tested the result just above
4448                      * and it either passed, or we backed off so that it will
4449                      * now pass */
4450                     if (swash_fetch(PL_utf8_X_regular_begin,
4451                                     (U8*)locinput, utf8_target)) {
4452                         locinput += UTF8SKIP(locinput);
4453                     }
4454                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4455
4456                         /* Here did not match the required 'Begin' in the
4457                          * second term.  So just match the very first
4458                          * character, the '.' of the final term of the regex */
4459                         locinput = starting + UTF8SKIP(starting);
4460                         goto exit_utf8;
4461                     } else {
4462
4463                         /* Here is a special begin.  It can be composed of
4464                          * several individual characters.  One possibility is
4465                          * RI+ */
4466                         if ((len = is_GCB_RI_utf8(locinput))) {
4467                             locinput += len;
4468                             while (locinput < PL_regeol
4469                                    && (len = is_GCB_RI_utf8(locinput)))
4470                             {
4471                                 locinput += len;
4472                             }
4473                         } else if ((len = is_GCB_T_utf8(locinput))) {
4474                             /* Another possibility is T+ */
4475                             locinput += len;
4476                             while (locinput < PL_regeol
4477                                 && (len = is_GCB_T_utf8(locinput)))
4478                             {
4479                                 locinput += len;
4480                             }
4481                         } else {
4482
4483                             /* Here, neither RI+ nor T+; must be some other
4484                              * Hangul.  That means it is one of the others: L,
4485                              * LV, LVT or V, and matches:
4486                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4487
4488                             /* Match L*           */
4489                             while (locinput < PL_regeol
4490                                    && (len = is_GCB_L_utf8(locinput)))
4491                             {
4492                                 locinput += len;
4493                             }
4494
4495                             /* Here, have exhausted L*.  If the next character
4496                              * is not an LV, LVT nor V, it means we had to have
4497                              * at least one L, so matches L+ in the original
4498                              * equation, we have a complete hangul syllable.
4499                              * Are done. */
4500
4501                             if (locinput < PL_regeol
4502                                 && is_GCB_LV_LVT_V_utf8(locinput))
4503                             {
4504
4505                                 /* Otherwise keep going.  Must be LV, LVT or V.
4506                                  * See if LVT */
4507                                 if (is_utf8_X_LVT((U8*)locinput)) {
4508                                     locinput += UTF8SKIP(locinput);
4509                                 } else {
4510
4511                                     /* Must be  V or LV.  Take it, then match
4512                                      * V*     */
4513                                     locinput += UTF8SKIP(locinput);
4514                                     while (locinput < PL_regeol
4515                                            && (len = is_GCB_V_utf8(locinput)))
4516                                     {
4517                                         locinput += len;
4518                                     }
4519                                 }
4520
4521                                 /* And any of LV, LVT, or V can be followed
4522                                  * by T*            */
4523                                 while (locinput < PL_regeol
4524                                        && (len = is_GCB_T_utf8(locinput)))
4525                                 {
4526                                     locinput += len;
4527                                 }
4528                             }
4529                         }
4530                     }
4531
4532                     /* Match any extender */
4533                     while (locinput < PL_regeol
4534                             && swash_fetch(PL_utf8_X_extend,
4535                                             (U8*)locinput, utf8_target))
4536                     {
4537                         locinput += UTF8SKIP(locinput);
4538                     }
4539                 }
4540             exit_utf8:
4541                 if (locinput > PL_regeol) sayNO;
4542             }
4543             break;
4544
4545         case NREFFL:  /*  /\g{name}/il  */
4546         {   /* The capture buffer cases.  The ones beginning with N for the
4547                named buffers just convert to the equivalent numbered and
4548                pretend they were called as the corresponding numbered buffer
4549                op.  */
4550             /* don't initialize these in the declaration, it makes C++
4551                unhappy */
4552             char *s;
4553             char type;
4554             re_fold_t folder;
4555             const U8 *fold_array;
4556             UV utf8_fold_flags;
4557
4558             PL_reg_flags |= RF_tainted;
4559             folder = foldEQ_locale;
4560             fold_array = PL_fold_locale;
4561             type = REFFL;
4562             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4563             goto do_nref;
4564
4565         case NREFFA:  /*  /\g{name}/iaa  */
4566             folder = foldEQ_latin1;
4567             fold_array = PL_fold_latin1;
4568             type = REFFA;
4569             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4570             goto do_nref;
4571
4572         case NREFFU:  /*  /\g{name}/iu  */
4573             folder = foldEQ_latin1;
4574             fold_array = PL_fold_latin1;
4575             type = REFFU;
4576             utf8_fold_flags = 0;
4577             goto do_nref;
4578
4579         case NREFF:  /*  /\g{name}/i  */
4580             folder = foldEQ;
4581             fold_array = PL_fold;
4582             type = REFF;
4583             utf8_fold_flags = 0;
4584             goto do_nref;
4585
4586         case NREF:  /*  /\g{name}/   */
4587             type = REF;
4588             folder = NULL;
4589             fold_array = NULL;
4590             utf8_fold_flags = 0;
4591           do_nref:
4592
4593             /* For the named back references, find the corresponding buffer
4594              * number */
4595             n = reg_check_named_buff_matched(rex,scan);
4596
4597             if ( ! n ) {
4598                 sayNO;
4599             }
4600             goto do_nref_ref_common;
4601
4602         case REFFL:  /*  /\1/il  */
4603             PL_reg_flags |= RF_tainted;
4604             folder = foldEQ_locale;
4605             fold_array = PL_fold_locale;
4606             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4607             goto do_ref;
4608
4609         case REFFA:  /*  /\1/iaa  */
4610             folder = foldEQ_latin1;
4611             fold_array = PL_fold_latin1;
4612             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4613             goto do_ref;
4614
4615         case REFFU:  /*  /\1/iu  */
4616             folder = foldEQ_latin1;
4617             fold_array = PL_fold_latin1;
4618             utf8_fold_flags = 0;
4619             goto do_ref;
4620
4621         case REFF:  /*  /\1/i  */
4622             folder = foldEQ;
4623             fold_array = PL_fold;
4624             utf8_fold_flags = 0;
4625             goto do_ref;
4626
4627         case REF:  /*  /\1/    */
4628             folder = NULL;
4629             fold_array = NULL;
4630             utf8_fold_flags = 0;
4631
4632           do_ref:
4633             type = OP(scan);
4634             n = ARG(scan);  /* which paren pair */
4635
4636           do_nref_ref_common:
4637             ln = rex->offs[n].start;
4638             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4639             if (rex->lastparen < n || ln == -1)
4640                 sayNO;                  /* Do not match unless seen CLOSEn. */
4641             if (ln == rex->offs[n].end)
4642                 break;
4643
4644             s = PL_bostr + ln;
4645             if (type != REF     /* REF can do byte comparison */
4646                 && (utf8_target || type == REFFU))
4647             { /* XXX handle REFFL better */
4648                 char * limit = PL_regeol;
4649
4650                 /* This call case insensitively compares the entire buffer
4651                     * at s, with the current input starting at locinput, but
4652                     * not going off the end given by PL_regeol, and returns in
4653                     * <limit> upon success, how much of the current input was
4654                     * matched */
4655                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4656                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4657                 {
4658                     sayNO;
4659                 }
4660                 locinput = limit;
4661                 break;
4662             }
4663
4664             /* Not utf8:  Inline the first character, for speed. */
4665             if (!NEXTCHR_IS_EOS &&
4666                 UCHARAT(s) != nextchr &&
4667                 (type == REF ||
4668                  UCHARAT(s) != fold_array[nextchr]))
4669                 sayNO;
4670             ln = rex->offs[n].end - ln;
4671             if (locinput + ln > PL_regeol)
4672                 sayNO;
4673             if (ln > 1 && (type == REF
4674                            ? memNE(s, locinput, ln)
4675                            : ! folder(s, locinput, ln)))
4676                 sayNO;
4677             locinput += ln;
4678             break;
4679         }
4680
4681         case NOTHING: /* null op; e.g. the 'nothing' following
4682                        * the '*' in m{(a+|b)*}' */
4683             break;
4684         case TAIL: /* placeholder while compiling (A|B|C) */
4685             break;
4686
4687         case BACK: /* ??? doesn't appear to be used ??? */
4688             break;
4689
4690 #undef  ST
4691 #define ST st->u.eval
4692         {
4693             SV *ret;
4694             REGEXP *re_sv;
4695             regexp *re;
4696             regexp_internal *rei;
4697             regnode *startpoint;
4698
4699         case GOSTART: /*  (?R)  */
4700         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4701             if (cur_eval && cur_eval->locinput==locinput) {
4702                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4703                     Perl_croak(aTHX_ "Infinite recursion in regex");
4704                 if ( ++nochange_depth > max_nochange_depth )
4705                     Perl_croak(aTHX_
4706                         "Pattern subroutine nesting without pos change"
4707                         " exceeded limit in regex");
4708             } else {
4709                 nochange_depth = 0;
4710             }
4711             re_sv = rex_sv;
4712             re = rex;
4713             rei = rexi;
4714             if (OP(scan)==GOSUB) {
4715                 startpoint = scan + ARG2L(scan);
4716                 ST.close_paren = ARG(scan);
4717             } else {
4718                 startpoint = rei->program+1;
4719                 ST.close_paren = 0;
4720             }
4721             goto eval_recurse_doit;
4722             assert(0); /* NOTREACHED */
4723
4724         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4725             if (cur_eval && cur_eval->locinput==locinput) {
4726                 if ( ++nochange_depth > max_nochange_depth )
4727                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4728             } else {
4729                 nochange_depth = 0;
4730             }
4731             {
4732                 /* execute the code in the {...} */
4733
4734                 dSP;
4735                 PADOFFSET before;
4736                 OP * const oop = PL_op;
4737                 COP * const ocurcop = PL_curcop;
4738                 OP *nop;
4739                 char *saved_regeol = PL_regeol;
4740                 struct re_save_state saved_state;
4741                 CV *newcv;
4742
4743                 /* save *all* paren positions */
4744                 regcppush(rex, 0);
4745                 REGCP_SET(runops_cp);
4746
4747                 /* To not corrupt the existing regex state while executing the
4748                  * eval we would normally put it on the save stack, like with
4749                  * save_re_context. However, re-evals have a weird scoping so we
4750                  * can't just add ENTER/LEAVE here. With that, things like
4751                  *
4752                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4753                  *
4754                  * would break, as they expect the localisation to be unwound
4755                  * only when the re-engine backtracks through the bit that
4756                  * localised it.
4757                  *
4758                  * What we do instead is just saving the state in a local c
4759                  * variable.
4760                  */
4761                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4762
4763                 PL_reg_state.re_reparsing = FALSE;
4764
4765                 if (!caller_cv)
4766                     caller_cv = find_runcv(NULL);
4767
4768                 n = ARG(scan);
4769
4770                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4771                     newcv = (ReANY(
4772                                                 (REGEXP*)(rexi->data->data[n])
4773                                             ))->qr_anoncv
4774                                         ;
4775                     nop = (OP*)rexi->data->data[n+1];
4776                 }
4777                 else if (rexi->data->what[n] == 'l') { /* literal code */
4778                     newcv = caller_cv;
4779                     nop = (OP*)rexi->data->data[n];
4780                     assert(CvDEPTH(newcv));
4781                 }
4782                 else {
4783                     /* literal with own CV */
4784                     assert(rexi->data->what[n] == 'L');
4785                     newcv = rex->qr_anoncv;
4786                     nop = (OP*)rexi->data->data[n];
4787                 }
4788
4789                 /* normally if we're about to execute code from the same
4790                  * CV that we used previously, we just use the existing
4791                  * CX stack entry. However, its possible that in the
4792                  * meantime we may have backtracked, popped from the save
4793                  * stack, and undone the SAVECOMPPAD(s) associated with
4794                  * PUSH_MULTICALL; in which case PL_comppad no longer
4795                  * points to newcv's pad. */
4796                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4797                 {
4798                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4799                     if (last_pushed_cv) {
4800                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4801                     }
4802                     else {
4803                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4804                     }
4805                     last_pushed_cv = newcv;
4806                 }
4807                 last_pad = PL_comppad;
4808
4809                 /* the initial nextstate you would normally execute
4810                  * at the start of an eval (which would cause error
4811                  * messages to come from the eval), may be optimised
4812                  * away from the execution path in the regex code blocks;
4813                  * so manually set PL_curcop to it initially */
4814                 {
4815                     OP *o = cUNOPx(nop)->op_first;
4816                     assert(o->op_type == OP_NULL);
4817                     if (o->op_targ == OP_SCOPE) {
4818                         o = cUNOPo->op_first;
4819                     }
4820                     else {
4821                         assert(o->op_targ == OP_LEAVE);
4822                         o = cUNOPo->op_first;
4823                         assert(o->op_type == OP_ENTER);
4824                         o = o->op_sibling;
4825                     }
4826
4827                     if (o->op_type != OP_STUB) {
4828                         assert(    o->op_type == OP_NEXTSTATE
4829                                 || o->op_type == OP_DBSTATE
4830                                 || (o->op_type == OP_NULL
4831                                     &&  (  o->op_targ == OP_NEXTSTATE
4832                                         || o->op_targ == OP_DBSTATE
4833                                         )
4834                                     )
4835                         );
4836                         PL_curcop = (COP*)o;
4837                     }
4838                 }
4839                 nop = nop->op_next;
4840
4841                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4842                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4843
4844                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4845
4846                 if (sv_yes_mark) {
4847                     SV *sv_mrk = get_sv("REGMARK", 1);
4848                     sv_setsv(sv_mrk, sv_yes_mark);
4849                 }
4850
4851                 /* we don't use MULTICALL here as we want to call the
4852                  * first op of the block of interest, rather than the
4853                  * first op of the sub */
4854                 before = SP-PL_stack_base;
4855                 PL_op = nop;
4856                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4857                 SPAGAIN;
4858                 if (SP-PL_stack_base == before)
4859                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4860                 else {
4861                     ret = POPs;
4862                     PUTBACK;
4863                 }
4864
4865                 /* before restoring everything, evaluate the returned
4866                  * value, so that 'uninit' warnings don't use the wrong
4867                  * PL_op or pad. Also need to process any magic vars
4868                  * (e.g. $1) *before* parentheses are restored */
4869
4870                 PL_op = NULL;
4871
4872                 re_sv = NULL;
4873                 if (logical == 0)        /*   (?{})/   */
4874                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4875                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4876                     sw = cBOOL(SvTRUE(ret));
4877                     logical = 0;
4878                 }
4879                 else {                   /*  /(??{})  */
4880                     /*  if its overloaded, let the regex compiler handle
4881                      *  it; otherwise extract regex, or stringify  */
4882                     if (!SvAMAGIC(ret)) {
4883                         SV *sv = ret;
4884                         if (SvROK(sv))
4885                             sv = SvRV(sv);
4886                         if (SvTYPE(sv) == SVt_REGEXP)
4887                             re_sv = (REGEXP*) sv;
4888                         else if (SvSMAGICAL(sv)) {
4889                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4890                             if (mg)
4891                                 re_sv = (REGEXP *) mg->mg_obj;
4892                         }
4893
4894                         /* force any magic, undef warnings here */
4895                         if (!re_sv) {
4896                             ret = sv_mortalcopy(ret);
4897                             (void) SvPV_force_nolen(ret);
4898                         }
4899                     }
4900
4901                 }
4902
4903                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4904
4905                 /* *** Note that at this point we don't restore
4906                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4907                  * be used again soon. This is safe as long as nothing
4908                  * in the regexp code uses the pad ! */
4909                 PL_op = oop;
4910                 PL_curcop = ocurcop;
4911                 PL_regeol = saved_regeol;
4912                 S_regcp_restore(aTHX_ rex, runops_cp);
4913
4914                 if (logical != 2)
4915                     break;
4916             }
4917
4918                 /* only /(??{})/  from now on */
4919                 logical = 0;
4920                 {
4921                     /* extract RE object from returned value; compiling if
4922                      * necessary */
4923
4924                     if (re_sv) {
4925                         re_sv = reg_temp_copy(NULL, re_sv);
4926                     }
4927                     else {
4928                         U32 pm_flags = 0;
4929                         const I32 osize = PL_regsize;
4930
4931                         if (SvUTF8(ret) && IN_BYTES) {
4932                             /* In use 'bytes': make a copy of the octet
4933                              * sequence, but without the flag on */
4934                             STRLEN len;
4935                             const char *const p = SvPV(ret, len);
4936                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4937                         }
4938                         if (rex->intflags & PREGf_USE_RE_EVAL)
4939                             pm_flags |= PMf_USE_RE_EVAL;
4940
4941                         /* if we got here, it should be an engine which
4942                          * supports compiling code blocks and stuff */
4943                         assert(rex->engine && rex->engine->op_comp);
4944                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4945                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4946                                     rex->engine, NULL, NULL,
4947                                     /* copy /msix etc to inner pattern */
4948                                     scan->flags,
4949                                     pm_flags);
4950
4951                         if (!(SvFLAGS(ret)
4952                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4953                                  | SVs_GMG))) {
4954                             /* This isn't a first class regexp. Instead, it's
4955                                caching a regexp onto an existing, Perl visible
4956                                scalar.  */
4957                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4958                         }
4959                         PL_regsize = osize;
4960                         /* safe to do now that any $1 etc has been
4961                          * interpolated into the new pattern string and
4962                          * compiled */
4963                         S_regcp_restore(aTHX_ rex, runops_cp);
4964                     }
4965                     SAVEFREESV(re_sv);
4966                     re = ReANY(re_sv);
4967                 }
4968                 RXp_MATCH_COPIED_off(re);
4969                 re->subbeg = rex->subbeg;
4970                 re->sublen = rex->sublen;
4971                 re->suboffset = rex->suboffset;
4972                 re->subcoffset = rex->subcoffset;
4973                 rei = RXi_GET(re);
4974                 DEBUG_EXECUTE_r(
4975                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4976                         "Matching embedded");
4977                 );
4978                 startpoint = rei->program + 1;
4979                 ST.close_paren = 0; /* only used for GOSUB */
4980
4981         eval_recurse_doit: /* Share code with GOSUB below this line */
4982                 /* run the pattern returned from (??{...}) */
4983                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4984                 REGCP_SET(ST.lastcp);
4985
4986                 re->lastparen = 0;
4987                 re->lastcloseparen = 0;
4988
4989                 PL_regsize = 0;
4990
4991                 /* XXXX This is too dramatic a measure... */
4992                 PL_reg_maxiter = 0;
4993
4994                 ST.toggle_reg_flags = PL_reg_flags;
4995                 if (RX_UTF8(re_sv))
4996                     PL_reg_flags |= RF_utf8;
4997                 else
4998                     PL_reg_flags &= ~RF_utf8;
4999                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5000
5001                 ST.prev_rex = rex_sv;
5002                 ST.prev_curlyx = cur_curlyx;
5003                 rex_sv = re_sv;
5004                 SET_reg_curpm(rex_sv);
5005                 rex = re;
5006                 rexi = rei;
5007                 cur_curlyx = NULL;
5008                 ST.B = next;
5009                 ST.prev_eval = cur_eval;
5010                 cur_eval = st;
5011                 /* now continue from first node in postoned RE */
5012                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5013                 assert(0); /* NOTREACHED */
5014         }
5015
5016         case EVAL_AB: /* cleanup after a successful (??{A})B */
5017             /* note: this is called twice; first after popping B, then A */
5018             PL_reg_flags ^= ST.toggle_reg_flags;
5019             rex_sv = ST.prev_rex;
5020             SET_reg_curpm(rex_sv);
5021             rex = ReANY(rex_sv);
5022             rexi = RXi_GET(rex);
5023             regcpblow(ST.cp);
5024             cur_eval = ST.prev_eval;
5025             cur_curlyx = ST.prev_curlyx;
5026
5027             /* XXXX This is too dramatic a measure... */
5028             PL_reg_maxiter = 0;
5029             if ( nochange_depth )
5030                 nochange_depth--;
5031             sayYES;
5032
5033
5034         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5035             /* note: this is called twice; first after popping B, then A */
5036             PL_reg_flags ^= ST.toggle_reg_flags;
5037             rex_sv = ST.prev_rex;
5038             SET_reg_curpm(rex_sv);
5039             rex = ReANY(rex_sv);
5040             rexi = RXi_GET(rex);
5041
5042             REGCP_UNWIND(ST.lastcp);
5043             regcppop(rex);
5044             cur_eval = ST.prev_eval;
5045             cur_curlyx = ST.prev_curlyx;
5046             /* XXXX This is too dramatic a measure... */
5047             PL_reg_maxiter = 0;
5048             if ( nochange_depth )
5049                 nochange_depth--;
5050             sayNO_SILENT;
5051 #undef ST
5052
5053         case OPEN: /*  (  */
5054             n = ARG(scan);  /* which paren pair */
5055             rex->offs[n].start_tmp = locinput - PL_bostr;
5056             if (n > PL_regsize)
5057                 PL_regsize = n;
5058             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5059                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5060                 PTR2UV(rex),
5061                 PTR2UV(rex->offs),
5062                 (UV)n,
5063                 (IV)rex->offs[n].start_tmp,
5064                 (UV)PL_regsize
5065             ));
5066             lastopen = n;
5067             break;
5068
5069 /* XXX really need to log other places start/end are set too */
5070 #define CLOSE_CAPTURE \
5071     rex->offs[n].start = rex->offs[n].start_tmp; \
5072     rex->offs[n].end = locinput - PL_bostr; \
5073     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5074         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5075         PTR2UV(rex), \
5076         PTR2UV(rex->offs), \
5077         (UV)n, \
5078         (IV)rex->offs[n].start, \
5079         (IV)rex->offs[n].end \
5080     ))
5081
5082         case CLOSE:  /*  )  */
5083             n = ARG(scan);  /* which paren pair */
5084             CLOSE_CAPTURE;
5085             /*if (n > PL_regsize)
5086                 PL_regsize = n;*/
5087             if (n > rex->lastparen)
5088                 rex->lastparen = n;
5089             rex->lastcloseparen = n;
5090             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5091                 goto fake_end;
5092             }
5093             break;
5094
5095         case ACCEPT:  /*  (*ACCEPT)  */
5096             if (ARG(scan)){
5097                 regnode *cursor;
5098                 for (cursor=scan;
5099                      cursor && OP(cursor)!=END;
5100                      cursor=regnext(cursor))
5101                 {
5102                     if ( OP(cursor)==CLOSE ){
5103                         n = ARG(cursor);
5104                         if ( n <= lastopen ) {
5105                             CLOSE_CAPTURE;
5106                             /*if (n > PL_regsize)
5107                             PL_regsize = n;*/
5108                             if (n > rex->lastparen)
5109                                 rex->lastparen = n;
5110                             rex->lastcloseparen = n;
5111                             if ( n == ARG(scan) || (cur_eval &&
5112                                 cur_eval->u.eval.close_paren == n))
5113                                 break;
5114                         }
5115                     }
5116                 }
5117             }
5118             goto fake_end;
5119             /*NOTREACHED*/
5120
5121         case GROUPP:  /*  (?(1))  */
5122             n = ARG(scan);  /* which paren pair */
5123             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5124             break;
5125
5126         case NGROUPP:  /*  (?(<name>))  */
5127             /* reg_check_named_buff_matched returns 0 for no match */
5128             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5129             break;
5130
5131         case INSUBP:   /*  (?(R))  */
5132             n = ARG(scan);
5133             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5134             break;
5135
5136         case DEFINEP:  /*  (?(DEFINE))  */
5137             sw = 0;
5138             break;
5139
5140         case IFTHEN:   /*  (?(cond)A|B)  */
5141             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5142             if (sw)
5143                 next = NEXTOPER(NEXTOPER(scan));
5144             else {
5145                 next = scan + ARG(scan);
5146                 if (OP(next) == IFTHEN) /* Fake one. */
5147                     next = NEXTOPER(NEXTOPER(next));
5148             }
5149             break;
5150
5151         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5152             logical = scan->flags;
5153             break;
5154
5155 /*******************************************************************
5156
5157 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5158 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5159 STAR/PLUS/CURLY/CURLYN are used instead.)
5160
5161 A*B is compiled as <CURLYX><A><WHILEM><B>
5162
5163 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5164 state, which contains the current count, initialised to -1. It also sets
5165 cur_curlyx to point to this state, with any previous value saved in the
5166 state block.
5167
5168 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5169 since the pattern may possibly match zero times (i.e. it's a while {} loop
5170 rather than a do {} while loop).
5171
5172 Each entry to WHILEM represents a successful match of A. The count in the
5173 CURLYX block is incremented, another WHILEM state is pushed, and execution
5174 passes to A or B depending on greediness and the current count.
5175
5176 For example, if matching against the string a1a2a3b (where the aN are
5177 substrings that match /A/), then the match progresses as follows: (the
5178 pushed states are interspersed with the bits of strings matched so far):
5179
5180     <CURLYX cnt=-1>
5181     <CURLYX cnt=0><WHILEM>
5182     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5183     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5184     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5185     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5186
5187 (Contrast this with something like CURLYM, which maintains only a single
5188 backtrack state:
5189
5190     <CURLYM cnt=0> a1
5191     a1 <CURLYM cnt=1> a2
5192     a1 a2 <CURLYM cnt=2> a3
5193     a1 a2 a3 <CURLYM cnt=3> b
5194 )
5195
5196 Each WHILEM state block marks a point to backtrack to upon partial failure
5197 of A or B, and also contains some minor state data related to that
5198 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5199 overall state, such as the count, and pointers to the A and B ops.
5200
5201 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5202 must always point to the *current* CURLYX block, the rules are:
5203
5204 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5205 and set cur_curlyx to point the new block.
5206
5207 When popping the CURLYX block after a successful or unsuccessful match,
5208 restore the previous cur_curlyx.
5209
5210 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5211 to the outer one saved in the CURLYX block.
5212
5213 When popping the WHILEM block after a successful or unsuccessful B match,
5214 restore the previous cur_curlyx.
5215
5216 Here's an example for the pattern (AI* BI)*BO
5217 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5218
5219 cur_
5220 curlyx backtrack stack
5221 ------ ---------------
5222 NULL
5223 CO     <CO prev=NULL> <WO>
5224 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5225 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5226 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5227
5228 At this point the pattern succeeds, and we work back down the stack to
5229 clean up, restoring as we go:
5230
5231 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5232 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5233 CO     <CO prev=NULL> <WO>
5234 NULL
5235
5236 *******************************************************************/
5237
5238 #define ST st->u.curlyx
5239
5240         case CURLYX:    /* start of /A*B/  (for complex A) */
5241         {
5242             /* No need to save/restore up to this paren */
5243             I32 parenfloor = scan->flags;
5244
5245             assert(next); /* keep Coverity happy */
5246             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5247                 next += ARG(next);
5248
5249             /* XXXX Probably it is better to teach regpush to support
5250                parenfloor > PL_regsize... */
5251             if (parenfloor > (I32)rex->lastparen)
5252                 parenfloor = rex->lastparen; /* Pessimization... */
5253
5254             ST.prev_curlyx= cur_curlyx;
5255             cur_curlyx = st;
5256             ST.cp = PL_savestack_ix;
5257
5258             /* these fields contain the state of the current curly.
5259              * they are accessed by subsequent WHILEMs */
5260             ST.parenfloor = parenfloor;
5261             ST.me = scan;
5262             ST.B = next;
5263             ST.minmod = minmod;
5264             minmod = 0;
5265             ST.count = -1;      /* this will be updated by WHILEM */
5266             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5267
5268             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5269             assert(0); /* NOTREACHED */
5270         }
5271
5272         case CURLYX_end: /* just finished matching all of A*B */
5273             cur_curlyx = ST.prev_curlyx;
5274             sayYES;
5275             assert(0); /* NOTREACHED */
5276
5277         case CURLYX_end_fail: /* just failed to match all of A*B */
5278             regcpblow(ST.cp);
5279             cur_curlyx = ST.prev_curlyx;
5280             sayNO;
5281             assert(0); /* NOTREACHED */
5282
5283
5284 #undef ST
5285 #define ST st->u.whilem
5286
5287         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5288         {
5289             /* see the discussion above about CURLYX/WHILEM */
5290             I32 n;
5291             int min = ARG1(cur_curlyx->u.curlyx.me);
5292             int max = ARG2(cur_curlyx->u.curlyx.me);
5293             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5294
5295             assert(cur_curlyx); /* keep Coverity happy */
5296             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5297             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5298             ST.cache_offset = 0;
5299             ST.cache_mask = 0;
5300
5301
5302             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5303                   "%*s  whilem: matched %ld out of %d..%d\n",
5304                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5305             );
5306
5307             /* First just match a string of min A's. */
5308
5309             if (n < min) {
5310                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5311                 cur_curlyx->u.curlyx.lastloc = locinput;
5312                 REGCP_SET(ST.lastcp);
5313
5314                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5315                 assert(0); /* NOTREACHED */
5316             }
5317
5318             /* If degenerate A matches "", assume A done. */
5319
5320             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5321                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5322                    "%*s  whilem: empty match detected, trying continuation...\n",
5323                    REPORT_CODE_OFF+depth*2, "")
5324                 );
5325                 goto do_whilem_B_max;
5326             }
5327
5328             /* super-linear cache processing */
5329
5330             if (scan->flags) {
5331
5332                 if (!PL_reg_maxiter) {
5333                     /* start the countdown: Postpone detection until we
5334                      * know the match is not *that* much linear. */
5335                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5336                     /* possible overflow for long strings and many CURLYX's */
5337                     if (PL_reg_maxiter < 0)
5338                         PL_reg_maxiter = I32_MAX;
5339                     PL_reg_leftiter = PL_reg_maxiter;
5340                 }
5341
5342                 if (PL_reg_leftiter-- == 0) {
5343                     /* initialise cache */
5344                     const I32 size = (PL_reg_maxiter + 7)/8;
5345                     if (PL_reg_poscache) {
5346                         if ((I32)PL_reg_poscache_size < size) {
5347                             Renew(PL_reg_poscache, size, char);
5348                             PL_reg_poscache_size = size;
5349                         }
5350                         Zero(PL_reg_poscache, size, char);
5351                     }
5352                     else {
5353                         PL_reg_poscache_size = size;
5354                         Newxz(PL_reg_poscache, size, char);
5355                     }
5356                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5357       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5358                               PL_colors[4], PL_colors[5])
5359                     );
5360                 }
5361
5362                 if (PL_reg_leftiter < 0) {
5363                     /* have we already failed at this position? */
5364                     I32 offset, mask;
5365                     offset  = (scan->flags & 0xf) - 1
5366                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5367                     mask    = 1 << (offset % 8);
5368                     offset /= 8;
5369                     if (PL_reg_poscache[offset] & mask) {
5370                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5371                             "%*s  whilem: (cache) already tried at this position...\n",
5372                             REPORT_CODE_OFF+depth*2, "")
5373                         );
5374                         sayNO; /* cache records failure */
5375                     }
5376                     ST.cache_offset = offset;
5377                     ST.cache_mask   = mask;
5378                 }
5379             }
5380
5381             /* Prefer B over A for minimal matching. */
5382
5383             if (cur_curlyx->u.curlyx.minmod) {
5384                 ST.save_curlyx = cur_curlyx;
5385                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5386                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5387                 REGCP_SET(ST.lastcp);
5388                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5389                                     locinput);
5390                 assert(0); /* NOTREACHED */
5391             }
5392
5393             /* Prefer A over B for maximal matching. */
5394
5395             if (n < max) { /* More greed allowed? */
5396                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5397                 cur_curlyx->u.curlyx.lastloc = locinput;
5398                 REGCP_SET(ST.lastcp);
5399                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5400                 assert(0); /* NOTREACHED */
5401             }
5402             goto do_whilem_B_max;
5403         }
5404         assert(0); /* NOTREACHED */
5405
5406         case WHILEM_B_min: /* just matched B in a minimal match */
5407         case WHILEM_B_max: /* just matched B in a maximal match */
5408             cur_curlyx = ST.save_curlyx;
5409             sayYES;
5410             assert(0); /* NOTREACHED */
5411
5412         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5413             cur_curlyx = ST.save_curlyx;
5414             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5415             cur_curlyx->u.curlyx.count--;
5416             CACHEsayNO;
5417             assert(0); /* NOTREACHED */
5418
5419         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5420             /* FALL THROUGH */
5421         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5422             REGCP_UNWIND(ST.lastcp);
5423             regcppop(rex);
5424             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5425             cur_curlyx->u.curlyx.count--;
5426             CACHEsayNO;
5427             assert(0); /* NOTREACHED */
5428
5429         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5430             REGCP_UNWIND(ST.lastcp);
5431             regcppop(rex);      /* Restore some previous $<digit>s? */
5432             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5433                 "%*s  whilem: failed, trying continuation...\n",
5434                 REPORT_CODE_OFF+depth*2, "")
5435             );
5436           do_whilem_B_max:
5437             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5438                 && ckWARN(WARN_REGEXP)
5439                 && !(PL_reg_flags & RF_warned))
5440             {
5441                 PL_reg_flags |= RF_warned;
5442                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5443                      "Complex regular subexpression recursion limit (%d) "
5444                      "exceeded",
5445                      REG_INFTY - 1);
5446             }
5447
5448             /* now try B */
5449             ST.save_curlyx = cur_curlyx;
5450             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5451             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5452                                 locinput);
5453             assert(0); /* NOTREACHED */
5454
5455         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5456             cur_curlyx = ST.save_curlyx;
5457             REGCP_UNWIND(ST.lastcp);
5458             regcppop(rex);
5459
5460             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5461                 /* Maximum greed exceeded */
5462                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5463                     && ckWARN(WARN_REGEXP)
5464                     && !(PL_reg_flags & RF_warned))
5465                 {
5466                     PL_reg_flags |= RF_warned;
5467                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5468                         "Complex regular subexpression recursion "
5469                         "limit (%d) exceeded",
5470                         REG_INFTY - 1);
5471                 }
5472                 cur_curlyx->u.curlyx.count--;
5473                 CACHEsayNO;
5474             }
5475
5476             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5477                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5478             );
5479             /* Try grabbing another A and see if it helps. */
5480             cur_curlyx->u.curlyx.lastloc = locinput;
5481             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5482             REGCP_SET(ST.lastcp);
5483             PUSH_STATE_GOTO(WHILEM_A_min,
5484                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5485                 locinput);
5486             assert(0); /* NOTREACHED */
5487
5488 #undef  ST
5489 #define ST st->u.branch
5490
5491         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5492             next = scan + ARG(scan);
5493             if (next == scan)
5494                 next = NULL;
5495             scan = NEXTOPER(scan);
5496             /* FALL THROUGH */
5497
5498         case BRANCH:        /*  /(...|A|...)/ */
5499             scan = NEXTOPER(scan); /* scan now points to inner node */
5500             ST.lastparen = rex->lastparen;
5501             ST.lastcloseparen = rex->lastcloseparen;
5502             ST.next_branch = next;
5503             REGCP_SET(ST.cp);
5504
5505             /* Now go into the branch */
5506             if (has_cutgroup) {
5507                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5508             } else {
5509                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5510             }
5511             assert(0); /* NOTREACHED */
5512
5513         case CUTGROUP:  /*  /(*THEN)/  */
5514             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5515                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5516             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5517             assert(0); /* NOTREACHED */
5518
5519         case CUTGROUP_next_fail:
5520             do_cutgroup = 1;
5521             no_final = 1;
5522             if (st->u.mark.mark_name)
5523                 sv_commit = st->u.mark.mark_name;
5524             sayNO;
5525             assert(0); /* NOTREACHED */
5526
5527         case BRANCH_next:
5528             sayYES;
5529             assert(0); /* NOTREACHED */
5530
5531         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5532             if (do_cutgroup) {
5533                 do_cutgroup = 0;
5534                 no_final = 0;
5535             }
5536             REGCP_UNWIND(ST.cp);
5537             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5538             scan = ST.next_branch;
5539             /* no more branches? */
5540             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5541                 DEBUG_EXECUTE_r({
5542                     PerlIO_printf( Perl_debug_log,
5543                         "%*s  %sBRANCH failed...%s\n",
5544                         REPORT_CODE_OFF+depth*2, "",
5545                         PL_colors[4],
5546                         PL_colors[5] );
5547                 });
5548                 sayNO_SILENT;
5549             }
5550             continue; /* execute next BRANCH[J] op */
5551             assert(0); /* NOTREACHED */
5552
5553         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5554             minmod = 1;
5555             break;
5556
5557 #undef  ST
5558 #define ST st->u.curlym
5559
5560         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5561
5562             /* This is an optimisation of CURLYX that enables us to push
5563              * only a single backtracking state, no matter how many matches
5564              * there are in {m,n}. It relies on the pattern being constant
5565              * length, with no parens to influence future backrefs
5566              */
5567
5568             ST.me = scan;
5569             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5570
5571             ST.lastparen      = rex->lastparen;
5572             ST.lastcloseparen = rex->lastcloseparen;
5573
5574             /* if paren positive, emulate an OPEN/CLOSE around A */
5575             if (ST.me->flags) {
5576                 U32 paren = ST.me->flags;
5577                 if (paren > PL_regsize)
5578                     PL_regsize = paren;
5579                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5580             }
5581             ST.A = scan;
5582             ST.B = next;
5583             ST.alen = 0;
5584             ST.count = 0;
5585             ST.minmod = minmod;
5586             minmod = 0;
5587             ST.c1 = CHRTEST_UNINIT;
5588             REGCP_SET(ST.cp);
5589
5590             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5591                 goto curlym_do_B;
5592
5593           curlym_do_A: /* execute the A in /A{m,n}B/  */
5594             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5595             assert(0); /* NOTREACHED */
5596
5597         case CURLYM_A: /* we've just matched an A */
5598             ST.count++;
5599             /* after first match, determine A's length: u.curlym.alen */
5600             if (ST.count == 1) {
5601                 if (PL_reg_match_utf8) {
5602                     char *s = st->locinput;
5603                     while (s < locinput) {
5604                         ST.alen++;
5605                         s += UTF8SKIP(s);
5606                     }
5607                 }
5608                 else {
5609                     ST.alen = locinput - st->locinput;
5610                 }
5611                 if (ST.alen == 0)
5612                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5613             }
5614             DEBUG_EXECUTE_r(
5615                 PerlIO_printf(Perl_debug_log,
5616                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5617                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5618                           (IV) ST.count, (IV)ST.alen)
5619             );
5620
5621             if (cur_eval && cur_eval->u.eval.close_paren &&
5622                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5623                 goto fake_end;
5624
5625             {
5626                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5627                 if ( max == REG_INFTY || ST.count < max )
5628                     goto curlym_do_A; /* try to match another A */
5629             }
5630             goto curlym_do_B; /* try to match B */
5631
5632         case CURLYM_A_fail: /* just failed to match an A */
5633             REGCP_UNWIND(ST.cp);
5634
5635             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5636                 || (cur_eval && cur_eval->u.eval.close_paren &&
5637                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5638                 sayNO;
5639
5640           curlym_do_B: /* execute the B in /A{m,n}B/  */
5641             if (ST.c1 == CHRTEST_UNINIT) {
5642                 /* calculate c1 and c2 for possible match of 1st char
5643                  * following curly */
5644                 ST.c1 = ST.c2 = CHRTEST_VOID;
5645                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5646                     regnode *text_node = ST.B;
5647                     if (! HAS_TEXT(text_node))
5648                         FIND_NEXT_IMPT(text_node);
5649                     /* this used to be
5650
5651                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5652
5653                         But the former is redundant in light of the latter.
5654
5655                         if this changes back then the macro for
5656                         IS_TEXT and friends need to change.
5657                      */
5658                     if (PL_regkind[OP(text_node)] == EXACT) {
5659                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5660                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5661                         {
5662                             sayNO;
5663                         }
5664                     }
5665                 }
5666             }
5667
5668             DEBUG_EXECUTE_r(
5669                 PerlIO_printf(Perl_debug_log,
5670                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5671                     (int)(REPORT_CODE_OFF+(depth*2)),
5672                     "", (IV)ST.count)
5673                 );
5674             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5675                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5676                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5677                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5678                     {
5679                         /* simulate B failing */
5680                         DEBUG_OPTIMISE_r(
5681                             PerlIO_printf(Perl_debug_log,
5682                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5683                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5684                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5685                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5686                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5687                         );
5688                         state_num = CURLYM_B_fail;
5689                         goto reenter_switch;
5690                     }
5691                 }
5692                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5693                     /* simulate B failing */
5694                     DEBUG_OPTIMISE_r(
5695                         PerlIO_printf(Perl_debug_log,
5696                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5697                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5698                             (int) nextchr, ST.c1, ST.c2)
5699                     );
5700                     state_num = CURLYM_B_fail;
5701                     goto reenter_switch;
5702                 }
5703             }
5704
5705             if (ST.me->flags) {
5706                 /* emulate CLOSE: mark current A as captured */
5707                 I32 paren = ST.me->flags;
5708                 if (ST.count) {
5709                     rex->offs[paren].start
5710                         = HOPc(locinput, -ST.alen) - PL_bostr;
5711                     rex->offs[paren].end = locinput - PL_bostr;
5712                     if ((U32)paren > rex->lastparen)
5713                         rex->lastparen = paren;
5714                     rex->lastcloseparen = paren;
5715                 }
5716                 else
5717                     rex->offs[paren].end = -1;
5718                 if (cur_eval && cur_eval->u.eval.close_paren &&
5719                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5720                 {
5721                     if (ST.count)
5722                         goto fake_end;
5723                     else
5724                         sayNO;
5725                 }
5726             }
5727
5728             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5729             assert(0); /* NOTREACHED */
5730
5731         case CURLYM_B_fail: /* just failed to match a B */
5732             REGCP_UNWIND(ST.cp);
5733             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5734             if (ST.minmod) {
5735                 I32 max = ARG2(ST.me);
5736                 if (max != REG_INFTY && ST.count == max)
5737                     sayNO;
5738                 goto curlym_do_A; /* try to match a further A */
5739             }
5740             /* backtrack one A */
5741             if (ST.count == ARG1(ST.me) /* min */)
5742                 sayNO;
5743             ST.count--;
5744             SET_locinput(HOPc(locinput, -ST.alen));
5745             goto curlym_do_B; /* try to match B */
5746
5747 #undef ST
5748 #define ST st->u.curly
5749
5750 #define CURLY_SETPAREN(paren, success) \
5751     if (paren) { \
5752         if (success) { \
5753             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5754             rex->offs[paren].end = locinput - PL_bostr; \
5755             if (paren > rex->lastparen) \
5756                 rex->lastparen = paren; \
5757             rex->lastcloseparen = paren; \
5758         } \
5759         else { \
5760             rex->offs[paren].end = -1; \
5761             rex->lastparen      = ST.lastparen; \
5762             rex->lastcloseparen = ST.lastcloseparen; \
5763         } \
5764     }
5765
5766         case STAR:              /*  /A*B/ where A is width 1 char */
5767             ST.paren = 0;
5768             ST.min = 0;
5769             ST.max = REG_INFTY;
5770             scan = NEXTOPER(scan);
5771             goto repeat;
5772
5773         case PLUS:              /*  /A+B/ where A is width 1 char */
5774             ST.paren = 0;
5775             ST.min = 1;
5776             ST.max = REG_INFTY;
5777             scan = NEXTOPER(scan);
5778             goto repeat;
5779
5780         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5781             ST.paren = scan->flags;     /* Which paren to set */
5782             ST.lastparen      = rex->lastparen;
5783             ST.lastcloseparen = rex->lastcloseparen;
5784             if (ST.paren > PL_regsize)
5785                 PL_regsize = ST.paren;
5786             ST.min = ARG1(scan);  /* min to match */
5787             ST.max = ARG2(scan);  /* max to match */
5788             if (cur_eval && cur_eval->u.eval.close_paren &&
5789                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5790                 ST.min=1;
5791                 ST.max=1;
5792             }
5793             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5794             goto repeat;
5795
5796         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5797             ST.paren = 0;
5798             ST.min = ARG1(scan);  /* min to match */
5799             ST.max = ARG2(scan);  /* max to match */
5800             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5801           repeat:
5802             /*
5803             * Lookahead to avoid useless match attempts
5804             * when we know what character comes next.
5805             *
5806             * Used to only do .*x and .*?x, but now it allows
5807             * for )'s, ('s and (?{ ... })'s to be in the way
5808             * of the quantifier and the EXACT-like node.  -- japhy
5809             */
5810
5811             assert(ST.min <= ST.max);
5812             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5813                 ST.c1 = ST.c2 = CHRTEST_VOID;
5814             }
5815             else {
5816                 regnode *text_node = next;
5817
5818                 if (! HAS_TEXT(text_node))
5819                     FIND_NEXT_IMPT(text_node);
5820
5821                 if (! HAS_TEXT(text_node))
5822                     ST.c1 = ST.c2 = CHRTEST_VOID;
5823                 else {
5824                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5825                         ST.c1 = ST.c2 = CHRTEST_VOID;
5826                     }
5827                     else {
5828
5829                     /*  Currently we only get here when
5830
5831                         PL_rekind[OP(text_node)] == EXACT
5832
5833                         if this changes back then the macro for IS_TEXT and
5834                         friends need to change. */
5835                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5836                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5837                         {
5838                             sayNO;
5839                         }
5840                     }
5841                 }
5842             }
5843
5844             ST.A = scan;
5845             ST.B = next;
5846             if (minmod) {
5847                 char *li = locinput;
5848                 minmod = 0;
5849                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5850                     sayNO;
5851                 SET_locinput(li);
5852                 ST.count = ST.min;
5853                 REGCP_SET(ST.cp);
5854                 if (ST.c1 == CHRTEST_VOID)
5855                     goto curly_try_B_min;
5856
5857                 ST.oldloc = locinput;
5858
5859                 /* set ST.maxpos to the furthest point along the
5860                  * string that could possibly match */
5861                 if  (ST.max == REG_INFTY) {
5862                     ST.maxpos = PL_regeol - 1;
5863                     if (utf8_target)
5864                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5865                             ST.maxpos--;
5866                 }
5867                 else if (utf8_target) {
5868                     int m = ST.max - ST.min;
5869                     for (ST.maxpos = locinput;
5870                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5871                         ST.maxpos += UTF8SKIP(ST.maxpos);
5872                 }
5873                 else {
5874                     ST.maxpos = locinput + ST.max - ST.min;
5875                     if (ST.maxpos >= PL_regeol)
5876                         ST.maxpos = PL_regeol - 1;
5877                 }
5878                 goto curly_try_B_min_known;
5879
5880             }
5881             else {
5882                 /* avoid taking address of locinput, so it can remain
5883                  * a register var */
5884                 char *li = locinput;
5885                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5886                 if (ST.count < ST.min)
5887                     sayNO;
5888                 SET_locinput(li);
5889                 if ((ST.count > ST.min)
5890                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5891                 {
5892                     /* A{m,n} must come at the end of the string, there's
5893                      * no point in backing off ... */
5894                     ST.min = ST.count;
5895                     /* ...except that $ and \Z can match before *and* after
5896                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5897                        We may back off by one in this case. */
5898                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5899                         ST.min--;
5900                 }
5901                 REGCP_SET(ST.cp);
5902                 goto curly_try_B_max;
5903             }
5904             assert(0); /* NOTREACHED */
5905
5906
5907         case CURLY_B_min_known_fail:
5908             /* failed to find B in a non-greedy match where c1,c2 valid */
5909
5910             REGCP_UNWIND(ST.cp);
5911             if (ST.paren) {
5912                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5913             }
5914             /* Couldn't or didn't -- move forward. */
5915             ST.oldloc = locinput;
5916             if (utf8_target)
5917                 locinput += UTF8SKIP(locinput);
5918             else
5919                 locinput++;
5920             ST.count++;
5921           curly_try_B_min_known:
5922              /* find the next place where 'B' could work, then call B */
5923             {
5924                 int n;
5925                 if (utf8_target) {
5926                     n = (ST.oldloc == locinput) ? 0 : 1;
5927                     if (ST.c1 == ST.c2) {
5928                         /* set n to utf8_distance(oldloc, locinput) */
5929                         while (locinput <= ST.maxpos
5930                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5931                         {
5932                             locinput += UTF8SKIP(locinput);
5933                             n++;
5934                         }
5935                     }
5936                     else {
5937                         /* set n to utf8_distance(oldloc, locinput) */
5938                         while (locinput <= ST.maxpos
5939                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5940                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5941                         {
5942                             locinput += UTF8SKIP(locinput);
5943                             n++;
5944                         }
5945                     }
5946                 }
5947                 else {  /* Not utf8_target */
5948                     if (ST.c1 == ST.c2) {
5949                         while (locinput <= ST.maxpos &&
5950                                UCHARAT(locinput) != ST.c1)
5951                             locinput++;
5952                     }
5953                     else {
5954                         while (locinput <= ST.maxpos
5955                                && UCHARAT(locinput) != ST.c1
5956                                && UCHARAT(locinput) != ST.c2)
5957                             locinput++;
5958                     }
5959                     n = locinput - ST.oldloc;
5960                 }
5961                 if (locinput > ST.maxpos)
5962                     sayNO;
5963                 if (n) {
5964                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5965                      * at b; check that everything between oldloc and
5966                      * locinput matches */
5967                     char *li = ST.oldloc;
5968                     ST.count += n;
5969                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5970                         sayNO;
5971                     assert(n == REG_INFTY || locinput == li);
5972                 }
5973                 CURLY_SETPAREN(ST.paren, ST.count);
5974                 if (cur_eval && cur_eval->u.eval.close_paren &&
5975                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5976                     goto fake_end;
5977                 }
5978                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5979             }
5980             assert(0); /* NOTREACHED */
5981
5982
5983         case CURLY_B_min_fail:
5984             /* failed to find B in a non-greedy match where c1,c2 invalid */
5985
5986             REGCP_UNWIND(ST.cp);
5987             if (ST.paren) {
5988                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5989             }
5990             /* failed -- move forward one */
5991             {
5992                 char *li = locinput;
5993                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5994                     sayNO;
5995                 }
5996                 locinput = li;
5997             }
5998             {
5999                 ST.count++;
6000                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6001                         ST.count > 0)) /* count overflow ? */
6002                 {
6003                   curly_try_B_min:
6004                     CURLY_SETPAREN(ST.paren, ST.count);
6005                     if (cur_eval && cur_eval->u.eval.close_paren &&
6006                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6007                         goto fake_end;
6008                     }
6009                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6010                 }
6011             }
6012             sayNO;
6013             assert(0); /* NOTREACHED */
6014
6015
6016         curly_try_B_max:
6017             /* a successful greedy match: now try to match B */
6018             if (cur_eval && cur_eval->u.eval.close_paren &&
6019                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6020                 goto fake_end;
6021             }
6022             {
6023                 bool could_match = locinput < PL_regeol;
6024
6025                 /* If it could work, try it. */
6026                 if (ST.c1 != CHRTEST_VOID && could_match) {
6027                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6028                     {
6029                         could_match = memEQ(locinput,
6030                                             ST.c1_utf8,
6031                                             UTF8SKIP(locinput))
6032                                     || memEQ(locinput,
6033                                              ST.c2_utf8,
6034                                              UTF8SKIP(locinput));
6035                     }
6036                     else {
6037                         could_match = UCHARAT(locinput) == ST.c1
6038                                       || UCHARAT(locinput) == ST.c2;
6039                     }
6040                 }
6041                 if (ST.c1 == CHRTEST_VOID || could_match) {
6042                     CURLY_SETPAREN(ST.paren, ST.count);
6043                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6044                     assert(0); /* NOTREACHED */
6045                 }
6046             }
6047             /* FALL THROUGH */
6048
6049         case CURLY_B_max_fail:
6050             /* failed to find B in a greedy match */
6051
6052             REGCP_UNWIND(ST.cp);
6053             if (ST.paren) {
6054                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6055             }
6056             /*  back up. */
6057             if (--ST.count < ST.min)
6058                 sayNO;
6059             locinput = HOPc(locinput, -1);
6060             goto curly_try_B_max;
6061
6062 #undef ST
6063
6064         case END: /*  last op of main pattern  */
6065             fake_end:
6066             if (cur_eval) {
6067                 /* we've just finished A in /(??{A})B/; now continue with B */
6068                 st->u.eval.toggle_reg_flags
6069                             = cur_eval->u.eval.toggle_reg_flags;
6070                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6071
6072                 st->u.eval.prev_rex = rex_sv;           /* inner */
6073                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6074                 rex_sv = cur_eval->u.eval.prev_rex;
6075                 SET_reg_curpm(rex_sv);
6076                 rex = ReANY(rex_sv);
6077                 rexi = RXi_GET(rex);
6078                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6079
6080                 REGCP_SET(st->u.eval.lastcp);
6081
6082                 /* Restore parens of the outer rex without popping the
6083                  * savestack */
6084                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6085
6086                 st->u.eval.prev_eval = cur_eval;
6087                 cur_eval = cur_eval->u.eval.prev_eval;
6088                 DEBUG_EXECUTE_r(
6089                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6090                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6091                 if ( nochange_depth )
6092                     nochange_depth--;
6093
6094                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6095                                     locinput); /* match B */
6096             }
6097
6098             if (locinput < reginfo->till) {
6099                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6100                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6101                                       PL_colors[4],
6102                                       (long)(locinput - PL_reg_starttry),
6103                                       (long)(reginfo->till - PL_reg_starttry),
6104                                       PL_colors[5]));
6105
6106                 sayNO_SILENT;           /* Cannot match: too short. */
6107             }
6108             sayYES;                     /* Success! */
6109
6110         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6111             DEBUG_EXECUTE_r(
6112             PerlIO_printf(Perl_debug_log,
6113                 "%*s  %ssubpattern success...%s\n",
6114                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6115             sayYES;                     /* Success! */
6116
6117 #undef  ST
6118 #define ST st->u.ifmatch
6119
6120         {
6121             char *newstart;
6122
6123         case SUSPEND:   /* (?>A) */
6124             ST.wanted = 1;
6125             newstart = locinput;
6126             goto do_ifmatch;
6127
6128         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6129             ST.wanted = 0;
6130             goto ifmatch_trivial_fail_test;
6131
6132         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6133             ST.wanted = 1;
6134           ifmatch_trivial_fail_test:
6135             if (scan->flags) {
6136                 char * const s = HOPBACKc(locinput, scan->flags);
6137                 if (!s) {
6138                     /* trivial fail */
6139                     if (logical) {
6140                         logical = 0;
6141                         sw = 1 - cBOOL(ST.wanted);
6142                     }
6143                     else if (ST.wanted)
6144                         sayNO;
6145                     next = scan + ARG(scan);
6146                     if (next == scan)
6147                         next = NULL;
6148                     break;
6149                 }
6150                 newstart = s;
6151             }
6152             else
6153                 newstart = locinput;
6154
6155           do_ifmatch:
6156             ST.me = scan;
6157             ST.logical = logical;
6158             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6159
6160             /* execute body of (?...A) */
6161             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6162             assert(0); /* NOTREACHED */
6163         }
6164
6165         case IFMATCH_A_fail: /* body of (?...A) failed */
6166             ST.wanted = !ST.wanted;
6167             /* FALL THROUGH */
6168
6169         case IFMATCH_A: /* body of (?...A) succeeded */
6170             if (ST.logical) {
6171                 sw = cBOOL(ST.wanted);
6172             }
6173             else if (!ST.wanted)
6174                 sayNO;
6175
6176             if (OP(ST.me) != SUSPEND) {
6177                 /* restore old position except for (?>...) */
6178                 locinput = st->locinput;
6179             }
6180             scan = ST.me + ARG(ST.me);
6181             if (scan == ST.me)
6182                 scan = NULL;
6183             continue; /* execute B */
6184
6185 #undef ST
6186
6187         case LONGJMP: /*  alternative with many branches compiles to
6188                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6189             next = scan + ARG(scan);
6190             if (next == scan)
6191                 next = NULL;
6192             break;
6193
6194         case COMMIT:  /*  (*COMMIT)  */
6195             reginfo->cutpoint = PL_regeol;
6196             /* FALLTHROUGH */
6197
6198         case PRUNE:   /*  (*PRUNE)   */
6199             if (!scan->flags)
6200                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6201             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6202             assert(0); /* NOTREACHED */
6203
6204         case COMMIT_next_fail:
6205             no_final = 1;
6206             /* FALLTHROUGH */
6207
6208         case OPFAIL:   /* (*FAIL)  */
6209             sayNO;
6210             assert(0); /* NOTREACHED */
6211
6212 #define ST st->u.mark
6213         case MARKPOINT: /*  (*MARK:foo)  */
6214             ST.prev_mark = mark_state;
6215             ST.mark_name = sv_commit = sv_yes_mark
6216                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6217             mark_state = st;
6218             ST.mark_loc = locinput;
6219             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6220             assert(0); /* NOTREACHED */
6221
6222         case MARKPOINT_next:
6223             mark_state = ST.prev_mark;
6224             sayYES;
6225             assert(0); /* NOTREACHED */
6226
6227         case MARKPOINT_next_fail:
6228             if (popmark && sv_eq(ST.mark_name,popmark))
6229             {
6230                 if (ST.mark_loc > startpoint)
6231                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6232                 popmark = NULL; /* we found our mark */
6233                 sv_commit = ST.mark_name;
6234
6235                 DEBUG_EXECUTE_r({
6236                         PerlIO_printf(Perl_debug_log,
6237                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6238                             REPORT_CODE_OFF+depth*2, "",
6239                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6240                 });
6241             }
6242             mark_state = ST.prev_mark;
6243             sv_yes_mark = mark_state ?
6244                 mark_state->u.mark.mark_name : NULL;
6245             sayNO;
6246             assert(0); /* NOTREACHED */
6247
6248         case SKIP:  /*  (*SKIP)  */
6249             if (scan->flags) {
6250                 /* (*SKIP) : if we fail we cut here*/
6251                 ST.mark_name = NULL;
6252                 ST.mark_loc = locinput;
6253                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6254             } else {
6255                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6256                    otherwise do nothing.  Meaning we need to scan
6257                  */
6258                 regmatch_state *cur = mark_state;
6259                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6260
6261                 while (cur) {
6262                     if ( sv_eq( cur->u.mark.mark_name,
6263                                 find ) )
6264                     {
6265                         ST.mark_name = find;
6266                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6267                     }
6268                     cur = cur->u.mark.prev_mark;
6269                 }
6270             }
6271             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6272             break;
6273
6274         case SKIP_next_fail:
6275             if (ST.mark_name) {
6276                 /* (*CUT:NAME) - Set up to search for the name as we
6277                    collapse the stack*/
6278                 popmark = ST.mark_name;
6279             } else {
6280                 /* (*CUT) - No name, we cut here.*/
6281                 if (ST.mark_loc > startpoint)
6282                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6283                 /* but we set sv_commit to latest mark_name if there
6284                    is one so they can test to see how things lead to this
6285                    cut */
6286                 if (mark_state)
6287                     sv_commit=mark_state->u.mark.mark_name;
6288             }
6289             no_final = 1;
6290             sayNO;
6291             assert(0); /* NOTREACHED */
6292 #undef ST
6293
6294         case LNBREAK: /* \R */
6295             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6296                 locinput += n;
6297             } else
6298                 sayNO;
6299             break;
6300
6301 #define CASE_CLASS(nAmE)                              \
6302         case nAmE:                                    \
6303             if (NEXTCHR_IS_EOS)                       \
6304                 sayNO;                                \
6305             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6306                 locinput += n;                        \
6307             } else                                    \
6308                 sayNO;                                \
6309             break;                                    \
6310         case N##nAmE:                                 \
6311             if (NEXTCHR_IS_EOS)                       \
6312                 sayNO;                                \
6313             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6314                 sayNO;                                \
6315             } else {                                  \
6316                 locinput += UTF8SKIP(locinput);       \
6317             }                                         \
6318             break
6319
6320         CASE_CLASS(VERTWS);  /*  \v \V  */
6321         CASE_CLASS(HORIZWS); /*  \h \H  */
6322 #undef CASE_CLASS
6323
6324         default:
6325             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6326                           PTR2UV(scan), OP(scan));
6327             Perl_croak(aTHX_ "regexp memory corruption");
6328
6329         /* this is a point to jump to in order to increment
6330          * locinput by one character */
6331         increment_locinput:
6332             assert(!NEXTCHR_IS_EOS);
6333             if (utf8_target) {
6334                 locinput += PL_utf8skip[nextchr];
6335                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6336                 if (locinput > PL_regeol)
6337                     sayNO;
6338             }
6339             else
6340                 locinput++;
6341             break;
6342
6343         } /* end switch */
6344
6345         /* switch break jumps here */
6346         scan = next; /* prepare to execute the next op and ... */
6347         continue;    /* ... jump back to the top, reusing st */
6348         assert(0); /* NOTREACHED */
6349
6350       push_yes_state:
6351         /* push a state that backtracks on success */
6352         st->u.yes.prev_yes_state = yes_state;
6353         yes_state = st;
6354         /* FALL THROUGH */
6355       push_state:
6356         /* push a new regex state, then continue at scan  */
6357         {
6358             regmatch_state *newst;
6359
6360             DEBUG_STACK_r({
6361                 regmatch_state *cur = st;
6362                 regmatch_state *curyes = yes_state;
6363                 int curd = depth;
6364                 regmatch_slab *slab = PL_regmatch_slab;
6365                 for (;curd > -1;cur--,curd--) {
6366                     if (cur < SLAB_FIRST(slab)) {
6367                         slab = slab->prev;
6368                         cur = SLAB_LAST(slab);
6369                     }
6370                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6371                         REPORT_CODE_OFF + 2 + depth * 2,"",
6372                         curd, PL_reg_name[cur->resume_state],
6373                         (curyes == cur) ? "yes" : ""
6374                     );
6375                     if (curyes == cur)
6376                         curyes = cur->u.yes.prev_yes_state;
6377                 }
6378             } else
6379                 DEBUG_STATE_pp("push")
6380             );
6381             depth++;
6382             st->locinput = locinput;
6383             newst = st+1;
6384             if (newst >  SLAB_LAST(PL_regmatch_slab))
6385                 newst = S_push_slab(aTHX);
6386             PL_regmatch_state = newst;
6387
6388             locinput = pushinput;
6389             st = newst;
6390             continue;
6391             assert(0); /* NOTREACHED */
6392         }
6393     }
6394
6395     /*
6396     * We get here only if there's trouble -- normally "case END" is
6397     * the terminating point.
6398     */
6399     Perl_croak(aTHX_ "corrupted regexp pointers");
6400     /*NOTREACHED*/
6401     sayNO;
6402
6403 yes:
6404     if (yes_state) {
6405         /* we have successfully completed a subexpression, but we must now
6406          * pop to the state marked by yes_state and continue from there */
6407         assert(st != yes_state);
6408 #ifdef DEBUGGING
6409         while (st != yes_state) {
6410             st--;
6411             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6412                 PL_regmatch_slab = PL_regmatch_slab->prev;
6413                 st = SLAB_LAST(PL_regmatch_slab);
6414             }
6415             DEBUG_STATE_r({
6416                 if (no_final) {
6417                     DEBUG_STATE_pp("pop (no final)");
6418                 } else {
6419                     DEBUG_STATE_pp("pop (yes)");
6420                 }
6421             });
6422             depth--;
6423         }
6424 #else
6425         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6426             || yes_state > SLAB_LAST(PL_regmatch_slab))
6427         {
6428             /* not in this slab, pop slab */
6429             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6430             PL_regmatch_slab = PL_regmatch_slab->prev;
6431             st = SLAB_LAST(PL_regmatch_slab);
6432         }
6433         depth -= (st - yes_state);
6434 #endif
6435         st = yes_state;
6436         yes_state = st->u.yes.prev_yes_state;
6437         PL_regmatch_state = st;
6438
6439         if (no_final)
6440             locinput= st->locinput;
6441         state_num = st->resume_state + no_final;
6442         goto reenter_switch;
6443     }
6444
6445     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6446                           PL_colors[4], PL_colors[5]));
6447
6448     if (PL_reg_state.re_state_eval_setup_done) {
6449         /* each successfully executed (?{...}) block does the equivalent of
6450          *   local $^R = do {...}
6451          * When popping the save stack, all these locals would be undone;
6452          * bypass this by setting the outermost saved $^R to the latest
6453          * value */
6454         if (oreplsv != GvSV(PL_replgv))
6455             sv_setsv(oreplsv, GvSV(PL_replgv));
6456     }
6457     result = 1;
6458     goto final_exit;
6459
6460 no:
6461     DEBUG_EXECUTE_r(
6462         PerlIO_printf(Perl_debug_log,
6463             "%*s  %sfailed...%s\n",
6464             REPORT_CODE_OFF+depth*2, "",
6465             PL_colors[4], PL_colors[5])
6466         );
6467
6468 no_silent:
6469     if (no_final) {
6470         if (yes_state) {
6471             goto yes;
6472         } else {
6473             goto final_exit;
6474         }
6475     }
6476     if (depth) {
6477         /* there's a previous state to backtrack to */
6478         st--;
6479         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6480             PL_regmatch_slab = PL_regmatch_slab->prev;
6481             st = SLAB_LAST(PL_regmatch_slab);
6482         }
6483         PL_regmatch_state = st;
6484         locinput= st->locinput;
6485
6486         DEBUG_STATE_pp("pop");
6487         depth--;
6488         if (yes_state == st)
6489             yes_state = st->u.yes.prev_yes_state;
6490
6491         state_num = st->resume_state + 1; /* failure = success + 1 */
6492         goto reenter_switch;
6493     }
6494     result = 0;
6495
6496   final_exit:
6497     if (rex->intflags & PREGf_VERBARG_SEEN) {
6498         SV *sv_err = get_sv("REGERROR", 1);
6499         SV *sv_mrk = get_sv("REGMARK", 1);
6500         if (result) {
6501             sv_commit = &PL_sv_no;
6502             if (!sv_yes_mark)
6503                 sv_yes_mark = &PL_sv_yes;
6504         } else {
6505             if (!sv_commit)
6506                 sv_commit = &PL_sv_yes;
6507             sv_yes_mark = &PL_sv_no;
6508         }
6509         sv_setsv(sv_err, sv_commit);
6510         sv_setsv(sv_mrk, sv_yes_mark);
6511     }
6512
6513
6514     if (last_pushed_cv) {
6515         dSP;
6516         POP_MULTICALL;
6517         PERL_UNUSED_VAR(SP);
6518     }
6519
6520     /* clean up; in particular, free all slabs above current one */
6521     LEAVE_SCOPE(oldsave);
6522
6523     assert(!result ||  locinput - PL_bostr >= 0);
6524     return result ?  locinput - PL_bostr : -1;
6525 }
6526
6527 /*
6528  - regrepeat - repeatedly match something simple, report how many
6529  *
6530  * What 'simple' means is a node which can be the operand of a quantifier like
6531  * '+', or {1,3}
6532  *
6533  * startposp - pointer a pointer to the start position.  This is updated
6534  *             to point to the byte following the highest successful
6535  *             match.
6536  * p         - the regnode to be repeatedly matched against.
6537  * max       - maximum number of things to match.
6538  * depth     - (for debugging) backtracking depth.
6539  */
6540 STATIC I32
6541 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6542 {
6543     dVAR;
6544     char *scan;     /* Pointer to current position in target string */
6545     I32 c;
6546     char *loceol = PL_regeol;   /* local version */
6547     I32 hardcount = 0;  /* How many matches so far */
6548     bool utf8_target = PL_reg_match_utf8;
6549     UV utf8_flags;
6550 #ifndef DEBUGGING
6551     PERL_UNUSED_ARG(depth);
6552 #endif
6553
6554     PERL_ARGS_ASSERT_REGREPEAT;
6555
6556     scan = *startposp;
6557     if (max == REG_INFTY)
6558         max = I32_MAX;
6559     else if (! utf8_target && scan + max < loceol)
6560         loceol = scan + max;
6561
6562     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6563      * to the maximum of how far we should go in it (leaving it set to the real
6564      * end, if the maximum permissible would take us beyond that).  This allows
6565      * us to make the loop exit condition that we haven't gone past <loceol> to
6566      * also mean that we haven't exceeded the max permissible count, saving a
6567      * test each time through the loop.  But it assumes that the OP matches a
6568      * single byte, which is true for most of the OPs below when applied to a
6569      * non-UTF-8 target.  Those relatively few OPs that don't have this
6570      * characteristic will have to compensate.
6571      *
6572      * There is no adjustment for UTF-8 targets, as the number of bytes per
6573      * character varies.  OPs will have to test both that the count is less
6574      * than the max permissible (using <hardcount> to keep track), and that we
6575      * are still within the bounds of the string (using <loceol>.  A few OPs
6576      * match a single byte no matter what the encoding.  They can omit the max
6577      * test if, for the UTF-8 case, they do the adjustment that was skipped
6578      * above.
6579      *
6580      * Thus, the code above sets things up for the common case; and exceptional
6581      * cases need extra work; the common case is to make sure <scan> doesn't
6582      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6583      * count doesn't exceed the maximum permissible */
6584
6585     switch (OP(p)) {
6586     case REG_ANY:
6587         if (utf8_target) {
6588             while (scan < loceol && hardcount < max && *scan != '\n') {
6589                 scan += UTF8SKIP(scan);
6590                 hardcount++;
6591             }
6592         } else {
6593             while (scan < loceol && *scan != '\n')
6594                 scan++;
6595         }
6596         break;
6597     case SANY:
6598         if (utf8_target) {
6599             while (scan < loceol && hardcount < max) {
6600                 scan += UTF8SKIP(scan);
6601                 hardcount++;
6602             }
6603         }
6604         else
6605             scan = loceol;
6606         break;
6607     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6608         if (utf8_target && scan + max < loceol) {
6609
6610             /* <loceol> hadn't been adjusted in the UTF-8 case */
6611             scan +=  max;
6612         }
6613         else {
6614             scan = loceol;
6615         }
6616         break;
6617     case EXACT:
6618         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6619
6620         c = (U8)*STRING(p);
6621
6622         /* Can use a simple loop if the pattern char to match on is invariant
6623          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6624          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6625          * true iff it doesn't matter if the argument is in UTF-8 or not */
6626         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6627             if (utf8_target && scan + max < loceol) {
6628                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6629                  * since here, to match at all, 1 char == 1 byte */
6630                 loceol = scan + max;
6631             }
6632             while (scan < loceol && UCHARAT(scan) == c) {
6633                 scan++;
6634             }
6635         }
6636         else if (UTF_PATTERN) {
6637             if (utf8_target) {
6638                 STRLEN scan_char_len;
6639
6640                 /* When both target and pattern are UTF-8, we have to do
6641                  * string EQ */
6642                 while (hardcount < max
6643                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6644                        && scan_char_len <= STR_LEN(p)
6645                        && memEQ(scan, STRING(p), scan_char_len))
6646                 {
6647                     scan += scan_char_len;
6648                     hardcount++;
6649                 }
6650             }
6651             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6652
6653                 /* Target isn't utf8; convert the character in the UTF-8
6654                  * pattern to non-UTF8, and do a simple loop */
6655                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6656                 while (scan < loceol && UCHARAT(scan) == c) {
6657                     scan++;
6658                 }
6659             } /* else pattern char is above Latin1, can't possibly match the
6660                  non-UTF-8 target */
6661         }
6662         else {
6663
6664             /* Here, the string must be utf8; pattern isn't, and <c> is
6665              * different in utf8 than not, so can't compare them directly.
6666              * Outside the loop, find the two utf8 bytes that represent c, and
6667              * then look for those in sequence in the utf8 string */
6668             U8 high = UTF8_TWO_BYTE_HI(c);
6669             U8 low = UTF8_TWO_BYTE_LO(c);
6670
6671             while (hardcount < max
6672                     && scan + 1 < loceol
6673                     && UCHARAT(scan) == high
6674                     && UCHARAT(scan + 1) == low)
6675             {
6676                 scan += 2;
6677                 hardcount++;
6678             }
6679         }
6680         break;
6681
6682     case EXACTFA:
6683         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6684         goto do_exactf;
6685
6686     case EXACTFL:
6687         PL_reg_flags |= RF_tainted;
6688         utf8_flags = FOLDEQ_UTF8_LOCALE;
6689         goto do_exactf;
6690
6691     case EXACTF:
6692             utf8_flags = 0;
6693             goto do_exactf;
6694
6695     case EXACTFU_SS:
6696     case EXACTFU_TRICKYFOLD:
6697     case EXACTFU:
6698         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6699
6700     do_exactf: {
6701         int c1, c2;
6702         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6703
6704         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6705
6706         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6707             if (c1 == CHRTEST_VOID) {
6708                 /* Use full Unicode fold matching */
6709                 char *tmpeol = PL_regeol;
6710                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6711                 while (hardcount < max
6712                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6713                                              STRING(p), NULL, pat_len,
6714                                              cBOOL(UTF_PATTERN), utf8_flags))
6715                 {
6716                     scan = tmpeol;
6717                     tmpeol = PL_regeol;
6718                     hardcount++;
6719                 }
6720             }
6721             else if (utf8_target) {
6722                 if (c1 == c2) {
6723                     while (scan < loceol
6724                            && hardcount < max
6725                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6726                     {
6727                         scan += UTF8SKIP(scan);
6728                         hardcount++;
6729                     }
6730                 }
6731                 else {
6732                     while (scan < loceol
6733                            && hardcount < max
6734                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6735                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6736                     {
6737                         scan += UTF8SKIP(scan);
6738                         hardcount++;
6739                     }
6740                 }
6741             }
6742             else if (c1 == c2) {
6743                 while (scan < loceol && UCHARAT(scan) == c1) {
6744                     scan++;
6745                 }
6746             }
6747             else {
6748                 while (scan < loceol &&
6749                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6750                 {
6751                     scan++;
6752                 }
6753             }
6754         }
6755         break;
6756     }
6757     case ANYOF:
6758         if (utf8_target) {
6759             STRLEN inclasslen;
6760             while (hardcount < max
6761                    && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6762                    && reginclass(prog, p, (U8*)scan, utf8_target))
6763             {
6764                 scan += inclasslen;
6765                 hardcount++;
6766             }
6767         } else {
6768             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6769                 scan++;
6770         }
6771         break;
6772     case ALNUMU:
6773         if (utf8_target) {
6774     utf8_wordchar:
6775             LOAD_UTF8_CHARCLASS_ALNUM();
6776             while (hardcount < max && scan < loceol &&
6777                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6778             {
6779                 scan += UTF8SKIP(scan);
6780                 hardcount++;
6781             }
6782         } else {
6783             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6784                 scan++;
6785             }
6786         }
6787         break;
6788     case ALNUM:
6789         if (utf8_target)
6790             goto utf8_wordchar;
6791         while (scan < loceol && isALNUM((U8) *scan)) {
6792             scan++;
6793         }
6794         break;
6795     case ALNUMA:
6796         if (utf8_target && scan + max < loceol) {
6797
6798             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6799              * since here, to match, 1 char == 1 byte */
6800             loceol = scan + max;
6801         }
6802         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6803             scan++;
6804         }
6805         break;
6806     case ALNUML:
6807         PL_reg_flags |= RF_tainted;
6808         if (utf8_target) {
6809             while (hardcount < max && scan < loceol &&
6810                    isALNUM_LC_utf8((U8*)scan)) {
6811                 scan += UTF8SKIP(scan);
6812                 hardcount++;
6813             }
6814         } else {
6815             while (scan < loceol && isALNUM_LC(*scan))
6816                 scan++;
6817         }
6818         break;
6819     case NALNUMU:
6820         if (utf8_target) {
6821
6822     utf8_Nwordchar:
6823
6824             LOAD_UTF8_CHARCLASS_ALNUM();
6825             while (hardcount < max && scan < loceol &&
6826                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6827             {
6828                 scan += UTF8SKIP(scan);
6829                 hardcount++;
6830             }
6831         } else {
6832             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6833                 scan++;
6834             }
6835         }
6836         break;
6837     case NALNUM:
6838         if (utf8_target)
6839             goto utf8_Nwordchar;
6840         while (scan < loceol && ! isALNUM((U8) *scan)) {
6841             scan++;
6842         }
6843         break;
6844
6845     case POSIXA:
6846         if (utf8_target && scan + max < loceol) {
6847
6848             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6849              * since here, to match, 1 char == 1 byte */
6850             loceol = scan + max;
6851         }
6852         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6853             scan++;
6854         }
6855         break;
6856     case NPOSIXA:
6857         if (utf8_target) {
6858             while (scan < loceol && hardcount < max
6859                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6860             {
6861                 scan += UTF8SKIP(scan);
6862                 hardcount++;
6863             }
6864         }
6865         else {
6866             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6867                 scan++;
6868             }
6869         }
6870         break;
6871     case NALNUMA:
6872         if (utf8_target) {
6873             while (scan < loceol && hardcount < max
6874                    && ! isWORDCHAR_A((U8) *scan))
6875             {
6876                 scan += UTF8SKIP(scan);
6877                 hardcount++;
6878             }
6879         }
6880         else {
6881             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6882                 scan++;
6883             }
6884         }
6885         break;
6886     case NALNUML:
6887         PL_reg_flags |= RF_tainted;
6888         if (utf8_target) {
6889             while (hardcount < max && scan < loceol &&
6890                    !isALNUM_LC_utf8((U8*)scan)) {
6891                 scan += UTF8SKIP(scan);
6892                 hardcount++;
6893             }
6894         } else {
6895             while (scan < loceol && !isALNUM_LC(*scan))
6896                 scan++;
6897         }
6898         break;
6899     case SPACEU:
6900         if (utf8_target) {
6901
6902     utf8_space:
6903
6904             LOAD_UTF8_CHARCLASS_SPACE();
6905             while (hardcount < max && scan < loceol &&
6906                    (*scan == ' ' ||
6907                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6908             {
6909                 scan += UTF8SKIP(scan);
6910                 hardcount++;
6911             }
6912             break;
6913         }
6914         else {
6915             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6916                 scan++;
6917             }
6918             break;
6919         }
6920     case SPACE:
6921         if (utf8_target)
6922             goto utf8_space;
6923
6924         while (scan < loceol && isSPACE((U8) *scan)) {
6925             scan++;
6926         }
6927         break;
6928     case SPACEA:
6929         if (utf8_target && scan + max < loceol) {
6930
6931             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6932              * since here, to match, 1 char == 1 byte */
6933             loceol = scan + max;
6934         }
6935         while (scan < loceol && isSPACE_A((U8) *scan)) {
6936             scan++;
6937         }
6938         break;
6939     case SPACEL:
6940         PL_reg_flags |= RF_tainted;
6941         if (utf8_target) {
6942             while (hardcount < max && scan < loceol &&
6943                    isSPACE_LC_utf8((U8*)scan)) {
6944                 scan += UTF8SKIP(scan);
6945                 hardcount++;
6946             }
6947         } else {
6948             while (scan < loceol && isSPACE_LC(*scan))
6949                 scan++;
6950         }
6951         break;
6952     case NSPACEU:
6953         if (utf8_target) {
6954
6955     utf8_Nspace:
6956
6957             LOAD_UTF8_CHARCLASS_SPACE();
6958             while (hardcount < max && scan < loceol &&
6959                    ! (*scan == ' ' ||
6960                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6961             {
6962                 scan += UTF8SKIP(scan);
6963                 hardcount++;
6964             }
6965             break;
6966         }
6967         else {
6968             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6969                 scan++;
6970             }
6971         }
6972         break;
6973     case NSPACE:
6974         if (utf8_target)
6975             goto utf8_Nspace;
6976
6977         while (scan < loceol && ! isSPACE((U8) *scan)) {
6978             scan++;
6979         }
6980         break;
6981     case NSPACEA:
6982         if (utf8_target) {
6983             while (hardcount < max && scan < loceol
6984                    && ! isSPACE_A((U8) *scan))
6985             {
6986                 scan += UTF8SKIP(scan);
6987                 hardcount++;
6988             }
6989         }
6990         else {
6991             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6992                 scan++;
6993             }
6994         }
6995         break;
6996     case NSPACEL:
6997         PL_reg_flags |= RF_tainted;
6998         if (utf8_target) {
6999             while (hardcount < max && scan < loceol &&
7000                    !isSPACE_LC_utf8((U8*)scan)) {
7001                 scan += UTF8SKIP(scan);
7002                 hardcount++;
7003             }
7004         } else {
7005             while (scan < loceol && !isSPACE_LC(*scan))
7006                 scan++;
7007         }
7008         break;
7009     case DIGIT:
7010         if (utf8_target) {
7011             LOAD_UTF8_CHARCLASS_DIGIT();
7012             while (hardcount < max && scan < loceol &&
7013                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7014                 scan += UTF8SKIP(scan);
7015                 hardcount++;
7016             }
7017         } else {
7018             while (scan < loceol && isDIGIT(*scan))
7019                 scan++;
7020         }
7021         break;
7022     case DIGITA:
7023         if (utf8_target && scan + max < loceol) {
7024
7025             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7026              * since here, to match, 1 char == 1 byte */
7027             loceol = scan + max;
7028         }
7029         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7030             scan++;
7031         }
7032         break;
7033     case DIGITL:
7034         PL_reg_flags |= RF_tainted;
7035         if (utf8_target) {
7036             while (hardcount < max && scan < loceol &&
7037                    isDIGIT_LC_utf8((U8*)scan)) {
7038                 scan += UTF8SKIP(scan);
7039                 hardcount++;
7040             }
7041         } else {
7042             while (scan < loceol && isDIGIT_LC(*scan))
7043                 scan++;
7044         }
7045         break;
7046     case NDIGIT:
7047         if (utf8_target) {
7048             LOAD_UTF8_CHARCLASS_DIGIT();
7049             while (hardcount < max && scan < loceol &&
7050                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7051                 scan += UTF8SKIP(scan);
7052                 hardcount++;
7053             }
7054         } else {
7055             while (scan < loceol && !isDIGIT(*scan))
7056                 scan++;
7057         }
7058         break;
7059     case NDIGITA:
7060         if (utf8_target) {
7061             while (hardcount < max && scan < loceol
7062                    && ! isDIGIT_A((U8) *scan)) {
7063                 scan += UTF8SKIP(scan);
7064                 hardcount++;
7065             }
7066         }
7067         else {
7068             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7069                 scan++;
7070             }
7071         }
7072         break;
7073     case NDIGITL:
7074         PL_reg_flags |= RF_tainted;
7075         if (utf8_target) {
7076             while (hardcount < max && scan < loceol &&
7077                    !isDIGIT_LC_utf8((U8*)scan)) {
7078                 scan += UTF8SKIP(scan);
7079                 hardcount++;
7080             }
7081         } else {
7082             while (scan < loceol && !isDIGIT_LC(*scan))
7083                 scan++;
7084         }
7085         break;
7086     case LNBREAK:
7087         if (utf8_target) {
7088             while (hardcount < max && scan < loceol &&
7089                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7090                 scan += c;
7091                 hardcount++;
7092             }
7093         } else {
7094             /* LNBREAK can match one or two latin chars, which is ok, but we
7095              * have to use hardcount in this situation, and throw away the
7096              * adjustment to <loceol> done before the switch statement */
7097             loceol = PL_regeol;
7098             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7099                 scan+=c;
7100                 hardcount++;
7101             }
7102         }
7103         break;
7104     case HORIZWS:
7105         if (utf8_target) {
7106             while (hardcount < max && scan < loceol &&
7107                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7108             {
7109                 scan += c;
7110                 hardcount++;
7111             }
7112         } else {
7113             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7114                 scan++;
7115         }
7116         break;
7117     case NHORIZWS:
7118         if (utf8_target) {
7119             while (hardcount < max && scan < loceol &&
7120                         !is_HORIZWS_utf8_safe(scan, loceol))
7121             {
7122                 scan += UTF8SKIP(scan);
7123                 hardcount++;
7124             }
7125         } else {
7126             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7127                 scan++;
7128
7129         }
7130         break;
7131     case VERTWS:
7132         if (utf8_target) {
7133             while (hardcount < max && scan < loceol &&
7134                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7135             {
7136                 scan += c;
7137                 hardcount++;
7138             }
7139         } else {
7140             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7141                 scan++;
7142
7143         }
7144         break;
7145     case NVERTWS:
7146         if (utf8_target) {
7147             while (hardcount < max && scan < loceol &&
7148                                 !is_VERTWS_utf8_safe(scan, loceol))
7149             {
7150                 scan += UTF8SKIP(scan);
7151                 hardcount++;
7152             }
7153         } else {
7154             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7155                 scan++;
7156
7157         }
7158         break;
7159
7160     case BOUND:
7161     case BOUNDA:
7162     case BOUNDL:
7163     case BOUNDU:
7164     case EOS:
7165     case GPOS:
7166     case KEEPS:
7167     case NBOUND:
7168     case NBOUNDA:
7169     case NBOUNDL:
7170     case NBOUNDU:
7171     case OPFAIL:
7172     case SBOL:
7173     case SEOL:
7174         /* These are all 0 width, so match right here or not at all. */
7175         break;
7176
7177     default:
7178         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7179         assert(0); /* NOTREACHED */
7180
7181     }
7182
7183     if (hardcount)
7184         c = hardcount;
7185     else
7186         c = scan - *startposp;
7187     *startposp = scan;
7188
7189     DEBUG_r({
7190         GET_RE_DEBUG_FLAGS_DECL;
7191         DEBUG_EXECUTE_r({
7192             SV * const prop = sv_newmortal();
7193             regprop(prog, prop, p);
7194             PerlIO_printf(Perl_debug_log,
7195                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7196                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7197         });
7198     });
7199
7200     return(c);
7201 }
7202
7203
7204 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7205 /*
7206 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7207 create a copy so that changes the caller makes won't change the shared one.
7208 If <altsvp> is non-null, will return NULL in it, for back-compat.
7209  */
7210 SV *
7211 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7212 {
7213     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7214
7215     if (altsvp) {
7216         *altsvp = NULL;
7217     }
7218
7219     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7220 }
7221 #endif
7222
7223 STATIC SV *
7224 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7225 {
7226     /* Returns the swash for the input 'node' in the regex 'prog'.
7227      * If <doinit> is true, will attempt to create the swash if not already
7228      *    done.
7229      * If <listsvp> is non-null, will return the swash initialization string in
7230      *    it.
7231      * Tied intimately to how regcomp.c sets up the data structure */
7232
7233     dVAR;
7234     SV *sw  = NULL;
7235     SV *si  = NULL;
7236     SV*  invlist = NULL;
7237
7238     RXi_GET_DECL(prog,progi);
7239     const struct reg_data * const data = prog ? progi->data : NULL;
7240
7241     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7242
7243     assert(ANYOF_NONBITMAP(node));
7244
7245     if (data && data->count) {
7246         const U32 n = ARG(node);
7247
7248         if (data->what[n] == 's') {
7249             SV * const rv = MUTABLE_SV(data->data[n]);
7250             AV * const av = MUTABLE_AV(SvRV(rv));
7251             SV **const ary = AvARRAY(av);
7252             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7253
7254             si = *ary;  /* ary[0] = the string to initialize the swash with */
7255
7256             /* Elements 2 and 3 are either both present or both absent. [2] is
7257              * any inversion list generated at compile time; [3] indicates if
7258              * that inversion list has any user-defined properties in it. */
7259             if (av_len(av) >= 2) {
7260                 invlist = ary[2];
7261                 if (SvUV(ary[3])) {
7262                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7263                 }
7264             }
7265             else {
7266                 invlist = NULL;
7267             }
7268
7269             /* Element [1] is reserved for the set-up swash.  If already there,
7270              * return it; if not, create it and store it there */
7271             if (SvROK(ary[1])) {
7272                 sw = ary[1];
7273             }
7274             else if (si && doinit) {
7275
7276                 sw = _core_swash_init("utf8", /* the utf8 package */
7277                                       "", /* nameless */
7278                                       si,
7279                                       1, /* binary */
7280                                       0, /* not from tr/// */
7281                                       invlist,
7282                                       &swash_init_flags);
7283                 (void)av_store(av, 1, sw);
7284             }
7285         }
7286     }
7287
7288     if (listsvp) {
7289         SV* matches_string = newSVpvn("", 0);
7290
7291         /* Use the swash, if any, which has to have incorporated into it all
7292          * possibilities */
7293         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7294             && (si && si != &PL_sv_undef))
7295         {
7296
7297             /* If no swash, use the input initialization string, if available */
7298             sv_catsv(matches_string, si);
7299         }
7300
7301         /* Add the inversion list to whatever we have.  This may have come from
7302          * the swash, or from an input parameter */
7303         if (invlist) {
7304             sv_catsv(matches_string, _invlist_contents(invlist));
7305         }
7306         *listsvp = matches_string;
7307     }
7308
7309     return sw;
7310 }
7311
7312 /*
7313  - reginclass - determine if a character falls into a character class
7314
7315   n is the ANYOF regnode
7316   p is the target string
7317   utf8_target tells whether p is in UTF-8.
7318
7319   Returns true if matched; false otherwise.
7320
7321   Note that this can be a synthetic start class, a combination of various
7322   nodes, so things you think might be mutually exclusive, such as locale,
7323   aren't.  It can match both locale and non-locale
7324
7325  */
7326
7327 STATIC bool
7328 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
7329 {
7330     dVAR;
7331     const char flags = ANYOF_FLAGS(n);
7332     bool match = FALSE;
7333     UV c = *p;
7334
7335     PERL_ARGS_ASSERT_REGINCLASS;
7336
7337     /* If c is not already the code point, get it.  Note that
7338      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7339     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7340         STRLEN c_len = 0;
7341         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7342                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7343                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7344                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7345                  * UTF8_ALLOW_FFFF */
7346         if (c_len == (STRLEN)-1)
7347             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7348     }
7349
7350     /* If this character is potentially in the bitmap, check it */
7351     if (c < 256) {
7352         if (ANYOF_BITMAP_TEST(n, c))
7353             match = TRUE;
7354         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7355                 && ! utf8_target
7356                 && ! isASCII(c))
7357         {
7358             match = TRUE;
7359         }
7360         else if (flags & ANYOF_LOCALE) {
7361             PL_reg_flags |= RF_tainted;
7362
7363             if ((flags & ANYOF_LOC_FOLD)
7364                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7365             {
7366                 match = TRUE;
7367             }
7368             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7369                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7370                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7371                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7372                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7373                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7374                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7375                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7376                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7377                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7378                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7379                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7380                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7381                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7382                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7383                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7384                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7385                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7386                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7387                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7388                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7389                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7390                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7391                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7392                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7393                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7394                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7395                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7396                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7397                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7398                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7399                      ) /* How's that for a conditional? */
7400             ) {
7401                 match = TRUE;
7402             }
7403         }
7404     }
7405
7406     /* If the bitmap didn't (or couldn't) match, and something outside the
7407      * bitmap could match, try that.  Locale nodes specify completely the
7408      * behavior of code points in the bit map (otherwise, a utf8 target would
7409      * cause them to be treated as Unicode and not locale), except in
7410      * the very unlikely event when this node is a synthetic start class, which
7411      * could be a combination of locale and non-locale nodes.  So allow locale
7412      * to match for the synthetic start class, which will give a false
7413      * positive that will be resolved when the match is done again as not part
7414      * of the synthetic start class */
7415     if (!match) {
7416         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7417             match = TRUE;       /* Everything above 255 matches */
7418         }
7419         else if (ANYOF_NONBITMAP(n)
7420                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7421                      || (utf8_target
7422                          && (c >=256
7423                              || (! (flags & ANYOF_LOCALE))
7424                              || (flags & ANYOF_IS_SYNTHETIC)))))
7425         {
7426             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7427             if (sw) {
7428                 U8 * utf8_p;
7429                 if (utf8_target) {
7430                     utf8_p = (U8 *) p;
7431                 } else { /* Convert to utf8 */
7432                     STRLEN len = 1;
7433                     utf8_p = bytes_to_utf8(p, &len);
7434                 }
7435
7436                 if (swash_fetch(sw, utf8_p, TRUE)) {
7437                     match = TRUE;
7438                 }
7439
7440                 /* If we allocated a string above, free it */
7441                 if (! utf8_target) Safefree(utf8_p);
7442             }
7443         }
7444
7445         if (UNICODE_IS_SUPER(c)
7446             && (flags & ANYOF_WARN_SUPER)
7447             && ckWARN_d(WARN_NON_UNICODE))
7448         {
7449             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7450                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7451         }
7452     }
7453
7454     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7455     return cBOOL(flags & ANYOF_INVERT) ^ match;
7456 }
7457
7458 STATIC U8 *
7459 S_reghop3(U8 *s, I32 off, const U8* lim)
7460 {
7461     /* return the position 'off' UTF-8 characters away from 's', forward if
7462      * 'off' >= 0, backwards if negative.  But don't go outside of position
7463      * 'lim', which better be < s  if off < 0 */
7464
7465     dVAR;
7466
7467     PERL_ARGS_ASSERT_REGHOP3;
7468
7469     if (off >= 0) {
7470         while (off-- && s < lim) {
7471             /* XXX could check well-formedness here */
7472             s += UTF8SKIP(s);
7473         }
7474     }
7475     else {
7476         while (off++ && s > lim) {
7477             s--;
7478             if (UTF8_IS_CONTINUED(*s)) {
7479                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7480                     s--;
7481             }
7482             /* XXX could check well-formedness here */
7483         }
7484     }
7485     return s;
7486 }
7487
7488 #ifdef XXX_dmq
7489 /* there are a bunch of places where we use two reghop3's that should
7490    be replaced with this routine. but since thats not done yet
7491    we ifdef it out - dmq
7492 */
7493 STATIC U8 *
7494 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7495 {
7496     dVAR;
7497
7498     PERL_ARGS_ASSERT_REGHOP4;
7499
7500     if (off >= 0) {
7501         while (off-- && s < rlim) {
7502             /* XXX could check well-formedness here */
7503             s += UTF8SKIP(s);
7504         }
7505     }
7506     else {
7507         while (off++ && s > llim) {
7508             s--;
7509             if (UTF8_IS_CONTINUED(*s)) {
7510                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7511                     s--;
7512             }
7513             /* XXX could check well-formedness here */
7514         }
7515     }
7516     return s;
7517 }
7518 #endif
7519
7520 STATIC U8 *
7521 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7522 {
7523     dVAR;
7524
7525     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7526
7527     if (off >= 0) {
7528         while (off-- && s < lim) {
7529             /* XXX could check well-formedness here */
7530             s += UTF8SKIP(s);
7531         }
7532         if (off >= 0)
7533             return NULL;
7534     }
7535     else {
7536         while (off++ && s > lim) {
7537             s--;
7538             if (UTF8_IS_CONTINUED(*s)) {
7539                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7540                     s--;
7541             }
7542             /* XXX could check well-formedness here */
7543         }
7544         if (off <= 0)
7545             return NULL;
7546     }
7547     return s;
7548 }
7549
7550 static void
7551 restore_pos(pTHX_ void *arg)
7552 {
7553     dVAR;
7554     regexp * const rex = (regexp *)arg;
7555     if (PL_reg_state.re_state_eval_setup_done) {
7556         if (PL_reg_oldsaved) {
7557             rex->subbeg = PL_reg_oldsaved;
7558             rex->sublen = PL_reg_oldsavedlen;
7559             rex->suboffset = PL_reg_oldsavedoffset;
7560             rex->subcoffset = PL_reg_oldsavedcoffset;
7561 #ifdef PERL_OLD_COPY_ON_WRITE
7562             rex->saved_copy = PL_nrs;
7563 #endif
7564             RXp_MATCH_COPIED_on(rex);
7565         }
7566         PL_reg_magic->mg_len = PL_reg_oldpos;
7567         PL_reg_state.re_state_eval_setup_done = FALSE;
7568         PL_curpm = PL_reg_oldcurpm;
7569     }
7570 }
7571
7572 STATIC void
7573 S_to_utf8_substr(pTHX_ register regexp *prog)
7574 {
7575     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7576      * on the converted value */
7577
7578     int i = 1;
7579
7580     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7581
7582     do {
7583         if (prog->substrs->data[i].substr
7584             && !prog->substrs->data[i].utf8_substr) {
7585             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7586             prog->substrs->data[i].utf8_substr = sv;
7587             sv_utf8_upgrade(sv);
7588             if (SvVALID(prog->substrs->data[i].substr)) {
7589                 if (SvTAIL(prog->substrs->data[i].substr)) {
7590                     /* Trim the trailing \n that fbm_compile added last
7591                        time.  */
7592                     SvCUR_set(sv, SvCUR(sv) - 1);
7593                     /* Whilst this makes the SV technically "invalid" (as its
7594                        buffer is no longer followed by "\0") when fbm_compile()
7595                        adds the "\n" back, a "\0" is restored.  */
7596                     fbm_compile(sv, FBMcf_TAIL);
7597                 } else
7598                     fbm_compile(sv, 0);
7599             }
7600             if (prog->substrs->data[i].substr == prog->check_substr)
7601                 prog->check_utf8 = sv;
7602         }
7603     } while (i--);
7604 }
7605
7606 STATIC bool
7607 S_to_byte_substr(pTHX_ register regexp *prog)
7608 {
7609     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7610      * on the converted value; returns FALSE if can't be converted. */
7611
7612     dVAR;
7613     int i = 1;
7614
7615     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7616
7617     do {
7618         if (prog->substrs->data[i].utf8_substr
7619             && !prog->substrs->data[i].substr) {
7620             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7621             if (! sv_utf8_downgrade(sv, TRUE)) {
7622                 return FALSE;
7623             }
7624             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7625                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7626                     /* Trim the trailing \n that fbm_compile added last
7627                         time.  */
7628                     SvCUR_set(sv, SvCUR(sv) - 1);
7629                     fbm_compile(sv, FBMcf_TAIL);
7630                 } else
7631                     fbm_compile(sv, 0);
7632             }
7633             prog->substrs->data[i].substr = sv;
7634             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7635                 prog->check_substr = sv;
7636         }
7637     } while (i--);
7638
7639     return TRUE;
7640 }
7641
7642 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7643  * pre-composed Hangul syllables, which are all in a contiguous block and
7644  * arranged there in such a way so as to facilitate alorithmic determination of
7645  * their characteristics.  As such, they don't need a swash, but can be
7646  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7647  * is a GCB=LV */
7648 #define SBASE 0xAC00    /* Start of block */
7649 #define SCount 11172    /* Length of block */
7650 #define TCount 28
7651
7652 #if 0   /* This routine is not currently used */
7653 PERL_STATIC_INLINE bool
7654 S_is_utf8_X_LV(pTHX_ const U8 *p)
7655 {
7656     /* Unlike most other similarly named routines here, this does not create a
7657      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7658
7659     dVAR;
7660
7661     UV cp = valid_utf8_to_uvchr(p, NULL);
7662
7663     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7664
7665     /* The earliest Unicode releases did not have these precomposed Hangul
7666      * syllables.  Set to point to undef in that case, so will return false on
7667      * every call */
7668     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7669         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7670         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7671             SvREFCNT_dec(PL_utf8_X_LV);
7672             PL_utf8_X_LV = &PL_sv_undef;
7673         }
7674     }
7675
7676     return (PL_utf8_X_LV != &PL_sv_undef
7677             && cp >= SBASE && cp < SBASE + SCount
7678             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7679 }
7680 #endif
7681
7682 PERL_STATIC_INLINE bool
7683 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7684 {
7685     /* Unlike most other similarly named routines here, this does not create a
7686      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7687
7688     dVAR;
7689
7690     UV cp = valid_utf8_to_uvchr(p, NULL);
7691
7692     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7693
7694     /* The earliest Unicode releases did not have these precomposed Hangul
7695      * syllables.  Set to point to undef in that case, so will return false on
7696      * every call */
7697     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7698         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7699         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7700             SvREFCNT_dec(PL_utf8_X_LVT);
7701             PL_utf8_X_LVT = &PL_sv_undef;
7702         }
7703     }
7704
7705     return (PL_utf8_X_LVT != &PL_sv_undef
7706             && cp >= SBASE && cp < SBASE + SCount
7707             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7708 }
7709
7710 /*
7711  * Local variables:
7712  * c-indentation-style: bsd
7713  * c-basic-offset: 4
7714  * indent-tabs-mode: nil
7715  * End:
7716  *
7717  * ex: set ts=8 sts=4 sw=4 et:
7718  */