regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define RF_tainted      1       /* tainted information used? e.g. locale */
  97 #define RF_warned       2               /* warned about big count? */
  98
  99 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 100
 101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 102
 103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 104
 105 #ifndef STATIC
 106 #define STATIC  static
 107 #endif
 108
 109 /* Valid for non-utf8 strings: avoids the reginclass
 110  * call if there are no complications: i.e., if everything matchable is
 111  * straight forward in the bitmap */
 112 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 113                                               : ANYOF_BITMAP_TEST(p,*(c)))
 114
 115 /*
 116  * Forwards.
 117  */
 118
 119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 121
 122 #define HOPc(pos,off) \
 123         (char *)(PL_reg_match_utf8 \
 124             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 125             : (U8*)(pos + off))
 126 #define HOPBACKc(pos, off) \
 127         (char*)(PL_reg_match_utf8\
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 129             : (pos - off >= PL_bostr)           \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136
 137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 138 #define NEXTCHR_IS_EOS (nextchr < 0)
 139
 140 #define SET_nextchr \
 141     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 142
 143 #define SET_locinput(p) \
 144     locinput = (p);  \
 145     SET_nextchr
 146
 147
 148 /* these are unrolled below in the CCC_TRY_XXX defined */
 149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 150     if (!CAT2(PL_utf8_,class)) { \
 151         bool ok; \
 152         ENTER; save_re_context(); \
 153         ok=CAT2(is_utf8_,class)((const U8*)str); \
 154         PERL_UNUSED_VAR(ok); \
 155         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 156 /* Doesn't do an assert to verify that is correct */
 157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 158     if (!CAT2(PL_utf8_,class)) { \
 159         bool throw_away PERL_UNUSED_DECL; \
 160         ENTER; save_re_context(); \
 161         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 162         PERL_UNUSED_VAR(throw_away); \
 163         LEAVE; } } STMT_END
 164
 165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 166 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 167
 168 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 169         /* No asserts are done for some of these, in case called on a   */  \
 170         /* Unicode version in which they map to nothing */                  \
 171         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                  \
 172         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 173
 174 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 175
 176 /* The actual code for CCC_TRY, which uses several variables from the routine
 177  * it's callable from.  It is designed to be the bulk of a case statement.
 178  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 179  *      nextchr matches the class.
 180  * UTF8_TEST is the whole test string to use for utf8 targets
 181  * LOAD is what to use to test, and if not present to load in the swash for the
 182  *      class
 183  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 184  *      UTF8_TEST test.
 185  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 186  * utf8 and a variant, load the swash if necessary and test using the utf8
 187  * test.  Advance to the next character if test is ok, otherwise fail; If not
 188  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 189  * fails, or advance to the next character */
 190
 191 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 192     if (NEXTCHR_IS_EOS) {                                                     \
 193         sayNO;                                                                \
 194     }                                                                         \
 195     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 196         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 197         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 198             sayNO;                                                            \
 199         }                                                                     \
 200     }                                                                         \
 201     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 202             sayNO;                                                            \
 203     }                                                                         \
 204     goto increment_locinput;
 205
 206 /* Handle the non-locale cases for a character class and its complement.  It
 207  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 208  * This is because that code fails when the test succeeds, so we want to have
 209  * the test fail so that the code succeeds.  The swash is stored in a
 210  * predictable PL_ place */
 211 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 212                            CLASS, STR)                                        \
 213     case NAME:                                                                \
 214         _CCC_TRY_CODE( !, FUNC,                                               \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218     case NNAME:                                                               \
 219         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 220                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 221                                             (U8*)locinput, TRUE)),            \
 222                           CLASS, STR)
 223 /* Generate the case statements for both locale and non-locale character
 224  * classes in regmatch for classes that don't have special unicode semantics.
 225  * Locales don't use an immediate swash, but an intermediary special locale
 226  * function that is called on the pointer to the current place in the input
 227  * string.  That function will resolve to needing the same swash.  One might
 228  * think that because we don't know what the locale will match, we shouldn't
 229  * check with the swash loading function that it loaded properly; ie, that we
 230  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 231  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 232  * irrelevant here */
 233 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 234                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 235                 NAMEA, NNAMEA, FUNCA,                                         \
 236                 CLASS, STR)                                                   \
 237     case NAMEL:                                                               \
 238         PL_reg_flags |= RF_tainted;                                           \
 239         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 240     case NNAMEL:                                                              \
 241         PL_reg_flags |= RF_tainted;                                           \
 242         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 243                        CLASS, STR)                                            \
 244     case NAMEA:                                                               \
 245         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 246             sayNO;                                                            \
 247         }                                                                     \
 248         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 249         locinput++;                                        \
 250         break;                                                                \
 251     case NNAMEA:                                                              \
 252         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 253             sayNO;                                                            \
 254         }                                                                     \
 255         goto increment_locinput;                                              \
 256     /* Generate the non-locale cases */                                       \
 257     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 258
 259 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 260  * statements to handle separate Unicode semantics nodes */
 261 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 262                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 263                   NAMEU, NNAMEU, FUNCU,                                        \
 264                   NAMEA, NNAMEA, FUNCA,                                        \
 265                   CLASS, STR)                                                  \
 266     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 267             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 268             NAMEA, NNAMEA, FUNCA,                                              \
 269             CLASS, STR)                                                        \
 270     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 271
 272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 273
 274 /* for use after a quantifier and before an EXACT-like node -- japhy */
 275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 276  *
 277  * NOTE that *nothing* that affects backtracking should be in here, specifically
 278  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 279  * node that is in between two EXACT like nodes when ascertaining what the required
 280  * "follow" character is. This should probably be moved to regex compile time
 281  * although it may be done at run time beause of the REF possibility - more
 282  * investigation required. -- demerphq
 283 */
 284 #define JUMPABLE(rn) (      \
 285     OP(rn) == OPEN ||       \
 286     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 287     OP(rn) == EVAL ||   \
 288     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 289     OP(rn) == PLUS || OP(rn) == MINMOD || \
 290     OP(rn) == KEEPS || \
 291     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 292 )
 293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 294
 295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 296
 297 #if 0
 298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 299    we don't need this definition. */
 300 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 301 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 303
 304 #else
 305 /* ... so we use this as its faster. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 307 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 308 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 309 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 310
 311 #endif
 312
 313 /*
 314   Search for mandatory following text node; for lookahead, the text must
 315   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 316 */
 317 #define FIND_NEXT_IMPT(rn) STMT_START { \
 318     while (JUMPABLE(rn)) { \
 319         const OPCODE type = OP(rn); \
 320         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 321             rn = NEXTOPER(NEXTOPER(rn)); \
 322         else if (type == PLUS) \
 323             rn = NEXTOPER(rn); \
 324         else if (type == IFMATCH) \
 325             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 326         else rn += NEXT_OFF(rn); \
 327     } \
 328 } STMT_END
 329
 330
 331 static void restore_pos(pTHX_ void *arg);
 332
 333 #define REGCP_PAREN_ELEMS 3
 334 #define REGCP_OTHER_ELEMS 3
 335 #define REGCP_FRAME_ELEMS 1
 336 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 337  * are needed for the regexp context stack bookkeeping. */
 338
 339 STATIC CHECKPOINT
 340 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 341 {
 342     dVAR;
 343     const int retval = PL_savestack_ix;
 344     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 345     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 346     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 347     I32 p;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     PERL_ARGS_ASSERT_REGCPPUSH;
 351
 352     if (paren_elems_to_push < 0)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 354                    paren_elems_to_push);
 355
 356     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 357         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 358                    " out of range (%lu-%ld)",
 359                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 360
 361     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 362
 363     DEBUG_BUFFERS_r(
 364         if ((int)PL_regsize > (int)parenfloor)
 365             PerlIO_printf(Perl_debug_log,
 366                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 367                 PTR2UV(rex),
 368                 PTR2UV(rex->offs)
 369             );
 370     );
 371     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 372 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 373         SSPUSHINT(rex->offs[p].end);
 374         SSPUSHINT(rex->offs[p].start);
 375         SSPUSHINT(rex->offs[p].start_tmp);
 376         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 377             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 378             (UV)p,
 379             (IV)rex->offs[p].start,
 380             (IV)rex->offs[p].start_tmp,
 381             (IV)rex->offs[p].end
 382         ));
 383     }
 384 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 385     SSPUSHINT(PL_regsize);
 386     SSPUSHINT(rex->lastparen);
 387     SSPUSHINT(rex->lastcloseparen);
 388     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 389
 390     return retval;
 391 }
 392
 393 /* These are needed since we do not localize EVAL nodes: */
 394 #define REGCP_SET(cp)                                           \
 395     DEBUG_STATE_r(                                              \
 396             PerlIO_printf(Perl_debug_log,                       \
 397                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 398                 (IV)PL_savestack_ix));                          \
 399     cp = PL_savestack_ix
 400
 401 #define REGCP_UNWIND(cp)                                        \
 402     DEBUG_STATE_r(                                              \
 403         if (cp != PL_savestack_ix)                              \
 404             PerlIO_printf(Perl_debug_log,                       \
 405                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 406                 (IV)(cp), (IV)PL_savestack_ix));                \
 407     regcpblow(cp)
 408
 409 #define UNWIND_PAREN(lp, lcp)               \
 410     for (n = rex->lastparen; n > lp; n--)   \
 411         rex->offs[n].end = -1;              \
 412     rex->lastparen = n;                     \
 413     rex->lastcloseparen = lcp;
 414
 415
 416 STATIC void
 417 S_regcppop(pTHX_ regexp *rex)
 418 {
 419     dVAR;
 420     UV i;
 421     U32 paren;
 422     GET_RE_DEBUG_FLAGS_DECL;
 423
 424     PERL_ARGS_ASSERT_REGCPPOP;
 425
 426     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 427     i = SSPOPUV;
 428     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 429     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 430     rex->lastcloseparen = SSPOPINT;
 431     rex->lastparen = SSPOPINT;
 432     PL_regsize = SSPOPINT;
 433
 434     i -= REGCP_OTHER_ELEMS;
 435     /* Now restore the parentheses context. */
 436     DEBUG_BUFFERS_r(
 437         if (i || rex->lastparen + 1 <= rex->nparens)
 438             PerlIO_printf(Perl_debug_log,
 439                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 440                 PTR2UV(rex),
 441                 PTR2UV(rex->offs)
 442             );
 443     );
 444     paren = PL_regsize;
 445     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 446         I32 tmps;
 447         rex->offs[paren].start_tmp = SSPOPINT;
 448         rex->offs[paren].start = SSPOPINT;
 449         tmps = SSPOPINT;
 450         if (paren <= rex->lastparen)
 451             rex->offs[paren].end = tmps;
 452         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 453             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 454             (UV)paren,
 455             (IV)rex->offs[paren].start,
 456             (IV)rex->offs[paren].start_tmp,
 457             (IV)rex->offs[paren].end,
 458             (paren > rex->lastparen ? "(skipped)" : ""));
 459         );
 460         paren--;
 461     }
 462 #if 1
 463     /* It would seem that the similar code in regtry()
 464      * already takes care of this, and in fact it is in
 465      * a better location to since this code can #if 0-ed out
 466      * but the code in regtry() is needed or otherwise tests
 467      * requiring null fields (pat.t#187 and split.t#{13,14}
 468      * (as of patchlevel 7877)  will fail.  Then again,
 469      * this code seems to be necessary or otherwise
 470      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 471      * --jhi updated by dapm */
 472     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 473         if (i > PL_regsize)
 474             rex->offs[i].start = -1;
 475         rex->offs[i].end = -1;
 476         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 477             "    \\%"UVuf": %s   ..-1 undeffing\n",
 478             (UV)i,
 479             (i > PL_regsize) ? "-1" : "  "
 480         ));
 481     }
 482 #endif
 483 }
 484
 485 /* restore the parens and associated vars at savestack position ix,
 486  * but without popping the stack */
 487
 488 STATIC void
 489 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 490 {
 491     I32 tmpix = PL_savestack_ix;
 492     PL_savestack_ix = ix;
 493     regcppop(rex);
 494     PL_savestack_ix = tmpix;
 495 }
 496
 497 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 498
 499 /*
 500  * pregexec and friends
 501  */
 502
 503 #ifndef PERL_IN_XSUB_RE
 504 /*
 505  - pregexec - match a regexp against a string
 506  */
 507 I32
 508 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 509          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 510 /* stringarg: the point in the string at which to begin matching */
 511 /* strend:    pointer to null at end of string */
 512 /* strbeg:    real beginning of string */
 513 /* minend:    end of match must be >= minend bytes after stringarg. */
 514 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 515  *            itself is accessed via the pointers above */
 516 /* nosave:    For optimizations. */
 517 {
 518     PERL_ARGS_ASSERT_PREGEXEC;
 519
 520     return
 521         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 522                       nosave ? 0 : REXEC_COPY_STR);
 523 }
 524 #endif
 525
 526 /*
 527  * Need to implement the following flags for reg_anch:
 528  *
 529  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 530  * USE_INTUIT_ML
 531  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 532  * INTUIT_AUTORITATIVE_ML
 533  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 534  * INTUIT_ONCE_ML
 535  *
 536  * Another flag for this function: SECOND_TIME (so that float substrs
 537  * with giant delta may be not rechecked).
 538  */
 539
 540 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 541
 542 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 543    Otherwise, only SvCUR(sv) is used to get strbeg. */
 544
 545 /* XXXX We assume that strpos is strbeg unless sv. */
 546
 547 /* XXXX Some places assume that there is a fixed substring.
 548         An update may be needed if optimizer marks as "INTUITable"
 549         RExen without fixed substrings.  Similarly, it is assumed that
 550         lengths of all the strings are no more than minlen, thus they
 551         cannot come from lookahead.
 552         (Or minlen should take into account lookahead.)
 553   NOTE: Some of this comment is not correct. minlen does now take account
 554   of lookahead/behind. Further research is required. -- demerphq
 555
 556 */
 557
 558 /* A failure to find a constant substring means that there is no need to make
 559    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 560    finding a substring too deep into the string means that less calls to
 561    regtry() should be needed.
 562
 563    REx compiler's optimizer found 4 possible hints:
 564         a) Anchored substring;
 565         b) Fixed substring;
 566         c) Whether we are anchored (beginning-of-line or \G);
 567         d) First node (of those at offset 0) which may distinguish positions;
 568    We use a)b)d) and multiline-part of c), and try to find a position in the
 569    string which does not contradict any of them.
 570  */
 571
 572 /* Most of decisions we do here should have been done at compile time.
 573    The nodes of the REx which we used for the search should have been
 574    deleted from the finite automaton. */
 575
 576 char *
 577 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 578                      char *strend, const U32 flags, re_scream_pos_data *data)
 579 {
 580     dVAR;
 581     struct regexp *const prog = ReANY(rx);
 582     I32 start_shift = 0;
 583     /* Should be nonnegative! */
 584     I32 end_shift   = 0;
 585     char *s;
 586     SV *check;
 587     char *strbeg;
 588     char *t;
 589     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 590     I32 ml_anch;
 591     char *other_last = NULL;    /* other substr checked before this */
 592     char *check_at = NULL;              /* check substr found at this pos */
 593     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 594     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 595     RXi_GET_DECL(prog,progi);
 596 #ifdef DEBUGGING
 597     const char * const i_strpos = strpos;
 598 #endif
 599     GET_RE_DEBUG_FLAGS_DECL;
 600
 601     PERL_ARGS_ASSERT_RE_INTUIT_START;
 602     PERL_UNUSED_ARG(flags);
 603     PERL_UNUSED_ARG(data);
 604
 605     RX_MATCH_UTF8_set(rx,utf8_target);
 606
 607     if (RX_UTF8(rx)) {
 608         PL_reg_flags |= RF_utf8;
 609     }
 610     DEBUG_EXECUTE_r(
 611         debug_start_match(rx, utf8_target, strpos, strend,
 612             sv ? "Guessing start of match in sv for"
 613                : "Guessing start of match in string for");
 614               );
 615
 616     /* CHR_DIST() would be more correct here but it makes things slow. */
 617     if (prog->minlen > strend - strpos) {
 618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 619                               "String too short... [re_intuit_start]\n"));
 620         goto fail;
 621     }
 622
 623     /* XXX we need to pass strbeg as a separate arg: the following is
 624      * guesswork and can be wrong... */
 625     if (sv && SvPOK(sv)) {
 626         char * p   = SvPVX(sv);
 627         STRLEN cur = SvCUR(sv);
 628         if (p <= strpos && strpos < p + cur) {
 629             strbeg = p;
 630             assert(p <= strend && strend <= p + cur);
 631         }
 632         else
 633             strbeg = strend - cur;
 634     }
 635     else
 636         strbeg = strpos;
 637
 638     PL_regeol = strend;
 639     if (utf8_target) {
 640         if (!prog->check_utf8 && prog->check_substr)
 641             to_utf8_substr(prog);
 642         check = prog->check_utf8;
 643     } else {
 644         if (!prog->check_substr && prog->check_utf8) {
 645             if (! to_byte_substr(prog)) {
 646                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 647             }
 648         }
 649         check = prog->check_substr;
 650     }
 651     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 652         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 653                      || ( (prog->extflags & RXf_ANCH_BOL)
 654                           && !multiline ) );    /* Check after \n? */
 655
 656         if (!ml_anch) {
 657           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 658                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 659                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 660                && sv && !SvROK(sv)
 661                && (strpos != strbeg)) {
 662               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 663               goto fail;
 664           }
 665           if (prog->check_offset_min == prog->check_offset_max
 666               && !(prog->extflags & RXf_CANY_SEEN)
 667               && ! multiline)   /* /m can cause \n's to match that aren't
 668                                    accounted for in the string max length.
 669                                    See [perl #115242] */
 670           {
 671             /* Substring at constant offset from beg-of-str... */
 672             I32 slen;
 673
 674             s = HOP3c(strpos, prog->check_offset_min, strend);
 675
 676             if (SvTAIL(check)) {
 677                 slen = SvCUR(check);    /* >= 1 */
 678
 679                 if ( strend - s > slen || strend - s < slen - 1
 680                      || (strend - s == slen && strend[-1] != '\n')) {
 681                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 682                     goto fail_finish;
 683                 }
 684                 /* Now should match s[0..slen-2] */
 685                 slen--;
 686                 if (slen && (*SvPVX_const(check) != *s
 687                              || (slen > 1
 688                                  && memNE(SvPVX_const(check), s, slen)))) {
 689                   report_neq:
 690                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 691                     goto fail_finish;
 692                 }
 693             }
 694             else if (*SvPVX_const(check) != *s
 695                      || ((slen = SvCUR(check)) > 1
 696                          && memNE(SvPVX_const(check), s, slen)))
 697                 goto report_neq;
 698             check_at = s;
 699             goto success_at_start;
 700           }
 701         }
 702         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 703         s = strpos;
 704         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 705         end_shift = prog->check_end_shift;
 706
 707         if (!ml_anch) {
 708             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 709                                          - (SvTAIL(check) != 0);
 710             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 711
 712             if (end_shift < eshift)
 713                 end_shift = eshift;
 714         }
 715     }
 716     else {                              /* Can match at random position */
 717         ml_anch = 0;
 718         s = strpos;
 719         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 720         end_shift = prog->check_end_shift;
 721
 722         /* end shift should be non negative here */
 723     }
 724
 725 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 726     if (end_shift < 0)
 727         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 728                    (IV)end_shift, RX_PRECOMP(prog));
 729 #endif
 730
 731   restart:
 732     /* Find a possible match in the region s..strend by looking for
 733        the "check" substring in the region corrected by start/end_shift. */
 734
 735     {
 736         I32 srch_start_shift = start_shift;
 737         I32 srch_end_shift = end_shift;
 738         U8* start_point;
 739         U8* end_point;
 740         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 741             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 742             srch_start_shift = strbeg - s;
 743         }
 744     DEBUG_OPTIMISE_MORE_r({
 745         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 746             (IV)prog->check_offset_min,
 747             (IV)srch_start_shift,
 748             (IV)srch_end_shift,
 749             (IV)prog->check_end_shift);
 750     });
 751
 752         if (prog->extflags & RXf_CANY_SEEN) {
 753             start_point= (U8*)(s + srch_start_shift);
 754             end_point= (U8*)(strend - srch_end_shift);
 755         } else {
 756             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 757             end_point= HOP3(strend, -srch_end_shift, strbeg);
 758         }
 759         DEBUG_OPTIMISE_MORE_r({
 760             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 761                 (int)(end_point - start_point),
 762                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 763                 start_point);
 764         });
 765
 766         s = fbm_instr( start_point, end_point,
 767                       check, multiline ? FBMrf_MULTILINE : 0);
 768     }
 769     /* Update the count-of-usability, remove useless subpatterns,
 770         unshift s.  */
 771
 772     DEBUG_EXECUTE_r({
 773         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 774             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 775         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 776                           (s ? "Found" : "Did not find"),
 777             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 778                 ? "anchored" : "floating"),
 779             quoted,
 780             RE_SV_TAIL(check),
 781             (s ? " at offset " : "...\n") );
 782     });
 783
 784     if (!s)
 785         goto fail_finish;
 786     /* Finish the diagnostic message */
 787     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 788
 789     /* XXX dmq: first branch is for positive lookbehind...
 790        Our check string is offset from the beginning of the pattern.
 791        So we need to do any stclass tests offset forward from that
 792        point. I think. :-(
 793      */
 794
 795
 796
 797     check_at=s;
 798
 799
 800     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 801        Start with the other substr.
 802        XXXX no SCREAM optimization yet - and a very coarse implementation
 803        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 804                 *always* match.  Probably should be marked during compile...
 805        Probably it is right to do no SCREAM here...
 806      */
 807
 808     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 809                 : (prog->float_substr && prog->anchored_substr))
 810     {
 811         /* Take into account the "other" substring. */
 812         /* XXXX May be hopelessly wrong for UTF... */
 813         if (!other_last)
 814             other_last = strpos;
 815         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 816           do_other_anchored:
 817             {
 818                 char * const last = HOP3c(s, -start_shift, strbeg);
 819                 char *last1, *last2;
 820                 char * const saved_s = s;
 821                 SV* must;
 822
 823                 t = s - prog->check_offset_max;
 824                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 825                     && (!utf8_target
 826                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 827                             && t > strpos)))
 828                     NOOP;
 829                 else
 830                     t = strpos;
 831                 t = HOP3c(t, prog->anchored_offset, strend);
 832                 if (t < other_last)     /* These positions already checked */
 833                     t = other_last;
 834                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 835                 if (last < last1)
 836                     last1 = last;
 837                 /* XXXX It is not documented what units *_offsets are in.
 838                    We assume bytes, but this is clearly wrong.
 839                    Meaning this code needs to be carefully reviewed for errors.
 840                    dmq.
 841                   */
 842
 843                 /* On end-of-str: see comment below. */
 844                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 845                 if (must == &PL_sv_undef) {
 846                     s = (char*)NULL;
 847                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 848                 }
 849                 else
 850                     s = fbm_instr(
 851                         (unsigned char*)t,
 852                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 853                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 854                         must,
 855                         multiline ? FBMrf_MULTILINE : 0
 856                     );
 857                 DEBUG_EXECUTE_r({
 858                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 859                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 860                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 861                         (s ? "Found" : "Contradicts"),
 862                         quoted, RE_SV_TAIL(must));
 863                 });
 864
 865
 866                 if (!s) {
 867                     if (last1 >= last2) {
 868                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 869                                                 ", giving up...\n"));
 870                         goto fail_finish;
 871                     }
 872                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 873                         ", trying floating at offset %ld...\n",
 874                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 875                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 876                     s = HOP3c(last, 1, strend);
 877                     goto restart;
 878                 }
 879                 else {
 880                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 881                           (long)(s - i_strpos)));
 882                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 883                     other_last = HOP3c(s, 1, strend);
 884                     s = saved_s;
 885                     if (t == strpos)
 886                         goto try_at_start;
 887                     goto try_at_offset;
 888                 }
 889             }
 890         }
 891         else {          /* Take into account the floating substring. */
 892             char *last, *last1;
 893             char * const saved_s = s;
 894             SV* must;
 895
 896             t = HOP3c(s, -start_shift, strbeg);
 897             last1 = last =
 898                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 899             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 900                 last = HOP3c(t, prog->float_max_offset, strend);
 901             s = HOP3c(t, prog->float_min_offset, strend);
 902             if (s < other_last)
 903                 s = other_last;
 904  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 905             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 906             /* fbm_instr() takes into account exact value of end-of-str
 907                if the check is SvTAIL(ed).  Since false positives are OK,
 908                and end-of-str is not later than strend we are OK. */
 909             if (must == &PL_sv_undef) {
 910                 s = (char*)NULL;
 911                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 912             }
 913             else
 914                 s = fbm_instr((unsigned char*)s,
 915                               (unsigned char*)last + SvCUR(must)
 916                                   - (SvTAIL(must)!=0),
 917                               must, multiline ? FBMrf_MULTILINE : 0);
 918             DEBUG_EXECUTE_r({
 919                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 920                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 921                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 922                     (s ? "Found" : "Contradicts"),
 923                     quoted, RE_SV_TAIL(must));
 924             });
 925             if (!s) {
 926                 if (last1 == last) {
 927                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 928                                             ", giving up...\n"));
 929                     goto fail_finish;
 930                 }
 931                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 932                     ", trying anchored starting at offset %ld...\n",
 933                     (long)(saved_s + 1 - i_strpos)));
 934                 other_last = last;
 935                 s = HOP3c(t, 1, strend);
 936                 goto restart;
 937             }
 938             else {
 939                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 940                       (long)(s - i_strpos)));
 941                 other_last = s; /* Fix this later. --Hugo */
 942                 s = saved_s;
 943                 if (t == strpos)
 944                     goto try_at_start;
 945                 goto try_at_offset;
 946             }
 947         }
 948     }
 949
 950
 951     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 952
 953     DEBUG_OPTIMISE_MORE_r(
 954         PerlIO_printf(Perl_debug_log,
 955             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 956             (IV)prog->check_offset_min,
 957             (IV)prog->check_offset_max,
 958             (IV)(s-strpos),
 959             (IV)(t-strpos),
 960             (IV)(t-s),
 961             (IV)(strend-strpos)
 962         )
 963     );
 964
 965     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 966         && (!utf8_target
 967             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 968                  && t > strpos)))
 969     {
 970         /* Fixed substring is found far enough so that the match
 971            cannot start at strpos. */
 972       try_at_offset:
 973         if (ml_anch && t[-1] != '\n') {
 974             /* Eventually fbm_*() should handle this, but often
 975                anchored_offset is not 0, so this check will not be wasted. */
 976             /* XXXX In the code below we prefer to look for "^" even in
 977                presence of anchored substrings.  And we search even
 978                beyond the found float position.  These pessimizations
 979                are historical artefacts only.  */
 980           find_anchor:
 981             while (t < strend - prog->minlen) {
 982                 if (*t == '\n') {
 983                     if (t < check_at - prog->check_offset_min) {
 984                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 985                             /* Since we moved from the found position,
 986                                we definitely contradict the found anchored
 987                                substr.  Due to the above check we do not
 988                                contradict "check" substr.
 989                                Thus we can arrive here only if check substr
 990                                is float.  Redo checking for "other"=="fixed".
 991                              */
 992                             strpos = t + 1;
 993                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 994                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 995                             goto do_other_anchored;
 996                         }
 997                         /* We don't contradict the found floating substring. */
 998                         /* XXXX Why not check for STCLASS? */
 999                         s = t + 1;
1000                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1001                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1002                         goto set_useful;
1003                     }
1004                     /* Position contradicts check-string */
1005                     /* XXXX probably better to look for check-string
1006                        than for "\n", so one should lower the limit for t? */
1007                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1008                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1009                     other_last = strpos = s = t + 1;
1010                     goto restart;
1011                 }
1012                 t++;
1013             }
1014             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1015                         PL_colors[0], PL_colors[1]));
1016             goto fail_finish;
1017         }
1018         else {
1019             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1020                         PL_colors[0], PL_colors[1]));
1021         }
1022         s = t;
1023       set_useful:
1024         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1025     }
1026     else {
1027         /* The found string does not prohibit matching at strpos,
1028            - no optimization of calling REx engine can be performed,
1029            unless it was an MBOL and we are not after MBOL,
1030            or a future STCLASS check will fail this. */
1031       try_at_start:
1032         /* Even in this situation we may use MBOL flag if strpos is offset
1033            wrt the start of the string. */
1034         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1035             && (strpos != strbeg) && strpos[-1] != '\n'
1036             /* May be due to an implicit anchor of m{.*foo}  */
1037             && !(prog->intflags & PREGf_IMPLICIT))
1038         {
1039             t = strpos;
1040             goto find_anchor;
1041         }
1042         DEBUG_EXECUTE_r( if (ml_anch)
1043             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1044                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1045         );
1046       success_at_start:
1047         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1048             && (utf8_target ? (
1049                 prog->check_utf8                /* Could be deleted already */
1050                 && --BmUSEFUL(prog->check_utf8) < 0
1051                 && (prog->check_utf8 == prog->float_utf8)
1052             ) : (
1053                 prog->check_substr              /* Could be deleted already */
1054                 && --BmUSEFUL(prog->check_substr) < 0
1055                 && (prog->check_substr == prog->float_substr)
1056             )))
1057         {
1058             /* If flags & SOMETHING - do not do it many times on the same match */
1059             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1060             /* XXX Does the destruction order has to change with utf8_target? */
1061             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1062             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1063             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1064             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1065             check = NULL;                       /* abort */
1066             s = strpos;
1067             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1068                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1069             if (prog->intflags & PREGf_IMPLICIT)
1070                 prog->extflags &= ~RXf_ANCH_MBOL;
1071             /* XXXX This is a remnant of the old implementation.  It
1072                     looks wasteful, since now INTUIT can use many
1073                     other heuristics. */
1074             prog->extflags &= ~RXf_USE_INTUIT;
1075             /* XXXX What other flags might need to be cleared in this branch? */
1076         }
1077         else
1078             s = strpos;
1079     }
1080
1081     /* Last resort... */
1082     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1083     /* trie stclasses are too expensive to use here, we are better off to
1084        leave it to regmatch itself */
1085     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1086         /* minlen == 0 is possible if regstclass is \b or \B,
1087            and the fixed substr is ''$.
1088            Since minlen is already taken into account, s+1 is before strend;
1089            accidentally, minlen >= 1 guaranties no false positives at s + 1
1090            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1091            regstclass does not come from lookahead...  */
1092         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1093            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1094         const U8* const str = (U8*)STRING(progi->regstclass);
1095         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1096                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1097                     : 1);
1098         char * endpos;
1099         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1100             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1101         else if (prog->float_substr || prog->float_utf8)
1102             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1103         else
1104             endpos= strend;
1105
1106         if (checked_upto < s)
1107            checked_upto = s;
1108         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1109                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1110
1111         t = s;
1112         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1113         if (s) {
1114             checked_upto = s;
1115         } else {
1116 #ifdef DEBUGGING
1117             const char *what = NULL;
1118 #endif
1119             if (endpos == strend) {
1120                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1121                                 "Could not match STCLASS...\n") );
1122                 goto fail;
1123             }
1124             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125                                    "This position contradicts STCLASS...\n") );
1126             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1127                 goto fail;
1128             checked_upto = HOPBACKc(endpos, start_shift);
1129             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1131             /* Contradict one of substrings */
1132             if (prog->anchored_substr || prog->anchored_utf8) {
1133                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1134                     DEBUG_EXECUTE_r( what = "anchored" );
1135                   hop_and_restart:
1136                     s = HOP3c(t, 1, strend);
1137                     if (s + start_shift + end_shift > strend) {
1138                         /* XXXX Should be taken into account earlier? */
1139                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1140                                                "Could not match STCLASS...\n") );
1141                         goto fail;
1142                     }
1143                     if (!check)
1144                         goto giveup;
1145                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146                                 "Looking for %s substr starting at offset %ld...\n",
1147                                  what, (long)(s + start_shift - i_strpos)) );
1148                     goto restart;
1149                 }
1150                 /* Have both, check_string is floating */
1151                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1152                     goto retry_floating_check;
1153                 /* Recheck anchored substring, but not floating... */
1154                 s = check_at;
1155                 if (!check)
1156                     goto giveup;
1157                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1158                           "Looking for anchored substr starting at offset %ld...\n",
1159                           (long)(other_last - i_strpos)) );
1160                 goto do_other_anchored;
1161             }
1162             /* Another way we could have checked stclass at the
1163                current position only: */
1164             if (ml_anch) {
1165                 s = t = t + 1;
1166                 if (!check)
1167                     goto giveup;
1168                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1169                           "Looking for /%s^%s/m starting at offset %ld...\n",
1170                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1171                 goto try_at_offset;
1172             }
1173             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1174                 goto fail;
1175             /* Check is floating substring. */
1176           retry_floating_check:
1177             t = check_at - start_shift;
1178             DEBUG_EXECUTE_r( what = "floating" );
1179             goto hop_and_restart;
1180         }
1181         if (t != s) {
1182             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1183                         "By STCLASS: moving %ld --> %ld\n",
1184                                   (long)(t - i_strpos), (long)(s - i_strpos))
1185                    );
1186         }
1187         else {
1188             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1189                                   "Does not contradict STCLASS...\n");
1190                    );
1191         }
1192     }
1193   giveup:
1194     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1195                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1196                           PL_colors[5], (long)(s - i_strpos)) );
1197     return s;
1198
1199   fail_finish:                          /* Substring not found */
1200     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1201         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1202   fail:
1203     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1204                           PL_colors[4], PL_colors[5]));
1205     return NULL;
1206 }
1207
1208 #define DECL_TRIE_TYPE(scan) \
1209     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1210                     trie_type = ((scan->flags == EXACT) \
1211                               ? (utf8_target ? trie_utf8 : trie_plain) \
1212                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1213
1214 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1215 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1216     STRLEN skiplen;                                                                 \
1217     switch (trie_type) {                                                            \
1218     case trie_utf8_fold:                                                            \
1219         if ( foldlen>0 ) {                                                          \
1220             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1221             foldlen -= len;                                                         \
1222             uscan += len;                                                           \
1223             len=0;                                                                  \
1224         } else {                                                                    \
1225             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1226             len = UTF8SKIP(uc);                                                     \
1227             skiplen = UNISKIP( uvc );                                               \
1228             foldlen -= skiplen;                                                     \
1229             uscan = foldbuf + skiplen;                                              \
1230         }                                                                           \
1231         break;                                                                      \
1232     case trie_latin_utf8_fold:                                                      \
1233         if ( foldlen>0 ) {                                                          \
1234             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1235             foldlen -= len;                                                         \
1236             uscan += len;                                                           \
1237             len=0;                                                                  \
1238         } else {                                                                    \
1239             len = 1;                                                                \
1240             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1241             skiplen = UNISKIP( uvc );                                               \
1242             foldlen -= skiplen;                                                     \
1243             uscan = foldbuf + skiplen;                                              \
1244         }                                                                           \
1245         break;                                                                      \
1246     case trie_utf8:                                                                 \
1247         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1248         break;                                                                      \
1249     case trie_plain:                                                                \
1250         uvc = (UV)*uc;                                                              \
1251         len = 1;                                                                    \
1252     }                                                                               \
1253     if (uvc < 256) {                                                                \
1254         charid = trie->charmap[ uvc ];                                              \
1255     }                                                                               \
1256     else {                                                                          \
1257         charid = 0;                                                                 \
1258         if (widecharmap) {                                                          \
1259             SV** const svpp = hv_fetch(widecharmap,                                 \
1260                         (char*)&uvc, sizeof(UV), 0);                                \
1261             if (svpp)                                                               \
1262                 charid = (U16)SvIV(*svpp);                                          \
1263         }                                                                           \
1264     }                                                                               \
1265 } STMT_END
1266
1267 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1268 STMT_START {                                              \
1269     while (s <= e) {                                      \
1270         if ( (CoNd)                                       \
1271              && (ln == 1 || folder(s, pat_string, ln))    \
1272              && (!reginfo || regtry(reginfo, &s)) )       \
1273             goto got_it;                                  \
1274         s++;                                              \
1275     }                                                     \
1276 } STMT_END
1277
1278 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1279 STMT_START {                                          \
1280     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1281         CoDe                                          \
1282         s += uskip;                                   \
1283     }                                                 \
1284 } STMT_END
1285
1286 #define REXEC_FBC_SCAN(CoDe)                          \
1287 STMT_START {                                          \
1288     while (s < strend) {                              \
1289         CoDe                                          \
1290         s++;                                          \
1291     }                                                 \
1292 } STMT_END
1293
1294 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1295 REXEC_FBC_UTF8_SCAN(                                  \
1296     if (CoNd) {                                       \
1297         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1298             goto got_it;                              \
1299         else                                          \
1300             tmp = doevery;                            \
1301     }                                                 \
1302     else                                              \
1303         tmp = 1;                                      \
1304 )
1305
1306 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1307 REXEC_FBC_SCAN(                                       \
1308     if (CoNd) {                                       \
1309         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1310             goto got_it;                              \
1311         else                                          \
1312             tmp = doevery;                            \
1313     }                                                 \
1314     else                                              \
1315         tmp = 1;                                      \
1316 )
1317
1318 #define REXEC_FBC_TRYIT               \
1319 if ((!reginfo || regtry(reginfo, &s))) \
1320     goto got_it
1321
1322 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1323     if (utf8_target) {                                             \
1324         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1325     }                                                          \
1326     else {                                                     \
1327         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1328     }
1329
1330 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1331     if (utf8_target) {                                             \
1332         UtFpReLoAd;                                            \
1333         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1334     }                                                          \
1335     else {                                                     \
1336         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1337     }
1338
1339 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1340     PL_reg_flags |= RF_tainted;                                \
1341     if (utf8_target) {                                             \
1342         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1343     }                                                          \
1344     else {                                                     \
1345         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1346     }
1347
1348 #define DUMP_EXEC_POS(li,s,doutf8) \
1349     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1350
1351
1352 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1353         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1354         tmp = TEST_NON_UTF8(tmp);                                              \
1355         REXEC_FBC_UTF8_SCAN(                                                   \
1356             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1357                 tmp = !tmp;                                                    \
1358                 IF_SUCCESS;                                                    \
1359             }                                                                  \
1360             else {                                                             \
1361                 IF_FAIL;                                                       \
1362             }                                                                  \
1363         );                                                                     \
1364
1365 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1366         if (s == PL_bostr) {                                                   \
1367             tmp = '\n';                                                        \
1368         }                                                                      \
1369         else {                                                                 \
1370             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1371             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1372         }                                                                      \
1373         tmp = TeSt1_UtF8;                                                      \
1374         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1375         REXEC_FBC_UTF8_SCAN(                                                   \
1376             if (tmp == ! (TeSt2_UtF8)) { \
1377                 tmp = !tmp;                                                    \
1378                 IF_SUCCESS;                                                    \
1379             }                                                                  \
1380             else {                                                             \
1381                 IF_FAIL;                                                       \
1382             }                                                                  \
1383         );                                                                     \
1384
1385 /* The only difference between the BOUND and NBOUND cases is that
1386  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1387  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1388  * with the other one being empty */
1389 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1390     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1391
1392 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1393     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1394
1395 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1396     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1397
1398 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1400
1401
1402 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1403  * be passed in completely with the variable name being tested, which isn't
1404  * such a clean interface, but this is easier to read than it was before.  We
1405  * are looking for the boundary (or non-boundary between a word and non-word
1406  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1407  * must be different.  Find the "wordness" of the character just prior to this
1408  * one, and compare it with the wordness of this one.  If they differ, we have
1409  * a boundary.  At the beginning of the string, pretend that the previous
1410  * character was a new-line */
1411 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1412     if (utf8_target) {                                                         \
1413                 UTF8_CODE \
1414     }                                                                          \
1415     else {  /* Not utf8 */                                                     \
1416         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1417         tmp = TEST_NON_UTF8(tmp);                                              \
1418         REXEC_FBC_SCAN(                                                        \
1419             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1420                 tmp = !tmp;                                                    \
1421                 IF_SUCCESS;                                                    \
1422             }                                                                  \
1423             else {                                                             \
1424                 IF_FAIL;                                                       \
1425             }                                                                  \
1426         );                                                                     \
1427     }                                                                          \
1428     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1429         goto got_it;
1430
1431 /* We know what class REx starts with.  Try to find this position... */
1432 /* if reginfo is NULL, its a dryrun */
1433 /* annoyingly all the vars in this routine have different names from their counterparts
1434    in regmatch. /grrr */
1435
1436 STATIC char *
1437 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1438     const char *strend, regmatch_info *reginfo)
1439 {
1440         dVAR;
1441         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1442         char *pat_string;   /* The pattern's exactish string */
1443         char *pat_end;      /* ptr to end char of pat_string */
1444         re_fold_t folder;       /* Function for computing non-utf8 folds */
1445         const U8 *fold_array;   /* array for folding ords < 256 */
1446         STRLEN ln;
1447         STRLEN lnc;
1448         STRLEN uskip;
1449         U8 c1;
1450         U8 c2;
1451         char *e;
1452         I32 tmp = 1;    /* Scratch variable? */
1453         const bool utf8_target = PL_reg_match_utf8;
1454         UV utf8_fold_flags = 0;
1455         RXi_GET_DECL(prog,progi);
1456
1457         PERL_ARGS_ASSERT_FIND_BYCLASS;
1458
1459         /* We know what class it must start with. */
1460         switch (OP(c)) {
1461         case ANYOF:
1462             if (utf8_target) {
1463                 REXEC_FBC_UTF8_CLASS_SCAN(
1464                           reginclass(prog, c, (U8*)s, utf8_target));
1465             }
1466             else {
1467                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1468             }
1469             break;
1470         case CANY:
1471             REXEC_FBC_SCAN(
1472                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1473                     goto got_it;
1474                 else
1475                     tmp = doevery;
1476             );
1477             break;
1478
1479         case EXACTFA:
1480             if (UTF_PATTERN || utf8_target) {
1481                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1482                 goto do_exactf_utf8;
1483             }
1484             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1485             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1486             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1487
1488         case EXACTF:
1489             if (utf8_target) {
1490
1491                 /* regcomp.c already folded this if pattern is in UTF-8 */
1492                 utf8_fold_flags = 0;
1493                 goto do_exactf_utf8;
1494             }
1495             fold_array = PL_fold;
1496             folder = foldEQ;
1497             goto do_exactf_non_utf8;
1498
1499         case EXACTFL:
1500             if (UTF_PATTERN || utf8_target) {
1501                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1502                 goto do_exactf_utf8;
1503             }
1504             fold_array = PL_fold_locale;
1505             folder = foldEQ_locale;
1506             goto do_exactf_non_utf8;
1507
1508         case EXACTFU_SS:
1509             if (UTF_PATTERN) {
1510                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1511             }
1512             goto do_exactf_utf8;
1513
1514         case EXACTFU_TRICKYFOLD:
1515         case EXACTFU:
1516             if (UTF_PATTERN || utf8_target) {
1517                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1518                 goto do_exactf_utf8;
1519             }
1520
1521             /* Any 'ss' in the pattern should have been replaced by regcomp,
1522              * so we don't have to worry here about this single special case
1523              * in the Latin1 range */
1524             fold_array = PL_fold_latin1;
1525             folder = foldEQ_latin1;
1526
1527             /* FALL THROUGH */
1528
1529         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1530                                are no glitches with fold-length differences
1531                                between the target string and pattern */
1532
1533             /* The idea in the non-utf8 EXACTF* cases is to first find the
1534              * first character of the EXACTF* node and then, if necessary,
1535              * case-insensitively compare the full text of the node.  c1 is the
1536              * first character.  c2 is its fold.  This logic will not work for
1537              * Unicode semantics and the german sharp ss, which hence should
1538              * not be compiled into a node that gets here. */
1539             pat_string = STRING(c);
1540             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1541
1542             /* We know that we have to match at least 'ln' bytes (which is the
1543              * same as characters, since not utf8).  If we have to match 3
1544              * characters, and there are only 2 availabe, we know without
1545              * trying that it will fail; so don't start a match past the
1546              * required minimum number from the far end */
1547             e = HOP3c(strend, -((I32)ln), s);
1548
1549             if (!reginfo && e < s) {
1550                 e = s;                  /* Due to minlen logic of intuit() */
1551             }
1552
1553             c1 = *pat_string;
1554             c2 = fold_array[c1];
1555             if (c1 == c2) { /* If char and fold are the same */
1556                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1557             }
1558             else {
1559                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1560             }
1561             break;
1562
1563         do_exactf_utf8:
1564         {
1565             unsigned expansion;
1566
1567
1568             /* If one of the operands is in utf8, we can't use the simpler
1569              * folding above, due to the fact that many different characters
1570              * can have the same fold, or portion of a fold, or different-
1571              * length fold */
1572             pat_string = STRING(c);
1573             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1574             pat_end = pat_string + ln;
1575             lnc = (UTF_PATTERN) /* length to match in characters */
1576                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1577                     : ln;
1578
1579             /* We have 'lnc' characters to match in the pattern, but because of
1580              * multi-character folding, each character in the target can match
1581              * up to 3 characters (Unicode guarantees it will never exceed
1582              * this) if it is utf8-encoded; and up to 2 if not (based on the
1583              * fact that the Latin 1 folds are already determined, and the
1584              * only multi-char fold in that range is the sharp-s folding to
1585              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1586              * string character.  Adjust lnc accordingly, rounding up, so that
1587              * if we need to match at least 4+1/3 chars, that really is 5. */
1588             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1589             lnc = (lnc + expansion - 1) / expansion;
1590
1591             /* As in the non-UTF8 case, if we have to match 3 characters, and
1592              * only 2 are left, it's guaranteed to fail, so don't start a
1593              * match that would require us to go beyond the end of the string
1594              */
1595             e = HOP3c(strend, -((I32)lnc), s);
1596
1597             if (!reginfo && e < s) {
1598                 e = s;                  /* Due to minlen logic of intuit() */
1599             }
1600
1601             /* XXX Note that we could recalculate e to stop the loop earlier,
1602              * as the worst case expansion above will rarely be met, and as we
1603              * go along we would usually find that e moves further to the left.
1604              * This would happen only after we reached the point in the loop
1605              * where if there were no expansion we should fail.  Unclear if
1606              * worth the expense */
1607
1608             while (s <= e) {
1609                 char *my_strend= (char *)strend;
1610                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1611                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1612                     && (!reginfo || regtry(reginfo, &s)) )
1613                 {
1614                     goto got_it;
1615                 }
1616                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1617             }
1618             break;
1619         }
1620         case BOUNDL:
1621             PL_reg_flags |= RF_tainted;
1622             FBC_BOUND(isALNUM_LC,
1623                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1624                       isALNUM_LC_utf8((U8*)s));
1625             break;
1626         case NBOUNDL:
1627             PL_reg_flags |= RF_tainted;
1628             FBC_NBOUND(isALNUM_LC,
1629                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1630                        isALNUM_LC_utf8((U8*)s));
1631             break;
1632         case BOUND:
1633             FBC_BOUND(isWORDCHAR,
1634                       isALNUM_uni(tmp),
1635                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1636             break;
1637         case BOUNDA:
1638             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1639                              isWORDCHAR_A(tmp),
1640                              isWORDCHAR_A((U8*)s));
1641             break;
1642         case NBOUND:
1643             FBC_NBOUND(isWORDCHAR,
1644                        isALNUM_uni(tmp),
1645                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1646             break;
1647         case NBOUNDA:
1648             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1649                               isWORDCHAR_A(tmp),
1650                               isWORDCHAR_A((U8*)s));
1651             break;
1652         case BOUNDU:
1653             FBC_BOUND(isWORDCHAR_L1,
1654                       isALNUM_uni(tmp),
1655                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1656             break;
1657         case NBOUNDU:
1658             FBC_NBOUND(isWORDCHAR_L1,
1659                        isALNUM_uni(tmp),
1660                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1661             break;
1662         case ALNUML:
1663             REXEC_FBC_CSCAN_TAINT(
1664                 isALNUM_LC_utf8((U8*)s),
1665                 isALNUM_LC(*s)
1666             );
1667             break;
1668         case ALNUMU:
1669             REXEC_FBC_CSCAN_PRELOAD(
1670                 LOAD_UTF8_CHARCLASS_ALNUM(),
1671                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1672                 isWORDCHAR_L1((U8) *s)
1673             );
1674             break;
1675         case ALNUM:
1676             REXEC_FBC_CSCAN_PRELOAD(
1677                 LOAD_UTF8_CHARCLASS_ALNUM(),
1678                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1679                 isWORDCHAR((U8) *s)
1680             );
1681             break;
1682         case ALNUMA:
1683             /* Don't need to worry about utf8, as it can match only a single
1684              * byte invariant character */
1685             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1686             break;
1687         case NALNUMU:
1688             REXEC_FBC_CSCAN_PRELOAD(
1689                 LOAD_UTF8_CHARCLASS_ALNUM(),
1690                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1691                 ! isWORDCHAR_L1((U8) *s)
1692             );
1693             break;
1694         case NALNUM:
1695             REXEC_FBC_CSCAN_PRELOAD(
1696                 LOAD_UTF8_CHARCLASS_ALNUM(),
1697                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1698                 ! isALNUM(*s)
1699             );
1700             break;
1701         case NALNUMA:
1702             REXEC_FBC_CSCAN(
1703                 !isWORDCHAR_A(*s),
1704                 !isWORDCHAR_A(*s)
1705             );
1706             break;
1707         case NALNUML:
1708             REXEC_FBC_CSCAN_TAINT(
1709                 !isALNUM_LC_utf8((U8*)s),
1710                 !isALNUM_LC(*s)
1711             );
1712             break;
1713         case SPACEU:
1714             REXEC_FBC_CSCAN(
1715                 is_XPERLSPACE_utf8(s),
1716                 isSPACE_L1((U8) *s)
1717             );
1718             break;
1719         case SPACE:
1720             REXEC_FBC_CSCAN(
1721                 is_XPERLSPACE_utf8(s),
1722                 isSPACE((U8) *s)
1723             );
1724             break;
1725         case SPACEA:
1726             /* Don't need to worry about utf8, as it can match only a single
1727              * byte invariant character */
1728             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1729             break;
1730         case SPACEL:
1731             REXEC_FBC_CSCAN_TAINT(
1732                 isSPACE_LC_utf8((U8*)s),
1733                 isSPACE_LC(*s)
1734             );
1735             break;
1736         case NSPACEU:
1737             REXEC_FBC_CSCAN(
1738                 ! is_XPERLSPACE_utf8(s),
1739                 ! isSPACE_L1((U8) *s)
1740             );
1741             break;
1742         case NSPACE:
1743             REXEC_FBC_CSCAN(
1744                 ! is_XPERLSPACE_utf8(s),
1745                 ! isSPACE((U8) *s)
1746             );
1747             break;
1748         case NSPACEA:
1749             REXEC_FBC_CSCAN(
1750                 !isSPACE_A(*s),
1751                 !isSPACE_A(*s)
1752             );
1753             break;
1754         case NSPACEL:
1755             REXEC_FBC_CSCAN_TAINT(
1756                 !isSPACE_LC_utf8((U8*)s),
1757                 !isSPACE_LC(*s)
1758             );
1759             break;
1760         case DIGIT:
1761             REXEC_FBC_CSCAN_PRELOAD(
1762                 LOAD_UTF8_CHARCLASS_DIGIT(),
1763                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1764                 isDIGIT(*s)
1765             );
1766             break;
1767         case DIGITA:
1768             /* Don't need to worry about utf8, as it can match only a single
1769              * byte invariant character */
1770             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1771             break;
1772         case DIGITL:
1773             REXEC_FBC_CSCAN_TAINT(
1774                 isDIGIT_LC_utf8((U8*)s),
1775                 isDIGIT_LC(*s)
1776             );
1777             break;
1778         case NDIGIT:
1779             REXEC_FBC_CSCAN_PRELOAD(
1780                 LOAD_UTF8_CHARCLASS_DIGIT(),
1781                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1782                 !isDIGIT(*s)
1783             );
1784             break;
1785         case NDIGITA:
1786             REXEC_FBC_CSCAN(
1787                 !isDIGIT_A(*s),
1788                 !isDIGIT_A(*s)
1789             );
1790             break;
1791         case NDIGITL:
1792             REXEC_FBC_CSCAN_TAINT(
1793                 !isDIGIT_LC_utf8((U8*)s),
1794                 !isDIGIT_LC(*s)
1795             );
1796             break;
1797         case LNBREAK:
1798             REXEC_FBC_CSCAN(
1799                 is_LNBREAK_utf8_safe(s, strend),
1800                 is_LNBREAK_latin1_safe(s, strend)
1801             );
1802             break;
1803         case VERTWS:
1804             REXEC_FBC_CSCAN(
1805                 is_VERTWS_utf8_safe(s, strend),
1806                 is_VERTWS_latin1_safe(s, strend)
1807             );
1808             break;
1809         case NVERTWS:
1810             REXEC_FBC_CSCAN(
1811                 !is_VERTWS_utf8_safe(s, strend),
1812                 !is_VERTWS_latin1_safe(s, strend)
1813             );
1814             break;
1815         case HORIZWS:
1816             REXEC_FBC_CSCAN(
1817                 is_HORIZWS_utf8_safe(s, strend),
1818                 is_HORIZWS_latin1_safe(s, strend)
1819             );
1820             break;
1821         case NHORIZWS:
1822             REXEC_FBC_CSCAN(
1823                 !is_HORIZWS_utf8_safe(s, strend),
1824                 !is_HORIZWS_latin1_safe(s, strend)
1825             );
1826             break;
1827         case POSIXA:
1828             /* Don't need to worry about utf8, as it can match only a single
1829             * byte invariant character.  The flag in this node type is the
1830             * class number to pass to _generic_isCC() to build a mask for
1831             * searching in PL_charclass[] */
1832             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1833             break;
1834         case NPOSIXA:
1835             REXEC_FBC_CSCAN(
1836                 !_generic_isCC_A(*s, FLAGS(c)),
1837                 !_generic_isCC_A(*s, FLAGS(c))
1838             );
1839             break;
1840
1841         case AHOCORASICKC:
1842         case AHOCORASICK:
1843             {
1844                 DECL_TRIE_TYPE(c);
1845                 /* what trie are we using right now */
1846                 reg_ac_data *aho
1847                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1848                 reg_trie_data *trie
1849                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1850                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1851
1852                 const char *last_start = strend - trie->minlen;
1853 #ifdef DEBUGGING
1854                 const char *real_start = s;
1855 #endif
1856                 STRLEN maxlen = trie->maxlen;
1857                 SV *sv_points;
1858                 U8 **points; /* map of where we were in the input string
1859                                 when reading a given char. For ASCII this
1860                                 is unnecessary overhead as the relationship
1861                                 is always 1:1, but for Unicode, especially
1862                                 case folded Unicode this is not true. */
1863                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1864                 U8 *bitmap=NULL;
1865
1866
1867                 GET_RE_DEBUG_FLAGS_DECL;
1868
1869                 /* We can't just allocate points here. We need to wrap it in
1870                  * an SV so it gets freed properly if there is a croak while
1871                  * running the match */
1872                 ENTER;
1873                 SAVETMPS;
1874                 sv_points=newSV(maxlen * sizeof(U8 *));
1875                 SvCUR_set(sv_points,
1876                     maxlen * sizeof(U8 *));
1877                 SvPOK_on(sv_points);
1878                 sv_2mortal(sv_points);
1879                 points=(U8**)SvPV_nolen(sv_points );
1880                 if ( trie_type != trie_utf8_fold
1881                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882                 {
1883                     if (trie->bitmap)
1884                         bitmap=(U8*)trie->bitmap;
1885                     else
1886                         bitmap=(U8*)ANYOF_BITMAP(c);
1887                 }
1888                 /* this is the Aho-Corasick algorithm modified a touch
1889                    to include special handling for long "unknown char"
1890                    sequences. The basic idea being that we use AC as long
1891                    as we are dealing with a possible matching char, when
1892                    we encounter an unknown char (and we have not encountered
1893                    an accepting state) we scan forward until we find a legal
1894                    starting char.
1895                    AC matching is basically that of trie matching, except
1896                    that when we encounter a failing transition, we fall back
1897                    to the current states "fail state", and try the current char
1898                    again, a process we repeat until we reach the root state,
1899                    state 1, or a legal transition. If we fail on the root state
1900                    then we can either terminate if we have reached an accepting
1901                    state previously, or restart the entire process from the beginning
1902                    if we have not.
1903
1904                  */
1905                 while (s <= last_start) {
1906                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1907                     U8 *uc = (U8*)s;
1908                     U16 charid = 0;
1909                     U32 base = 1;
1910                     U32 state = 1;
1911                     UV uvc = 0;
1912                     STRLEN len = 0;
1913                     STRLEN foldlen = 0;
1914                     U8 *uscan = (U8*)NULL;
1915                     U8 *leftmost = NULL;
1916 #ifdef DEBUGGING
1917                     U32 accepted_word= 0;
1918 #endif
1919                     U32 pointpos = 0;
1920
1921                     while ( state && uc <= (U8*)strend ) {
1922                         int failed=0;
1923                         U32 word = aho->states[ state ].wordnum;
1924
1925                         if( state==1 ) {
1926                             if ( bitmap ) {
1927                                 DEBUG_TRIE_EXECUTE_r(
1928                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1930                                             (char *)uc, utf8_target );
1931                                         PerlIO_printf( Perl_debug_log,
1932                                             " Scanning for legal start char...\n");
1933                                     }
1934                                 );
1935                                 if (utf8_target) {
1936                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1937                                         uc += UTF8SKIP(uc);
1938                                     }
1939                                 } else {
1940                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1941                                         uc++;
1942                                     }
1943                                 }
1944                                 s= (char *)uc;
1945                             }
1946                             if (uc >(U8*)last_start) break;
1947                         }
1948
1949                         if ( word ) {
1950                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1951                             if (!leftmost || lpos < leftmost) {
1952                                 DEBUG_r(accepted_word=word);
1953                                 leftmost= lpos;
1954                             }
1955                             if (base==0) break;
1956
1957                         }
1958                         points[pointpos++ % maxlen]= uc;
1959                         if (foldlen || uc < (U8*)strend) {
1960                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1961                                              widecharmap, uc,
1962                                              uscan, len, uvc, charid, foldlen,
1963                                              foldbuf, uniflags);
1964                             DEBUG_TRIE_EXECUTE_r({
1965                                 dump_exec_pos( (char *)uc, c, strend,
1966                                             real_start, s, utf8_target);
1967                                 PerlIO_printf(Perl_debug_log,
1968                                     " Charid:%3u CP:%4"UVxf" ",
1969                                      charid, uvc);
1970                             });
1971                         }
1972                         else {
1973                             len = 0;
1974                             charid = 0;
1975                         }
1976
1977
1978                         do {
1979 #ifdef DEBUGGING
1980                             word = aho->states[ state ].wordnum;
1981 #endif
1982                             base = aho->states[ state ].trans.base;
1983
1984                             DEBUG_TRIE_EXECUTE_r({
1985                                 if (failed)
1986                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1987                                         s,   utf8_target );
1988                                 PerlIO_printf( Perl_debug_log,
1989                                     "%sState: %4"UVxf", word=%"UVxf,
1990                                     failed ? " Fail transition to " : "",
1991                                     (UV)state, (UV)word);
1992                             });
1993                             if ( base ) {
1994                                 U32 tmp;
1995                                 I32 offset;
1996                                 if (charid &&
1997                                      ( ((offset = base + charid
1998                                         - 1 - trie->uniquecharcount)) >= 0)
1999                                      && ((U32)offset < trie->lasttrans)
2000                                      && trie->trans[offset].check == state
2001                                      && (tmp=trie->trans[offset].next))
2002                                 {
2003                                     DEBUG_TRIE_EXECUTE_r(
2004                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2005                                     state = tmp;
2006                                     break;
2007                                 }
2008                                 else {
2009                                     DEBUG_TRIE_EXECUTE_r(
2010                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2011                                     failed = 1;
2012                                     state = aho->fail[state];
2013                                 }
2014                             }
2015                             else {
2016                                 /* we must be accepting here */
2017                                 DEBUG_TRIE_EXECUTE_r(
2018                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2019                                 failed = 1;
2020                                 break;
2021                             }
2022                         } while(state);
2023                         uc += len;
2024                         if (failed) {
2025                             if (leftmost)
2026                                 break;
2027                             if (!state) state = 1;
2028                         }
2029                     }
2030                     if ( aho->states[ state ].wordnum ) {
2031                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2032                         if (!leftmost || lpos < leftmost) {
2033                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2034                             leftmost = lpos;
2035                         }
2036                     }
2037                     if (leftmost) {
2038                         s = (char*)leftmost;
2039                         DEBUG_TRIE_EXECUTE_r({
2040                             PerlIO_printf(
2041                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2042                                 (UV)accepted_word, (IV)(s - real_start)
2043                             );
2044                         });
2045                         if (!reginfo || regtry(reginfo, &s)) {
2046                             FREETMPS;
2047                             LEAVE;
2048                             goto got_it;
2049                         }
2050                         s = HOPc(s,1);
2051                         DEBUG_TRIE_EXECUTE_r({
2052                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2053                         });
2054                     } else {
2055                         DEBUG_TRIE_EXECUTE_r(
2056                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2057                         break;
2058                     }
2059                 }
2060                 FREETMPS;
2061                 LEAVE;
2062             }
2063             break;
2064         default:
2065             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2066             break;
2067         }
2068         return 0;
2069       got_it:
2070         return s;
2071 }
2072
2073
2074 /*
2075  - regexec_flags - match a regexp against a string
2076  */
2077 I32
2078 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2079               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2080 /* stringarg: the point in the string at which to begin matching */
2081 /* strend:    pointer to null at end of string */
2082 /* strbeg:    real beginning of string */
2083 /* minend:    end of match must be >= minend bytes after stringarg. */
2084 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2085  *            itself is accessed via the pointers above */
2086 /* data:      May be used for some additional optimizations.
2087               Currently its only used, with a U32 cast, for transmitting
2088               the ganch offset when doing a /g match. This will change */
2089 /* nosave:    For optimizations. */
2090
2091 {
2092     dVAR;
2093     struct regexp *const prog = ReANY(rx);
2094     char *s;
2095     regnode *c;
2096     char *startpos = stringarg;
2097     I32 minlen;         /* must match at least this many chars */
2098     I32 dontbother = 0; /* how many characters not to try at end */
2099     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2100     I32 scream_pos = -1;                /* Internal iterator of scream. */
2101     char *scream_olds = NULL;
2102     const bool utf8_target = cBOOL(DO_UTF8(sv));
2103     I32 multiline;
2104     RXi_GET_DECL(prog,progi);
2105     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2106     regexp_paren_pair *swap = NULL;
2107     GET_RE_DEBUG_FLAGS_DECL;
2108
2109     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2110     PERL_UNUSED_ARG(data);
2111
2112     /* Be paranoid... */
2113     if (prog == NULL || startpos == NULL) {
2114         Perl_croak(aTHX_ "NULL regexp parameter");
2115         return 0;
2116     }
2117
2118     multiline = prog->extflags & RXf_PMf_MULTILINE;
2119     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2120
2121     RX_MATCH_UTF8_set(rx, utf8_target);
2122     DEBUG_EXECUTE_r(
2123         debug_start_match(rx, utf8_target, startpos, strend,
2124         "Matching");
2125     );
2126
2127     minlen = prog->minlen;
2128
2129     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2130         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2131                               "String too short [regexec_flags]...\n"));
2132         goto phooey;
2133     }
2134
2135
2136     /* Check validity of program. */
2137     if (UCHARAT(progi->program) != REG_MAGIC) {
2138         Perl_croak(aTHX_ "corrupted regexp program");
2139     }
2140
2141     PL_reg_flags = 0;
2142     PL_reg_state.re_state_eval_setup_done = FALSE;
2143     PL_reg_maxiter = 0;
2144
2145     if (RX_UTF8(rx))
2146         PL_reg_flags |= RF_utf8;
2147
2148     /* Mark beginning of line for ^ and lookbehind. */
2149     reginfo.bol = startpos; /* XXX not used ??? */
2150     PL_bostr  = strbeg;
2151     reginfo.sv = sv;
2152
2153     /* Mark end of line for $ (and such) */
2154     PL_regeol = strend;
2155
2156     /* see how far we have to get to not match where we matched before */
2157     reginfo.till = startpos+minend;
2158
2159     /* If there is a "must appear" string, look for it. */
2160     s = startpos;
2161
2162     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2163         MAGIC *mg;
2164         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2165             reginfo.ganch = startpos + prog->gofs;
2166             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2168         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2169                   && SvMAGIC(sv)
2170                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2171                   && mg->mg_len >= 0) {
2172             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2173             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2174                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2175
2176             if (prog->extflags & RXf_ANCH_GPOS) {
2177                 if (s > reginfo.ganch)
2178                     goto phooey;
2179                 s = reginfo.ganch - prog->gofs;
2180                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2181                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2182                 if (s < strbeg)
2183                     goto phooey;
2184             }
2185         }
2186         else if (data) {
2187             reginfo.ganch = strbeg + PTR2UV(data);
2188             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2190
2191         } else {                                /* pos() not defined */
2192             reginfo.ganch = strbeg;
2193             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194                  "GPOS: reginfo.ganch = strbeg\n"));
2195         }
2196     }
2197     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2198         /* We have to be careful. If the previous successful match
2199            was from this regex we don't want a subsequent partially
2200            successful match to clobber the old results.
2201            So when we detect this possibility we add a swap buffer
2202            to the re, and switch the buffer each match. If we fail
2203            we switch it back, otherwise we leave it swapped.
2204         */
2205         swap = prog->offs;
2206         /* do we need a save destructor here for eval dies? */
2207         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2208         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2209             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2210             PTR2UV(prog),
2211             PTR2UV(swap),
2212             PTR2UV(prog->offs)
2213         ));
2214     }
2215     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2216         re_scream_pos_data d;
2217
2218         d.scream_olds = &scream_olds;
2219         d.scream_pos = &scream_pos;
2220         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2221         if (!s) {
2222             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2223             goto phooey;        /* not present */
2224         }
2225     }
2226
2227
2228
2229     /* Simplest case:  anchored match need be tried only once. */
2230     /*  [unless only anchor is BOL and multiline is set] */
2231     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2232         if (s == startpos && regtry(&reginfo, &startpos))
2233             goto got_it;
2234         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2235                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2236         {
2237             char *end;
2238
2239             if (minlen)
2240                 dontbother = minlen - 1;
2241             end = HOP3c(strend, -dontbother, strbeg) - 1;
2242             /* for multiline we only have to try after newlines */
2243             if (prog->check_substr || prog->check_utf8) {
2244                 /* because of the goto we can not easily reuse the macros for bifurcating the
2245                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2246                 if (utf8_target) {
2247                     if (s == startpos)
2248                         goto after_try_utf8;
2249                     while (1) {
2250                         if (regtry(&reginfo, &s)) {
2251                             goto got_it;
2252                         }
2253                       after_try_utf8:
2254                         if (s > end) {
2255                             goto phooey;
2256                         }
2257                         if (prog->extflags & RXf_USE_INTUIT) {
2258                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2259                             if (!s) {
2260                                 goto phooey;
2261                             }
2262                         }
2263                         else {
2264                             s += UTF8SKIP(s);
2265                         }
2266                     }
2267                 } /* end search for check string in unicode */
2268                 else {
2269                     if (s == startpos) {
2270                         goto after_try_latin;
2271                     }
2272                     while (1) {
2273                         if (regtry(&reginfo, &s)) {
2274                             goto got_it;
2275                         }
2276                       after_try_latin:
2277                         if (s > end) {
2278                             goto phooey;
2279                         }
2280                         if (prog->extflags & RXf_USE_INTUIT) {
2281                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2282                             if (!s) {
2283                                 goto phooey;
2284                             }
2285                         }
2286                         else {
2287                             s++;
2288                         }
2289                     }
2290                 } /* end search for check string in latin*/
2291             } /* end search for check string */
2292             else { /* search for newline */
2293                 if (s > startpos) {
2294                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2295                     s--;
2296                 }
2297                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2298                 while (s <= end) { /* note it could be possible to match at the end of the string */
2299                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2300                         if (regtry(&reginfo, &s))
2301                             goto got_it;
2302                     }
2303                 }
2304             } /* end search for newline */
2305         } /* end anchored/multiline check string search */
2306         goto phooey;
2307     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2308     {
2309         /* the warning about reginfo.ganch being used without initialization
2310            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2311            and we only enter this block when the same bit is set. */
2312         char *tmp_s = reginfo.ganch - prog->gofs;
2313
2314         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2315             goto got_it;
2316         goto phooey;
2317     }
2318
2319     /* Messy cases:  unanchored match. */
2320     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2321         /* we have /x+whatever/ */
2322         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2323         char ch;
2324 #ifdef DEBUGGING
2325         int did_match = 0;
2326 #endif
2327         if (utf8_target) {
2328             if (! prog->anchored_utf8) {
2329                 to_utf8_substr(prog);
2330             }
2331             ch = SvPVX_const(prog->anchored_utf8)[0];
2332             REXEC_FBC_SCAN(
2333                 if (*s == ch) {
2334                     DEBUG_EXECUTE_r( did_match = 1 );
2335                     if (regtry(&reginfo, &s)) goto got_it;
2336                     s += UTF8SKIP(s);
2337                     while (s < strend && *s == ch)
2338                         s += UTF8SKIP(s);
2339                 }
2340             );
2341
2342         }
2343         else {
2344             if (! prog->anchored_substr) {
2345                 if (! to_byte_substr(prog)) {
2346                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2347                 }
2348             }
2349             ch = SvPVX_const(prog->anchored_substr)[0];
2350             REXEC_FBC_SCAN(
2351                 if (*s == ch) {
2352                     DEBUG_EXECUTE_r( did_match = 1 );
2353                     if (regtry(&reginfo, &s)) goto got_it;
2354                     s++;
2355                     while (s < strend && *s == ch)
2356                         s++;
2357                 }
2358             );
2359         }
2360         DEBUG_EXECUTE_r(if (!did_match)
2361                 PerlIO_printf(Perl_debug_log,
2362                                   "Did not find anchored character...\n")
2363                );
2364     }
2365     else if (prog->anchored_substr != NULL
2366               || prog->anchored_utf8 != NULL
2367               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2368                   && prog->float_max_offset < strend - s)) {
2369         SV *must;
2370         I32 back_max;
2371         I32 back_min;
2372         char *last;
2373         char *last1;            /* Last position checked before */
2374 #ifdef DEBUGGING
2375         int did_match = 0;
2376 #endif
2377         if (prog->anchored_substr || prog->anchored_utf8) {
2378             if (utf8_target) {
2379                 if (! prog->anchored_utf8) {
2380                     to_utf8_substr(prog);
2381                 }
2382                 must = prog->anchored_utf8;
2383             }
2384             else {
2385                 if (! prog->anchored_substr) {
2386                     if (! to_byte_substr(prog)) {
2387                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2388                     }
2389                 }
2390                 must = prog->anchored_substr;
2391             }
2392             back_max = back_min = prog->anchored_offset;
2393         } else {
2394             if (utf8_target) {
2395                 if (! prog->float_utf8) {
2396                     to_utf8_substr(prog);
2397                 }
2398                 must = prog->float_utf8;
2399             }
2400             else {
2401                 if (! prog->float_substr) {
2402                     if (! to_byte_substr(prog)) {
2403                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2404                     }
2405                 }
2406                 must = prog->float_substr;
2407             }
2408             back_max = prog->float_max_offset;
2409             back_min = prog->float_min_offset;
2410         }
2411
2412         if (back_min<0) {
2413             last = strend;
2414         } else {
2415             last = HOP3c(strend,        /* Cannot start after this */
2416                   -(I32)(CHR_SVLEN(must)
2417                          - (SvTAIL(must) != 0) + back_min), strbeg);
2418         }
2419         if (s > PL_bostr)
2420             last1 = HOPc(s, -1);
2421         else
2422             last1 = s - 1;      /* bogus */
2423
2424         /* XXXX check_substr already used to find "s", can optimize if
2425            check_substr==must. */
2426         scream_pos = -1;
2427         dontbother = end_shift;
2428         strend = HOPc(strend, -dontbother);
2429         while ( (s <= last) &&
2430                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2431                                   (unsigned char*)strend, must,
2432                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2433             DEBUG_EXECUTE_r( did_match = 1 );
2434             if (HOPc(s, -back_max) > last1) {
2435                 last1 = HOPc(s, -back_min);
2436                 s = HOPc(s, -back_max);
2437             }
2438             else {
2439                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2440
2441                 last1 = HOPc(s, -back_min);
2442                 s = t;
2443             }
2444             if (utf8_target) {
2445                 while (s <= last1) {
2446                     if (regtry(&reginfo, &s))
2447                         goto got_it;
2448                     if (s >= last1) {
2449                         s++; /* to break out of outer loop */
2450                         break;
2451                     }
2452                     s += UTF8SKIP(s);
2453                 }
2454             }
2455             else {
2456                 while (s <= last1) {
2457                     if (regtry(&reginfo, &s))
2458                         goto got_it;
2459                     s++;
2460                 }
2461             }
2462         }
2463         DEBUG_EXECUTE_r(if (!did_match) {
2464             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2465                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2466             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2467                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2468                                ? "anchored" : "floating"),
2469                 quoted, RE_SV_TAIL(must));
2470         });
2471         goto phooey;
2472     }
2473     else if ( (c = progi->regstclass) ) {
2474         if (minlen) {
2475             const OPCODE op = OP(progi->regstclass);
2476             /* don't bother with what can't match */
2477             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2478                 strend = HOPc(strend, -(minlen - 1));
2479         }
2480         DEBUG_EXECUTE_r({
2481             SV * const prop = sv_newmortal();
2482             regprop(prog, prop, c);
2483             {
2484                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2485                     s,strend-s,60);
2486                 PerlIO_printf(Perl_debug_log,
2487                     "Matching stclass %.*s against %s (%d bytes)\n",
2488                     (int)SvCUR(prop), SvPVX_const(prop),
2489                      quoted, (int)(strend - s));
2490             }
2491         });
2492         if (find_byclass(prog, c, s, strend, &reginfo))
2493             goto got_it;
2494         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2495     }
2496     else {
2497         dontbother = 0;
2498         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2499             /* Trim the end. */
2500             char *last= NULL;
2501             SV* float_real;
2502             STRLEN len;
2503             const char *little;
2504
2505             if (utf8_target) {
2506                 if (! prog->float_utf8) {
2507                     to_utf8_substr(prog);
2508                 }
2509                 float_real = prog->float_utf8;
2510             }
2511             else {
2512                 if (! prog->float_substr) {
2513                     if (! to_byte_substr(prog)) {
2514                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2515                     }
2516                 }
2517                 float_real = prog->float_substr;
2518             }
2519
2520             little = SvPV_const(float_real, len);
2521             if (SvTAIL(float_real)) {
2522                     /* This means that float_real contains an artificial \n on
2523                      * the end due to the presence of something like this:
2524                      * /foo$/ where we can match both "foo" and "foo\n" at the
2525                      * end of the string.  So we have to compare the end of the
2526                      * string first against the float_real without the \n and
2527                      * then against the full float_real with the string.  We
2528                      * have to watch out for cases where the string might be
2529                      * smaller than the float_real or the float_real without
2530                      * the \n. */
2531                     char *checkpos= strend - len;
2532                     DEBUG_OPTIMISE_r(
2533                         PerlIO_printf(Perl_debug_log,
2534                             "%sChecking for float_real.%s\n",
2535                             PL_colors[4], PL_colors[5]));
2536                     if (checkpos + 1 < strbeg) {
2537                         /* can't match, even if we remove the trailing \n
2538                          * string is too short to match */
2539                         DEBUG_EXECUTE_r(
2540                             PerlIO_printf(Perl_debug_log,
2541                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2542                                 PL_colors[4], PL_colors[5]));
2543                         goto phooey;
2544                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2545                         /* can match, the end of the string matches without the
2546                          * "\n" */
2547                         last = checkpos + 1;
2548                     } else if (checkpos < strbeg) {
2549                         /* cant match, string is too short when the "\n" is
2550                          * included */
2551                         DEBUG_EXECUTE_r(
2552                             PerlIO_printf(Perl_debug_log,
2553                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2554                                 PL_colors[4], PL_colors[5]));
2555                         goto phooey;
2556                     } else if (!multiline) {
2557                         /* non multiline match, so compare with the "\n" at the
2558                          * end of the string */
2559                         if (memEQ(checkpos, little, len)) {
2560                             last= checkpos;
2561                         } else {
2562                             DEBUG_EXECUTE_r(
2563                                 PerlIO_printf(Perl_debug_log,
2564                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2565                                     PL_colors[4], PL_colors[5]));
2566                             goto phooey;
2567                         }
2568                     } else {
2569                         /* multiline match, so we have to search for a place
2570                          * where the full string is located */
2571                         goto find_last;
2572                     }
2573             } else {
2574                   find_last:
2575                     if (len)
2576                         last = rninstr(s, strend, little, little + len);
2577                     else
2578                         last = strend;  /* matching "$" */
2579             }
2580             if (!last) {
2581                 /* at one point this block contained a comment which was
2582                  * probably incorrect, which said that this was a "should not
2583                  * happen" case.  Even if it was true when it was written I am
2584                  * pretty sure it is not anymore, so I have removed the comment
2585                  * and replaced it with this one. Yves */
2586                 DEBUG_EXECUTE_r(
2587                     PerlIO_printf(Perl_debug_log,
2588                         "String does not contain required substring, cannot match.\n"
2589                     ));
2590                 goto phooey;
2591             }
2592             dontbother = strend - last + prog->float_min_offset;
2593         }
2594         if (minlen && (dontbother < minlen))
2595             dontbother = minlen - 1;
2596         strend -= dontbother;              /* this one's always in bytes! */
2597         /* We don't know much -- general case. */
2598         if (utf8_target) {
2599             for (;;) {
2600                 if (regtry(&reginfo, &s))
2601                     goto got_it;
2602                 if (s >= strend)
2603                     break;
2604                 s += UTF8SKIP(s);
2605             };
2606         }
2607         else {
2608             do {
2609                 if (regtry(&reginfo, &s))
2610                     goto got_it;
2611             } while (s++ < strend);
2612         }
2613     }
2614
2615     /* Failure. */
2616     goto phooey;
2617
2618 got_it:
2619     DEBUG_BUFFERS_r(
2620         if (swap)
2621             PerlIO_printf(Perl_debug_log,
2622                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2623                 PTR2UV(prog),
2624                 PTR2UV(swap)
2625             );
2626     );
2627     Safefree(swap);
2628     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2629
2630     if (PL_reg_state.re_state_eval_setup_done)
2631         restore_pos(aTHX_ prog);
2632     if (RXp_PAREN_NAMES(prog))
2633         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2634
2635     /* make sure $`, $&, $', and $digit will work later */
2636     if ( !(flags & REXEC_NOT_FIRST) ) {
2637         if (flags & REXEC_COPY_STR) {
2638 #ifdef PERL_ANY_COW
2639             if (SvCANCOW(sv)) {
2640                 if (DEBUG_C_TEST) {
2641                     PerlIO_printf(Perl_debug_log,
2642                                   "Copy on write: regexp capture, type %d\n",
2643                                   (int) SvTYPE(sv));
2644                 }
2645                 RX_MATCH_COPY_FREE(rx);
2646                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2647                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2648                 assert (SvPOKp(prog->saved_copy));
2649                 prog->sublen  = PL_regeol - strbeg;
2650                 prog->suboffset = 0;
2651                 prog->subcoffset = 0;
2652             } else
2653 #endif
2654             {
2655                 I32 min = 0;
2656                 I32 max = PL_regeol - strbeg;
2657                 I32 sublen;
2658
2659                 if (    (flags & REXEC_COPY_SKIP_POST)
2660                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2661                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2662                 ) { /* don't copy $' part of string */
2663                     U32 n = 0;
2664                     max = -1;
2665                     /* calculate the right-most part of the string covered
2666                      * by a capture. Due to look-ahead, this may be to
2667                      * the right of $&, so we have to scan all captures */
2668                     while (n <= prog->lastparen) {
2669                         if (prog->offs[n].end > max)
2670                             max = prog->offs[n].end;
2671                         n++;
2672                     }
2673                     if (max == -1)
2674                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2675                                 ? prog->offs[0].start
2676                                 : 0;
2677                     assert(max >= 0 && max <= PL_regeol - strbeg);
2678                 }
2679
2680                 if (    (flags & REXEC_COPY_SKIP_PRE)
2681                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2682                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2683                 ) { /* don't copy $` part of string */
2684                     U32 n = 0;
2685                     min = max;
2686                     /* calculate the left-most part of the string covered
2687                      * by a capture. Due to look-behind, this may be to
2688                      * the left of $&, so we have to scan all captures */
2689                     while (min && n <= prog->lastparen) {
2690                         if (   prog->offs[n].start != -1
2691                             && prog->offs[n].start < min)
2692                         {
2693                             min = prog->offs[n].start;
2694                         }
2695                         n++;
2696                     }
2697                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2698                         && min >  prog->offs[0].end
2699                     )
2700                         min = prog->offs[0].end;
2701
2702                 }
2703
2704                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2705                 sublen = max - min;
2706
2707                 if (RX_MATCH_COPIED(rx)) {
2708                     if (sublen > prog->sublen)
2709                         prog->subbeg =
2710                                 (char*)saferealloc(prog->subbeg, sublen+1);
2711                 }
2712                 else
2713                     prog->subbeg = (char*)safemalloc(sublen+1);
2714                 Copy(strbeg + min, prog->subbeg, sublen, char);
2715                 prog->subbeg[sublen] = '\0';
2716                 prog->suboffset = min;
2717                 prog->sublen = sublen;
2718                 RX_MATCH_COPIED_on(rx);
2719             }
2720             prog->subcoffset = prog->suboffset;
2721             if (prog->suboffset && utf8_target) {
2722                 /* Convert byte offset to chars.
2723                  * XXX ideally should only compute this if @-/@+
2724                  * has been seen, a la PL_sawampersand ??? */
2725
2726                 /* If there's a direct correspondence between the
2727                  * string which we're matching and the original SV,
2728                  * then we can use the utf8 len cache associated with
2729                  * the SV. In particular, it means that under //g,
2730                  * sv_pos_b2u() will use the previously cached
2731                  * position to speed up working out the new length of
2732                  * subcoffset, rather than counting from the start of
2733                  * the string each time. This stops
2734                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2735                  * from going quadratic */
2736                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2737                     sv_pos_b2u(sv, &(prog->subcoffset));
2738                 else
2739                     prog->subcoffset = utf8_length((U8*)strbeg,
2740                                         (U8*)(strbeg+prog->suboffset));
2741             }
2742         }
2743         else {
2744             RX_MATCH_COPY_FREE(rx);
2745             prog->subbeg = strbeg;
2746             prog->suboffset = 0;
2747             prog->subcoffset = 0;
2748             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2749         }
2750     }
2751
2752     return 1;
2753
2754 phooey:
2755     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2756                           PL_colors[4], PL_colors[5]));
2757     if (PL_reg_state.re_state_eval_setup_done)
2758         restore_pos(aTHX_ prog);
2759     if (swap) {
2760         /* we failed :-( roll it back */
2761         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2762             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2763             PTR2UV(prog),
2764             PTR2UV(prog->offs),
2765             PTR2UV(swap)
2766         ));
2767         Safefree(prog->offs);
2768         prog->offs = swap;
2769     }
2770     return 0;
2771 }
2772
2773
2774 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2775  * Do inc before dec, in case old and new rex are the same */
2776 #define SET_reg_curpm(Re2) \
2777     if (PL_reg_state.re_state_eval_setup_done) {    \
2778         (void)ReREFCNT_inc(Re2);                    \
2779         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2780         PM_SETRE((PL_reg_curpm), (Re2));            \
2781     }
2782
2783
2784 /*
2785  - regtry - try match at specific point
2786  */
2787 STATIC I32                      /* 0 failure, 1 success */
2788 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2789 {
2790     dVAR;
2791     CHECKPOINT lastcp;
2792     REGEXP *const rx = reginfo->prog;
2793     regexp *const prog = ReANY(rx);
2794     I32 result;
2795     RXi_GET_DECL(prog,progi);
2796     GET_RE_DEBUG_FLAGS_DECL;
2797
2798     PERL_ARGS_ASSERT_REGTRY;
2799
2800     reginfo->cutpoint=NULL;
2801
2802     if ((prog->extflags & RXf_EVAL_SEEN)
2803         && !PL_reg_state.re_state_eval_setup_done)
2804     {
2805         MAGIC *mg;
2806
2807         PL_reg_state.re_state_eval_setup_done = TRUE;
2808         if (reginfo->sv) {
2809             /* Make $_ available to executed code. */
2810             if (reginfo->sv != DEFSV) {
2811                 SAVE_DEFSV;
2812                 DEFSV_set(reginfo->sv);
2813             }
2814
2815             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2816                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2817                 /* prepare for quick setting of pos */
2818 #ifdef PERL_OLD_COPY_ON_WRITE
2819                 if (SvIsCOW(reginfo->sv))
2820                     sv_force_normal_flags(reginfo->sv, 0);
2821 #endif
2822                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2823                                  &PL_vtbl_mglob, NULL, 0);
2824                 mg->mg_len = -1;
2825             }
2826             PL_reg_magic    = mg;
2827             PL_reg_oldpos   = mg->mg_len;
2828             SAVEDESTRUCTOR_X(restore_pos, prog);
2829         }
2830         if (!PL_reg_curpm) {
2831             Newxz(PL_reg_curpm, 1, PMOP);
2832 #ifdef USE_ITHREADS
2833             {
2834                 SV* const repointer = &PL_sv_undef;
2835                 /* this regexp is also owned by the new PL_reg_curpm, which
2836                    will try to free it.  */
2837                 av_push(PL_regex_padav, repointer);
2838                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2839                 PL_regex_pad = AvARRAY(PL_regex_padav);
2840             }
2841 #endif
2842         }
2843         SET_reg_curpm(rx);
2844         PL_reg_oldcurpm = PL_curpm;
2845         PL_curpm = PL_reg_curpm;
2846         if (RXp_MATCH_COPIED(prog)) {
2847             /*  Here is a serious problem: we cannot rewrite subbeg,
2848                 since it may be needed if this match fails.  Thus
2849                 $` inside (?{}) could fail... */
2850             PL_reg_oldsaved = prog->subbeg;
2851             PL_reg_oldsavedlen = prog->sublen;
2852             PL_reg_oldsavedoffset = prog->suboffset;
2853             PL_reg_oldsavedcoffset = prog->suboffset;
2854 #ifdef PERL_ANY_COW
2855             PL_nrs = prog->saved_copy;
2856 #endif
2857             RXp_MATCH_COPIED_off(prog);
2858         }
2859         else
2860             PL_reg_oldsaved = NULL;
2861         prog->subbeg = PL_bostr;
2862         prog->suboffset = 0;
2863         prog->subcoffset = 0;
2864         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2865     }
2866 #ifdef DEBUGGING
2867     PL_reg_starttry = *startposp;
2868 #endif
2869     prog->offs[0].start = *startposp - PL_bostr;
2870     prog->lastparen = 0;
2871     prog->lastcloseparen = 0;
2872     PL_regsize = 0;
2873
2874     /* XXXX What this code is doing here?!!!  There should be no need
2875        to do this again and again, prog->lastparen should take care of
2876        this!  --ilya*/
2877
2878     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2879      * Actually, the code in regcppop() (which Ilya may be meaning by
2880      * prog->lastparen), is not needed at all by the test suite
2881      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2882      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2883      * Meanwhile, this code *is* needed for the
2884      * above-mentioned test suite tests to succeed.  The common theme
2885      * on those tests seems to be returning null fields from matches.
2886      * --jhi updated by dapm */
2887 #if 1
2888     if (prog->nparens) {
2889         regexp_paren_pair *pp = prog->offs;
2890         I32 i;
2891         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2892             ++pp;
2893             pp->start = -1;
2894             pp->end = -1;
2895         }
2896     }
2897 #endif
2898     REGCP_SET(lastcp);
2899     result = regmatch(reginfo, *startposp, progi->program + 1);
2900     if (result != -1) {
2901         prog->offs[0].end = result;
2902         return 1;
2903     }
2904     if (reginfo->cutpoint)
2905         *startposp= reginfo->cutpoint;
2906     REGCP_UNWIND(lastcp);
2907     return 0;
2908 }
2909
2910
2911 #define sayYES goto yes
2912 #define sayNO goto no
2913 #define sayNO_SILENT goto no_silent
2914
2915 /* we dont use STMT_START/END here because it leads to
2916    "unreachable code" warnings, which are bogus, but distracting. */
2917 #define CACHEsayNO \
2918     if (ST.cache_mask) \
2919        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2920     sayNO
2921
2922 /* this is used to determine how far from the left messages like
2923    'failed...' are printed. It should be set such that messages
2924    are inline with the regop output that created them.
2925 */
2926 #define REPORT_CODE_OFF 32
2927
2928
2929 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2930 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2931 #define CHRTEST_NOT_A_CP_1 -999
2932 #define CHRTEST_NOT_A_CP_2 -998
2933
2934 #define SLAB_FIRST(s) (&(s)->states[0])
2935 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2936
2937 /* grab a new slab and return the first slot in it */
2938
2939 STATIC regmatch_state *
2940 S_push_slab(pTHX)
2941 {
2942 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2943     dMY_CXT;
2944 #endif
2945     regmatch_slab *s = PL_regmatch_slab->next;
2946     if (!s) {
2947         Newx(s, 1, regmatch_slab);
2948         s->prev = PL_regmatch_slab;
2949         s->next = NULL;
2950         PL_regmatch_slab->next = s;
2951     }
2952     PL_regmatch_slab = s;
2953     return SLAB_FIRST(s);
2954 }
2955
2956
2957 /* push a new state then goto it */
2958
2959 #define PUSH_STATE_GOTO(state, node, input) \
2960     pushinput = input; \
2961     scan = node; \
2962     st->resume_state = state; \
2963     goto push_state;
2964
2965 /* push a new state with success backtracking, then goto it */
2966
2967 #define PUSH_YES_STATE_GOTO(state, node, input) \
2968     pushinput = input; \
2969     scan = node; \
2970     st->resume_state = state; \
2971     goto push_yes_state;
2972
2973
2974
2975
2976 /*
2977
2978 regmatch() - main matching routine
2979
2980 This is basically one big switch statement in a loop. We execute an op,
2981 set 'next' to point the next op, and continue. If we come to a point which
2982 we may need to backtrack to on failure such as (A|B|C), we push a
2983 backtrack state onto the backtrack stack. On failure, we pop the top
2984 state, and re-enter the loop at the state indicated. If there are no more
2985 states to pop, we return failure.
2986
2987 Sometimes we also need to backtrack on success; for example /A+/, where
2988 after successfully matching one A, we need to go back and try to
2989 match another one; similarly for lookahead assertions: if the assertion
2990 completes successfully, we backtrack to the state just before the assertion
2991 and then carry on.  In these cases, the pushed state is marked as
2992 'backtrack on success too'. This marking is in fact done by a chain of
2993 pointers, each pointing to the previous 'yes' state. On success, we pop to
2994 the nearest yes state, discarding any intermediate failure-only states.
2995 Sometimes a yes state is pushed just to force some cleanup code to be
2996 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2997 it to free the inner regex.
2998
2999 Note that failure backtracking rewinds the cursor position, while
3000 success backtracking leaves it alone.
3001
3002 A pattern is complete when the END op is executed, while a subpattern
3003 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3004 ops trigger the "pop to last yes state if any, otherwise return true"
3005 behaviour.
3006
3007 A common convention in this function is to use A and B to refer to the two
3008 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3009 the subpattern to be matched possibly multiple times, while B is the entire
3010 rest of the pattern. Variable and state names reflect this convention.
3011
3012 The states in the main switch are the union of ops and failure/success of
3013 substates associated with with that op.  For example, IFMATCH is the op
3014 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3015 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3016 successfully matched A and IFMATCH_A_fail is a state saying that we have
3017 just failed to match A. Resume states always come in pairs. The backtrack
3018 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3019 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3020 on success or failure.
3021
3022 The struct that holds a backtracking state is actually a big union, with
3023 one variant for each major type of op. The variable st points to the
3024 top-most backtrack struct. To make the code clearer, within each
3025 block of code we #define ST to alias the relevant union.
3026
3027 Here's a concrete example of a (vastly oversimplified) IFMATCH
3028 implementation:
3029
3030     switch (state) {
3031     ....
3032
3033 #define ST st->u.ifmatch
3034
3035     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3036         ST.foo = ...; // some state we wish to save
3037         ...
3038         // push a yes backtrack state with a resume value of
3039         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3040         // first node of A:
3041         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3042         // NOTREACHED
3043
3044     case IFMATCH_A: // we have successfully executed A; now continue with B
3045         next = B;
3046         bar = ST.foo; // do something with the preserved value
3047         break;
3048
3049     case IFMATCH_A_fail: // A failed, so the assertion failed
3050         ...;   // do some housekeeping, then ...
3051         sayNO; // propagate the failure
3052
3053 #undef ST
3054
3055     ...
3056     }
3057
3058 For any old-timers reading this who are familiar with the old recursive
3059 approach, the code above is equivalent to:
3060
3061     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3062     {
3063         int foo = ...
3064         ...
3065         if (regmatch(A)) {
3066             next = B;
3067             bar = foo;
3068             break;
3069         }
3070         ...;   // do some housekeeping, then ...
3071         sayNO; // propagate the failure
3072     }
3073
3074 The topmost backtrack state, pointed to by st, is usually free. If you
3075 want to claim it, populate any ST.foo fields in it with values you wish to
3076 save, then do one of
3077
3078         PUSH_STATE_GOTO(resume_state, node, newinput);
3079         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3080
3081 which sets that backtrack state's resume value to 'resume_state', pushes a
3082 new free entry to the top of the backtrack stack, then goes to 'node'.
3083 On backtracking, the free slot is popped, and the saved state becomes the
3084 new free state. An ST.foo field in this new top state can be temporarily
3085 accessed to retrieve values, but once the main loop is re-entered, it
3086 becomes available for reuse.
3087
3088 Note that the depth of the backtrack stack constantly increases during the
3089 left-to-right execution of the pattern, rather than going up and down with
3090 the pattern nesting. For example the stack is at its maximum at Z at the
3091 end of the pattern, rather than at X in the following:
3092
3093     /(((X)+)+)+....(Y)+....Z/
3094
3095 The only exceptions to this are lookahead/behind assertions and the cut,
3096 (?>A), which pop all the backtrack states associated with A before
3097 continuing.
3098
3099 Backtrack state structs are allocated in slabs of about 4K in size.
3100 PL_regmatch_state and st always point to the currently active state,
3101 and PL_regmatch_slab points to the slab currently containing
3102 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3103 allocated, and is never freed until interpreter destruction. When the slab
3104 is full, a new one is allocated and chained to the end. At exit from
3105 regmatch(), slabs allocated since entry are freed.
3106
3107 */
3108
3109
3110 #define DEBUG_STATE_pp(pp)                                  \
3111     DEBUG_STATE_r({                                         \
3112         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3113         PerlIO_printf(Perl_debug_log,                       \
3114             "    %*s"pp" %s%s%s%s%s\n",                     \
3115             depth*2, "",                                    \
3116             PL_reg_name[st->resume_state],                     \
3117             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3118             ((st==yes_state) ? "Y" : ""),                   \
3119             ((st==mark_state) ? "M" : ""),                  \
3120             ((st==yes_state||st==mark_state) ? "]" : "")    \
3121         );                                                  \
3122     });
3123
3124
3125 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3126
3127 #ifdef DEBUGGING
3128
3129 STATIC void
3130 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3131     const char *start, const char *end, const char *blurb)
3132 {
3133     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3134
3135     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3136
3137     if (!PL_colorset)
3138             reginitcolors();
3139     {
3140         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3141             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3142
3143         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3144             start, end - start, 60);
3145
3146         PerlIO_printf(Perl_debug_log,
3147             "%s%s REx%s %s against %s\n",
3148                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3149
3150         if (utf8_target||utf8_pat)
3151             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3152                 utf8_pat ? "pattern" : "",
3153                 utf8_pat && utf8_target ? " and " : "",
3154                 utf8_target ? "string" : ""
3155             );
3156     }
3157 }
3158
3159 STATIC void
3160 S_dump_exec_pos(pTHX_ const char *locinput,
3161                       const regnode *scan,
3162                       const char *loc_regeol,
3163                       const char *loc_bostr,
3164                       const char *loc_reg_starttry,
3165                       const bool utf8_target)
3166 {
3167     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3168     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3169     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3170     /* The part of the string before starttry has one color
3171        (pref0_len chars), between starttry and current
3172        position another one (pref_len - pref0_len chars),
3173        after the current position the third one.
3174        We assume that pref0_len <= pref_len, otherwise we
3175        decrease pref0_len.  */
3176     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3177         ? (5 + taill) - l : locinput - loc_bostr;
3178     int pref0_len;
3179
3180     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3181
3182     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3183         pref_len++;
3184     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3185     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3186         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3187               ? (5 + taill) - pref_len : loc_regeol - locinput);
3188     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3189         l--;
3190     if (pref0_len < 0)
3191         pref0_len = 0;
3192     if (pref0_len > pref_len)
3193         pref0_len = pref_len;
3194     {
3195         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3196
3197         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3198             (locinput - pref_len),pref0_len, 60, 4, 5);
3199
3200         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3201                     (locinput - pref_len + pref0_len),
3202                     pref_len - pref0_len, 60, 2, 3);
3203
3204         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3205                     locinput, loc_regeol - locinput, 10, 0, 1);
3206
3207         const STRLEN tlen=len0+len1+len2;
3208         PerlIO_printf(Perl_debug_log,
3209                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3210                     (IV)(locinput - loc_bostr),
3211                     len0, s0,
3212                     len1, s1,
3213                     (docolor ? "" : "> <"),
3214                     len2, s2,
3215                     (int)(tlen > 19 ? 0 :  19 - tlen),
3216                     "");
3217     }
3218 }
3219
3220 #endif
3221
3222 /* reg_check_named_buff_matched()
3223  * Checks to see if a named buffer has matched. The data array of
3224  * buffer numbers corresponding to the buffer is expected to reside
3225  * in the regexp->data->data array in the slot stored in the ARG() of
3226  * node involved. Note that this routine doesn't actually care about the
3227  * name, that information is not preserved from compilation to execution.
3228  * Returns the index of the leftmost defined buffer with the given name
3229  * or 0 if non of the buffers matched.
3230  */
3231 STATIC I32
3232 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3233 {
3234     I32 n;
3235     RXi_GET_DECL(rex,rexi);
3236     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3237     I32 *nums=(I32*)SvPVX(sv_dat);
3238
3239     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3240
3241     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3242         if ((I32)rex->lastparen >= nums[n] &&
3243             rex->offs[nums[n]].end != -1)
3244         {
3245             return nums[n];
3246         }
3247     }
3248     return 0;
3249 }
3250
3251
3252 /* free all slabs above current one  - called during LEAVE_SCOPE */
3253
3254 STATIC void
3255 S_clear_backtrack_stack(pTHX_ void *p)
3256 {
3257     regmatch_slab *s = PL_regmatch_slab->next;
3258     PERL_UNUSED_ARG(p);
3259
3260     if (!s)
3261         return;
3262     PL_regmatch_slab->next = NULL;
3263     while (s) {
3264         regmatch_slab * const osl = s;
3265         s = s->next;
3266         Safefree(osl);
3267     }
3268 }
3269 static bool
3270 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3271 {
3272     /* This function determines if there are one or two characters that match
3273      * the first character of the passed-in EXACTish node <text_node>, and if
3274      * so, returns them in the passed-in pointers.
3275      *
3276      * If it determines that no possible character in the target string can
3277      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3278      * the first character in <text_node> requires UTF-8 to represent, and the
3279      * target string isn't in UTF-8.)
3280      *
3281      * If there are more than two characters that could match the beginning of
3282      * <text_node>, or if more context is required to determine a match or not,
3283      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3284      *
3285      * The motiviation behind this function is to allow the caller to set up
3286      * tight loops for matching.  If <text_node> is of type EXACT, there is
3287      * only one possible character that can match its first character, and so
3288      * the situation is quite simple.  But things get much more complicated if
3289      * folding is involved.  It may be that the first character of an EXACTFish
3290      * node doesn't participate in any possible fold, e.g., punctuation, so it
3291      * can be matched only by itself.  The vast majority of characters that are
3292      * in folds match just two things, their lower and upper-case equivalents.
3293      * But not all are like that; some have multiple possible matches, or match
3294      * sequences of more than one character.  This function sorts all that out.
3295      *
3296      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3297      * loop of trying to match A*, we know we can't exit where the thing
3298      * following it isn't a B.  And something can't be a B unless it is the
3299      * beginning of B.  By putting a quick test for that beginning in a tight
3300      * loop, we can rule out things that can't possibly be B without having to
3301      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3302      * character, we can make a tight loop matching A*, using the outputs of
3303      * this function.
3304      *
3305      * If the target string to match isn't in UTF-8, and there aren't
3306      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3307      * the one or two possible octets (which are characters in this situation)
3308      * that can match.  In all cases, if there is only one character that can
3309      * match, *<c1p> and *<c2p> will be identical.
3310      *
3311      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3312      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3313      * can match the beginning of <text_node>.  They should be declared with at
3314      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3315      * undefined what these contain.)  If one or both of the buffers are
3316      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3317      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3318      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3319      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3320      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3321
3322     const bool utf8_target = PL_reg_match_utf8;
3323
3324     UV c1 = CHRTEST_NOT_A_CP_1;
3325     UV c2 = CHRTEST_NOT_A_CP_2;
3326     bool use_chrtest_void = FALSE;
3327
3328     /* Used when we have both utf8 input and utf8 output, to avoid converting
3329      * to/from code points */
3330     bool utf8_has_been_setup = FALSE;
3331
3332     dVAR;
3333
3334     U8 *pat = (U8*)STRING(text_node);
3335
3336     if (OP(text_node) == EXACT) {
3337
3338         /* In an exact node, only one thing can be matched, that first
3339          * character.  If both the pat and the target are UTF-8, we can just
3340          * copy the input to the output, avoiding finding the code point of
3341          * that character */
3342         if (! UTF_PATTERN) {
3343             c2 = c1 = *pat;
3344         }
3345         else if (utf8_target) {
3346             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3347             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3348             utf8_has_been_setup = TRUE;
3349         }
3350         else {
3351             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3352         }
3353     }
3354     else /* an EXACTFish node */
3355          if ((UTF_PATTERN
3356                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3357                                                     pat + STR_LEN(text_node)))
3358              || (! UTF_PATTERN
3359                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3360                                                     pat + STR_LEN(text_node))))
3361     {
3362         /* Multi-character folds require more context to sort out.  Also
3363          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3364          * handled outside this routine */
3365         use_chrtest_void = TRUE;
3366     }
3367     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3368         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3369         if (c1 > 256) {
3370             /* Load the folds hash, if not already done */
3371             SV** listp;
3372             if (! PL_utf8_foldclosures) {
3373                 if (! PL_utf8_tofold) {
3374                     U8 dummy[UTF8_MAXBYTES+1];
3375
3376                     /* Force loading this by folding an above-Latin1 char */
3377                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3378                     assert(PL_utf8_tofold); /* Verify that worked */
3379                 }
3380                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3381             }
3382
3383             /* The fold closures data structure is a hash with the keys being
3384              * the UTF-8 of every character that is folded to, like 'k', and
3385              * the values each an array of all code points that fold to its
3386              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3387              * not included */
3388             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3389                                      (char *) pat,
3390                                      UTF8SKIP(pat),
3391                                      FALSE))))
3392             {
3393                 /* Not found in the hash, therefore there are no folds
3394                  * containing it, so there is only a single character that
3395                  * could match */
3396                 c2 = c1;
3397             }
3398             else {  /* Does participate in folds */
3399                 AV* list = (AV*) *listp;
3400                 if (av_len(list) != 1) {
3401
3402                     /* If there aren't exactly two folds to this, it is outside
3403                      * the scope of this function */
3404                     use_chrtest_void = TRUE;
3405                 }
3406                 else {  /* There are two.  Get them */
3407                     SV** c_p = av_fetch(list, 0, FALSE);
3408                     if (c_p == NULL) {
3409                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3410                     }
3411                     c1 = SvUV(*c_p);
3412
3413                     c_p = av_fetch(list, 1, FALSE);
3414                     if (c_p == NULL) {
3415                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3416                     }
3417                     c2 = SvUV(*c_p);
3418
3419                     /* Folds that cross the 255/256 boundary are forbidden if
3420                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3421                      * pattern character is above 256, and its only other match
3422                      * is below 256, the only legal match will be to itself.
3423                      * We have thrown away the original, so have to compute
3424                      * which is the one above 255 */
3425                     if ((c1 < 256) != (c2 < 256)) {
3426                         if (OP(text_node) == EXACTFL
3427                             || (OP(text_node) == EXACTFA
3428                                 && (isASCII(c1) || isASCII(c2))))
3429                         {
3430                             if (c1 < 256) {
3431                                 c1 = c2;
3432                             }
3433                             else {
3434                                 c2 = c1;
3435                             }
3436                         }
3437                     }
3438                 }
3439             }
3440         }
3441         else /* Here, c1 is < 255 */
3442              if (utf8_target
3443                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3444                  && OP(text_node) != EXACTFL
3445                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3446         {
3447             /* Here, there could be something above Latin1 in the target which
3448              * folds to this character in the pattern.  All such cases except
3449              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3450              * involved in their folds, so are outside the scope of this
3451              * function */
3452             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3453                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3454             }
3455             else {
3456                 use_chrtest_void = TRUE;
3457             }
3458         }
3459         else { /* Here nothing above Latin1 can fold to the pattern character */
3460             switch (OP(text_node)) {
3461
3462                 case EXACTFL:   /* /l rules */
3463                     c2 = PL_fold_locale[c1];
3464                     break;
3465
3466                 case EXACTF:
3467                     if (! utf8_target) {    /* /d rules */
3468                         c2 = PL_fold[c1];
3469                         break;
3470                     }
3471                     /* FALLTHROUGH */
3472                     /* /u rules for all these.  This happens to work for
3473                      * EXACTFA as nothing in Latin1 folds to ASCII */
3474                 case EXACTFA:
3475                 case EXACTFU_TRICKYFOLD:
3476                 case EXACTFU_SS:
3477                 case EXACTFU:
3478                     c2 = PL_fold_latin1[c1];
3479                     break;
3480
3481                 default:
3482                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3483                     assert(0); /* NOTREACHED */
3484             }
3485         }
3486     }
3487
3488     /* Here have figured things out.  Set up the returns */
3489     if (use_chrtest_void) {
3490         *c2p = *c1p = CHRTEST_VOID;
3491     }
3492     else if (utf8_target) {
3493         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3494             uvchr_to_utf8(c1_utf8, c1);
3495             uvchr_to_utf8(c2_utf8, c2);
3496         }
3497
3498         /* Invariants are stored in both the utf8 and byte outputs; Use
3499          * negative numbers otherwise for the byte ones.  Make sure that the
3500          * byte ones are the same iff the utf8 ones are the same */
3501         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3502         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3503                 ? *c2_utf8
3504                 : (c1 == c2)
3505                   ? CHRTEST_NOT_A_CP_1
3506                   : CHRTEST_NOT_A_CP_2;
3507     }
3508     else if (c1 > 255) {
3509        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3510                            can represent */
3511            return FALSE;
3512        }
3513
3514        *c1p = *c2p = c2;    /* c2 is the only representable value */
3515     }
3516     else {  /* c1 is representable; see about c2 */
3517        *c1p = c1;
3518        *c2p = (c2 < 256) ? c2 : c1;
3519     }
3520
3521     return TRUE;
3522 }
3523
3524 /* returns -1 on failure, $+[0] on success */
3525 STATIC I32
3526 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3527 {
3528 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3529     dMY_CXT;
3530 #endif
3531     dVAR;
3532     const bool utf8_target = PL_reg_match_utf8;
3533     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3534     REGEXP *rex_sv = reginfo->prog;
3535     regexp *rex = ReANY(rex_sv);
3536     RXi_GET_DECL(rex,rexi);
3537     I32 oldsave;
3538     /* the current state. This is a cached copy of PL_regmatch_state */
3539     regmatch_state *st;
3540     /* cache heavy used fields of st in registers */
3541     regnode *scan;
3542     regnode *next;
3543     U32 n = 0;  /* general value; init to avoid compiler warning */
3544     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3545     char *locinput = startpos;
3546     char *pushinput; /* where to continue after a PUSH */
3547     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3548
3549     bool result = 0;        /* return value of S_regmatch */
3550     int depth = 0;          /* depth of backtrack stack */
3551     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3552     const U32 max_nochange_depth =
3553         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3554         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3555     regmatch_state *yes_state = NULL; /* state to pop to on success of
3556                                                             subpattern */
3557     /* mark_state piggy backs on the yes_state logic so that when we unwind
3558        the stack on success we can update the mark_state as we go */
3559     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3560     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3561     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3562     U32 state_num;
3563     bool no_final = 0;      /* prevent failure from backtracking? */
3564     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3565     char *startpoint = locinput;
3566     SV *popmark = NULL;     /* are we looking for a mark? */
3567     SV *sv_commit = NULL;   /* last mark name seen in failure */
3568     SV *sv_yes_mark = NULL; /* last mark name we have seen
3569                                during a successful match */
3570     U32 lastopen = 0;       /* last open we saw */
3571     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3572     SV* const oreplsv = GvSV(PL_replgv);
3573     /* these three flags are set by various ops to signal information to
3574      * the very next op. They have a useful lifetime of exactly one loop
3575      * iteration, and are not preserved or restored by state pushes/pops
3576      */
3577     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3578     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3579     int logical = 0;        /* the following EVAL is:
3580                                 0: (?{...})
3581                                 1: (?(?{...})X|Y)
3582                                 2: (??{...})
3583                                or the following IFMATCH/UNLESSM is:
3584                                 false: plain (?=foo)
3585                                 true:  used as a condition: (?(?=foo))
3586                             */
3587     PAD* last_pad = NULL;
3588     dMULTICALL;
3589     I32 gimme = G_SCALAR;
3590     CV *caller_cv = NULL;       /* who called us */
3591     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3592     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3593
3594 #ifdef DEBUGGING
3595     GET_RE_DEBUG_FLAGS_DECL;
3596 #endif
3597
3598     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3599     multicall_oldcatch = 0;
3600     multicall_cv = NULL;
3601     cx = NULL;
3602     PERL_UNUSED_VAR(multicall_cop);
3603     PERL_UNUSED_VAR(newsp);
3604
3605
3606     PERL_ARGS_ASSERT_REGMATCH;
3607
3608     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3609             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3610     }));
3611     /* on first ever call to regmatch, allocate first slab */
3612     if (!PL_regmatch_slab) {
3613         Newx(PL_regmatch_slab, 1, regmatch_slab);
3614         PL_regmatch_slab->prev = NULL;
3615         PL_regmatch_slab->next = NULL;
3616         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3617     }
3618
3619     oldsave = PL_savestack_ix;
3620     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3621     SAVEVPTR(PL_regmatch_slab);
3622     SAVEVPTR(PL_regmatch_state);
3623
3624     /* grab next free state slot */
3625     st = ++PL_regmatch_state;
3626     if (st >  SLAB_LAST(PL_regmatch_slab))
3627         st = PL_regmatch_state = S_push_slab(aTHX);
3628
3629     /* Note that nextchr is a byte even in UTF */
3630     SET_nextchr;
3631     scan = prog;
3632     while (scan != NULL) {
3633
3634         DEBUG_EXECUTE_r( {
3635             SV * const prop = sv_newmortal();
3636             regnode *rnext=regnext(scan);
3637             DUMP_EXEC_POS( locinput, scan, utf8_target );
3638             regprop(rex, prop, scan);
3639
3640             PerlIO_printf(Perl_debug_log,
3641                     "%3"IVdf":%*s%s(%"IVdf")\n",
3642                     (IV)(scan - rexi->program), depth*2, "",
3643                     SvPVX_const(prop),
3644                     (PL_regkind[OP(scan)] == END || !rnext) ?
3645                         0 : (IV)(rnext - rexi->program));
3646         });
3647
3648         next = scan + NEXT_OFF(scan);
3649         if (next == scan)
3650             next = NULL;
3651         state_num = OP(scan);
3652
3653       reenter_switch:
3654
3655         SET_nextchr;
3656         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3657
3658         switch (state_num) {
3659         case BOL: /*  /^../  */
3660             if (locinput == PL_bostr)
3661             {
3662                 /* reginfo->till = reginfo->bol; */
3663                 break;
3664             }
3665             sayNO;
3666
3667         case MBOL: /*  /^../m  */
3668             if (locinput == PL_bostr ||
3669                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3670             {
3671                 break;
3672             }
3673             sayNO;
3674
3675         case SBOL: /*  /^../s  */
3676             if (locinput == PL_bostr)
3677                 break;
3678             sayNO;
3679
3680         case GPOS: /*  \G  */
3681             if (locinput == reginfo->ganch)
3682                 break;
3683             sayNO;
3684
3685         case KEEPS: /*   \K  */
3686             /* update the startpoint */
3687             st->u.keeper.val = rex->offs[0].start;
3688             rex->offs[0].start = locinput - PL_bostr;
3689             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3690             assert(0); /*NOTREACHED*/
3691         case KEEPS_next_fail:
3692             /* rollback the start point change */
3693             rex->offs[0].start = st->u.keeper.val;
3694             sayNO_SILENT;
3695             assert(0); /*NOTREACHED*/
3696
3697         case EOL: /* /..$/  */
3698                 goto seol;
3699
3700         case MEOL: /* /..$/m  */
3701             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3702                 sayNO;
3703             break;
3704
3705         case SEOL: /* /..$/s  */
3706           seol:
3707             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3708                 sayNO;
3709             if (PL_regeol - locinput > 1)
3710                 sayNO;
3711             break;
3712
3713         case EOS: /*  \z  */
3714             if (!NEXTCHR_IS_EOS)
3715                 sayNO;
3716             break;
3717
3718         case SANY: /*  /./s  */
3719             if (NEXTCHR_IS_EOS)
3720                 sayNO;
3721             goto increment_locinput;
3722
3723         case CANY: /*  \C  */
3724             if (NEXTCHR_IS_EOS)
3725                 sayNO;
3726             locinput++;
3727             break;
3728
3729         case REG_ANY: /*  /./  */
3730             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3731                 sayNO;
3732             goto increment_locinput;
3733
3734
3735 #undef  ST
3736 #define ST st->u.trie
3737         case TRIEC: /* (ab|cd) with known charclass */
3738             /* In this case the charclass data is available inline so
3739                we can fail fast without a lot of extra overhead.
3740              */
3741             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3742                 DEBUG_EXECUTE_r(
3743                     PerlIO_printf(Perl_debug_log,
3744                               "%*s  %sfailed to match trie start class...%s\n",
3745                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3746                 );
3747                 sayNO_SILENT;
3748                 assert(0); /* NOTREACHED */
3749             }
3750             /* FALL THROUGH */
3751         case TRIE:  /* (ab|cd)  */
3752             /* the basic plan of execution of the trie is:
3753              * At the beginning, run though all the states, and
3754              * find the longest-matching word. Also remember the position
3755              * of the shortest matching word. For example, this pattern:
3756              *    1  2 3 4    5
3757              *    ab|a|x|abcd|abc
3758              * when matched against the string "abcde", will generate
3759              * accept states for all words except 3, with the longest
3760              * matching word being 4, and the shortest being 2 (with
3761              * the position being after char 1 of the string).
3762              *
3763              * Then for each matching word, in word order (i.e. 1,2,4,5),
3764              * we run the remainder of the pattern; on each try setting
3765              * the current position to the character following the word,
3766              * returning to try the next word on failure.
3767              *
3768              * We avoid having to build a list of words at runtime by
3769              * using a compile-time structure, wordinfo[].prev, which
3770              * gives, for each word, the previous accepting word (if any).
3771              * In the case above it would contain the mappings 1->2, 2->0,
3772              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3773              * the longest word (4 above), a list of all words, by
3774              * following the list of prev pointers; this gives us the
3775              * unordered list 4,5,1,2. Then given the current word we have
3776              * just tried, we can go through the list and find the
3777              * next-biggest word to try (so if we just failed on word 2,
3778              * the next in the list is 4).
3779              *
3780              * Since at runtime we don't record the matching position in
3781              * the string for each word, we have to work that out for
3782              * each word we're about to process. The wordinfo table holds
3783              * the character length of each word; given that we recorded
3784              * at the start: the position of the shortest word and its
3785              * length in chars, we just need to move the pointer the
3786              * difference between the two char lengths. Depending on
3787              * Unicode status and folding, that's cheap or expensive.
3788              *
3789              * This algorithm is optimised for the case where are only a
3790              * small number of accept states, i.e. 0,1, or maybe 2.
3791              * With lots of accepts states, and having to try all of them,
3792              * it becomes quadratic on number of accept states to find all
3793              * the next words.
3794              */
3795
3796             {
3797                 /* what type of TRIE am I? (utf8 makes this contextual) */
3798                 DECL_TRIE_TYPE(scan);
3799
3800                 /* what trie are we using right now */
3801                 reg_trie_data * const trie
3802                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3803                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3804                 U32 state = trie->startstate;
3805
3806                 if (   trie->bitmap
3807                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3808                 {
3809                     if (trie->states[ state ].wordnum) {
3810                          DEBUG_EXECUTE_r(
3811                             PerlIO_printf(Perl_debug_log,
3812                                           "%*s  %smatched empty string...%s\n",
3813                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3814                         );
3815                         if (!trie->jump)
3816                             break;
3817                     } else {
3818                         DEBUG_EXECUTE_r(
3819                             PerlIO_printf(Perl_debug_log,
3820                                           "%*s  %sfailed to match trie start class...%s\n",
3821                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3822                         );
3823                         sayNO_SILENT;
3824                    }
3825                 }
3826
3827             {
3828                 U8 *uc = ( U8* )locinput;
3829
3830                 STRLEN len = 0;
3831                 STRLEN foldlen = 0;
3832                 U8 *uscan = (U8*)NULL;
3833                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3834                 U32 charcount = 0; /* how many input chars we have matched */
3835                 U32 accepted = 0; /* have we seen any accepting states? */
3836
3837                 ST.jump = trie->jump;
3838                 ST.me = scan;
3839                 ST.firstpos = NULL;
3840                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3841                 ST.nextword = 0;
3842
3843                 /* fully traverse the TRIE; note the position of the
3844                    shortest accept state and the wordnum of the longest
3845                    accept state */
3846
3847                 while ( state && uc <= (U8*)PL_regeol ) {
3848                     U32 base = trie->states[ state ].trans.base;
3849                     UV uvc = 0;
3850                     U16 charid = 0;
3851                     U16 wordnum;
3852                     wordnum = trie->states[ state ].wordnum;
3853
3854                     if (wordnum) { /* it's an accept state */
3855                         if (!accepted) {
3856                             accepted = 1;
3857                             /* record first match position */
3858                             if (ST.longfold) {
3859                                 ST.firstpos = (U8*)locinput;
3860                                 ST.firstchars = 0;
3861                             }
3862                             else {
3863                                 ST.firstpos = uc;
3864                                 ST.firstchars = charcount;
3865                             }
3866                         }
3867                         if (!ST.nextword || wordnum < ST.nextword)
3868                             ST.nextword = wordnum;
3869                         ST.topword = wordnum;
3870                     }
3871
3872                     DEBUG_TRIE_EXECUTE_r({
3873                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3874                                 PerlIO_printf( Perl_debug_log,
3875                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3876                                     2+depth * 2, "", PL_colors[4],
3877                                     (UV)state, (accepted ? 'Y' : 'N'));
3878                     });
3879
3880                     /* read a char and goto next state */
3881                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3882                         I32 offset;
3883                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3884                                              uscan, len, uvc, charid, foldlen,
3885                                              foldbuf, uniflags);
3886                         charcount++;
3887                         if (foldlen>0)
3888                             ST.longfold = TRUE;
3889                         if (charid &&
3890                              ( ((offset =
3891                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3892
3893                              && ((U32)offset < trie->lasttrans)
3894                              && trie->trans[offset].check == state)
3895                         {
3896                             state = trie->trans[offset].next;
3897                         }
3898                         else {
3899                             state = 0;
3900                         }
3901                         uc += len;
3902
3903                     }
3904                     else {
3905                         state = 0;
3906                     }
3907                     DEBUG_TRIE_EXECUTE_r(
3908                         PerlIO_printf( Perl_debug_log,
3909                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3910                             charid, uvc, (UV)state, PL_colors[5] );
3911                     );
3912                 }
3913                 if (!accepted)
3914                    sayNO;
3915
3916                 /* calculate total number of accept states */
3917                 {
3918                     U16 w = ST.topword;
3919                     accepted = 0;
3920                     while (w) {
3921                         w = trie->wordinfo[w].prev;
3922                         accepted++;
3923                     }
3924                     ST.accepted = accepted;
3925                 }
3926
3927                 DEBUG_EXECUTE_r(
3928                     PerlIO_printf( Perl_debug_log,
3929                         "%*s  %sgot %"IVdf" possible matches%s\n",
3930                         REPORT_CODE_OFF + depth * 2, "",
3931                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3932                 );
3933                 goto trie_first_try; /* jump into the fail handler */
3934             }}
3935             assert(0); /* NOTREACHED */
3936
3937         case TRIE_next_fail: /* we failed - try next alternative */
3938         {
3939             U8 *uc;
3940             if ( ST.jump) {
3941                 REGCP_UNWIND(ST.cp);
3942                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3943             }
3944             if (!--ST.accepted) {
3945                 DEBUG_EXECUTE_r({
3946                     PerlIO_printf( Perl_debug_log,
3947                         "%*s  %sTRIE failed...%s\n",
3948                         REPORT_CODE_OFF+depth*2, "",
3949                         PL_colors[4],
3950                         PL_colors[5] );
3951                 });
3952                 sayNO_SILENT;
3953             }
3954             {
3955                 /* Find next-highest word to process.  Note that this code
3956                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3957                 U16 min = 0;
3958                 U16 word;
3959                 U16 const nextword = ST.nextword;
3960                 reg_trie_wordinfo * const wordinfo
3961                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3962                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3963                     if (word > nextword && (!min || word < min))
3964                         min = word;
3965                 }
3966                 ST.nextword = min;
3967             }
3968
3969           trie_first_try:
3970             if (do_cutgroup) {
3971                 do_cutgroup = 0;
3972                 no_final = 0;
3973             }
3974
3975             if ( ST.jump) {
3976                 ST.lastparen = rex->lastparen;
3977                 ST.lastcloseparen = rex->lastcloseparen;
3978                 REGCP_SET(ST.cp);
3979             }
3980
3981             /* find start char of end of current word */
3982             {
3983                 U32 chars; /* how many chars to skip */
3984                 reg_trie_data * const trie
3985                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3986
3987                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3988                             >=  ST.firstchars);
3989                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3990                             - ST.firstchars;
3991                 uc = ST.firstpos;
3992
3993                 if (ST.longfold) {
3994                     /* the hard option - fold each char in turn and find
3995                      * its folded length (which may be different */
3996                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3997                     STRLEN foldlen;
3998                     STRLEN len;
3999                     UV uvc;
4000                     U8 *uscan;
4001
4002                     while (chars) {
4003                         if (utf8_target) {
4004                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4005                                                     uniflags);
4006                             uc += len;
4007                         }
4008                         else {
4009                             uvc = *uc;
4010                             uc++;
4011                         }
4012                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4013                         uscan = foldbuf;
4014                         while (foldlen) {
4015                             if (!--chars)
4016                                 break;
4017                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4018                                             uniflags);
4019                             uscan += len;
4020                             foldlen -= len;
4021                         }
4022                     }
4023                 }
4024                 else {
4025                     if (utf8_target)
4026                         while (chars--)
4027                             uc += UTF8SKIP(uc);
4028                     else
4029                         uc += chars;
4030                 }
4031             }
4032
4033             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4034                             ? ST.jump[ST.nextword]
4035                             : NEXT_OFF(ST.me));
4036
4037             DEBUG_EXECUTE_r({
4038                 PerlIO_printf( Perl_debug_log,
4039                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4040                     REPORT_CODE_OFF+depth*2, "",
4041                     PL_colors[4],
4042                     ST.nextword,
4043                     PL_colors[5]
4044                     );
4045             });
4046
4047             if (ST.accepted > 1 || has_cutgroup) {
4048                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4049                 assert(0); /* NOTREACHED */
4050             }
4051             /* only one choice left - just continue */
4052             DEBUG_EXECUTE_r({
4053                 AV *const trie_words
4054                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4055                 SV ** const tmp = av_fetch( trie_words,
4056                     ST.nextword-1, 0 );
4057                 SV *sv= tmp ? sv_newmortal() : NULL;
4058
4059                 PerlIO_printf( Perl_debug_log,
4060                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4061                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4062                     ST.nextword,
4063                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4064                             PL_colors[0], PL_colors[1],
4065                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4066                         )
4067                     : "not compiled under -Dr",
4068                     PL_colors[5] );
4069             });
4070
4071             locinput = (char*)uc;
4072             continue; /* execute rest of RE */
4073             assert(0); /* NOTREACHED */
4074         }
4075 #undef  ST
4076
4077         case EXACT: {            /*  /abc/        */
4078             char *s = STRING(scan);
4079             ln = STR_LEN(scan);
4080             if (utf8_target != UTF_PATTERN) {
4081                 /* The target and the pattern have differing utf8ness. */
4082                 char *l = locinput;
4083                 const char * const e = s + ln;
4084
4085                 if (utf8_target) {
4086                     /* The target is utf8, the pattern is not utf8.
4087                      * Above-Latin1 code points can't match the pattern;
4088                      * invariants match exactly, and the other Latin1 ones need
4089                      * to be downgraded to a single byte in order to do the
4090                      * comparison.  (If we could be confident that the target
4091                      * is not malformed, this could be refactored to have fewer
4092                      * tests by just assuming that if the first bytes match, it
4093                      * is an invariant, but there are tests in the test suite
4094                      * dealing with (??{...}) which violate this) */
4095                     while (s < e) {
4096                         if (l >= PL_regeol)
4097                              sayNO;
4098                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4099                             sayNO;
4100                         }
4101                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4102                             if (*l != *s) {
4103                                 sayNO;
4104                             }
4105                             l++;
4106                         }
4107                         else {
4108                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4109                                 sayNO;
4110                             }
4111                             l += 2;
4112                         }
4113                         s++;
4114                     }
4115                 }
4116                 else {
4117                     /* The target is not utf8, the pattern is utf8. */
4118                     while (s < e) {
4119                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4120                         {
4121                             sayNO;
4122                         }
4123                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4124                             if (*s != *l) {
4125                                 sayNO;
4126                             }
4127                             s++;
4128                         }
4129                         else {
4130                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4131                                 sayNO;
4132                             }
4133                             s += 2;
4134                         }
4135                         l++;
4136                     }
4137                 }
4138                 locinput = l;
4139                 break;
4140             }
4141             /* The target and the pattern have the same utf8ness. */
4142             /* Inline the first character, for speed. */
4143             if (UCHARAT(s) != nextchr)
4144                 sayNO;
4145             if (PL_regeol - locinput < ln)
4146                 sayNO;
4147             if (ln > 1 && memNE(s, locinput, ln))
4148                 sayNO;
4149             locinput += ln;
4150             break;
4151             }
4152
4153         case EXACTFL: {          /*  /abc/il      */
4154             re_fold_t folder;
4155             const U8 * fold_array;
4156             const char * s;
4157             U32 fold_utf8_flags;
4158
4159             PL_reg_flags |= RF_tainted;
4160             folder = foldEQ_locale;
4161             fold_array = PL_fold_locale;
4162             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4163             goto do_exactf;
4164
4165         case EXACTFU_SS:         /*  /\x{df}/iu   */
4166         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4167         case EXACTFU:            /*  /abc/iu      */
4168             folder = foldEQ_latin1;
4169             fold_array = PL_fold_latin1;
4170             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4171             goto do_exactf;
4172
4173         case EXACTFA:            /*  /abc/iaa     */
4174             folder = foldEQ_latin1;
4175             fold_array = PL_fold_latin1;
4176             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4177             goto do_exactf;
4178
4179         case EXACTF:             /*  /abc/i       */
4180             folder = foldEQ;
4181             fold_array = PL_fold;
4182             fold_utf8_flags = 0;
4183
4184           do_exactf:
4185             s = STRING(scan);
4186             ln = STR_LEN(scan);
4187
4188             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4189               /* Either target or the pattern are utf8, or has the issue where
4190                * the fold lengths may differ. */
4191                 const char * const l = locinput;
4192                 char *e = PL_regeol;
4193
4194                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4195                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4196                 {
4197                     sayNO;
4198                 }
4199                 locinput = e;
4200                 break;
4201             }
4202
4203             /* Neither the target nor the pattern are utf8 */
4204             if (UCHARAT(s) != nextchr
4205                 && !NEXTCHR_IS_EOS
4206                 && UCHARAT(s) != fold_array[nextchr])
4207             {
4208                 sayNO;
4209             }
4210             if (PL_regeol - locinput < ln)
4211                 sayNO;
4212             if (ln > 1 && ! folder(s, locinput, ln))
4213                 sayNO;
4214             locinput += ln;
4215             break;
4216         }
4217
4218         /* XXX Could improve efficiency by separating these all out using a
4219          * macro or in-line function.  At that point regcomp.c would no longer
4220          * have to set the FLAGS fields of these */
4221         case BOUNDL:  /*  /\b/l  */
4222         case NBOUNDL: /*  /\B/l  */
4223             PL_reg_flags |= RF_tainted;
4224             /* FALL THROUGH */
4225         case BOUND:   /*  /\b/   */
4226         case BOUNDU:  /*  /\b/u  */
4227         case BOUNDA:  /*  /\b/a  */
4228         case NBOUND:  /*  /\B/   */
4229         case NBOUNDU: /*  /\B/u  */
4230         case NBOUNDA: /*  /\B/a  */
4231             /* was last char in word? */
4232             if (utf8_target
4233                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4234                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4235             {
4236                 if (locinput == PL_bostr)
4237                     ln = '\n';
4238                 else {
4239                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4240
4241                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4242                 }
4243                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4244                     ln = isALNUM_uni(ln);
4245                     if (NEXTCHR_IS_EOS)
4246                         n = 0;
4247                     else {
4248                         LOAD_UTF8_CHARCLASS_ALNUM();
4249                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4250                                                                 utf8_target);
4251                     }
4252                 }
4253                 else {
4254                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4255                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4256                 }
4257             }
4258             else {
4259
4260                 /* Here the string isn't utf8, or is utf8 and only ascii
4261                  * characters are to match \w.  In the latter case looking at
4262                  * the byte just prior to the current one may be just the final
4263                  * byte of a multi-byte character.  This is ok.  There are two
4264                  * cases:
4265                  * 1) it is a single byte character, and then the test is doing
4266                  *      just what it's supposed to.
4267                  * 2) it is a multi-byte character, in which case the final
4268                  *      byte is never mistakable for ASCII, and so the test
4269                  *      will say it is not a word character, which is the
4270                  *      correct answer. */
4271                 ln = (locinput != PL_bostr) ?
4272                     UCHARAT(locinput - 1) : '\n';
4273                 switch (FLAGS(scan)) {
4274                     case REGEX_UNICODE_CHARSET:
4275                         ln = isWORDCHAR_L1(ln);
4276                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4277                         break;
4278                     case REGEX_LOCALE_CHARSET:
4279                         ln = isALNUM_LC(ln);
4280                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4281                         break;
4282                     case REGEX_DEPENDS_CHARSET:
4283                         ln = isALNUM(ln);
4284                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4285                         break;
4286                     case REGEX_ASCII_RESTRICTED_CHARSET:
4287                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4288                         ln = isWORDCHAR_A(ln);
4289                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4290                         break;
4291                     default:
4292                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4293                         break;
4294                 }
4295             }
4296             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4297              * regcomp.sym */
4298             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4299                     sayNO;
4300             break;
4301
4302         case ANYOF:  /*  /[abc]/       */
4303             if (NEXTCHR_IS_EOS)
4304                 sayNO;
4305             if (utf8_target) {
4306                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4307                     sayNO;
4308                 locinput += UTF8SKIP(locinput);
4309                 break;
4310             }
4311             else {
4312                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4313                     sayNO;
4314                 locinput++;
4315                 break;
4316             }
4317             break;
4318
4319         /* Special char classes: \d, \w etc.
4320          * The defines start on line 166 or so */
4321         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4322                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4323                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4324                   ALNUMA, NALNUMA, isWORDCHAR_A,
4325                   alnum, "a");
4326
4327         case SPACEL:
4328             PL_reg_flags |= RF_tainted;
4329             if (NEXTCHR_IS_EOS) {
4330                 sayNO;
4331             }
4332             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4333                 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4334                     sayNO;
4335                 }
4336             }
4337             else if (! isSPACE_LC((U8) nextchr)) {
4338                     sayNO;
4339             }
4340             goto increment_locinput;
4341
4342         case NSPACEL:
4343             PL_reg_flags |= RF_tainted;
4344             if (NEXTCHR_IS_EOS) {
4345                 sayNO;
4346             }
4347             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4348                 if (isSPACE_LC_utf8((U8 *) locinput)) {
4349                     sayNO;
4350                 }
4351             }
4352             else if (isSPACE_LC(nextchr)) {
4353                     sayNO;
4354             }
4355             goto increment_locinput;
4356
4357         case SPACE:
4358             if (utf8_target) {
4359                 goto utf8_space;
4360             }
4361             /* FALL THROUGH */
4362         case SPACEA:
4363             if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4364                 sayNO;
4365             }
4366             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4367             locinput++;
4368             break;
4369
4370         case NSPACE:
4371             if (utf8_target) {
4372                 goto utf8_nspace;
4373             }
4374             /* FALL THROUGH */
4375         case NSPACEA:
4376             if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4377                 sayNO;
4378             }
4379             goto increment_locinput;
4380
4381         case SPACEU:
4382           utf8_space:
4383             if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4384                 sayNO;
4385             }
4386             goto increment_locinput;
4387
4388         case NSPACEU:
4389           utf8_nspace:
4390             if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4391                 sayNO;
4392             }
4393             goto increment_locinput;
4394
4395         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4396                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4397                 DIGITA, NDIGITA, isDIGIT_A,
4398                 digit, "0");
4399
4400         case POSIXA: /* /[[:ascii:]]/ etc */
4401             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4402                 sayNO;
4403             }
4404             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4405             locinput++;
4406             break;
4407
4408         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4409             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4410                 sayNO;
4411             }
4412             goto increment_locinput;
4413
4414         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4415                        a Unicode extended Grapheme Cluster */
4416             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4417               extended Grapheme Cluster is:
4418
4419                CR LF
4420                | Prepend* Begin Extend*
4421                | .
4422
4423                Begin is:           ( Special_Begin | ! Control )
4424                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4425                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4426                Control is:         [ GCB_Control  CR  LF ]
4427                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4428
4429                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4430                we can rewrite
4431
4432                    Begin is ( Regular_Begin + Special Begin )
4433
4434                It turns out that 98.4% of all Unicode code points match
4435                Regular_Begin.  Doing it this way eliminates a table match in
4436                the previous implementation for almost all Unicode code points.
4437
4438                There is a subtlety with Prepend* which showed up in testing.
4439                Note that the Begin, and only the Begin is required in:
4440                 | Prepend* Begin Extend*
4441                Also, Begin contains '! Control'.  A Prepend must be a
4442                '!  Control', which means it must also be a Begin.  What it
4443                comes down to is that if we match Prepend* and then find no
4444                suitable Begin afterwards, that if we backtrack the last
4445                Prepend, that one will be a suitable Begin.
4446             */
4447
4448             if (NEXTCHR_IS_EOS)
4449                 sayNO;
4450             if  (! utf8_target) {
4451
4452                 /* Match either CR LF  or '.', as all the other possibilities
4453                  * require utf8 */
4454                 locinput++;         /* Match the . or CR */
4455                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4456                                        match the LF */
4457                     && locinput < PL_regeol
4458                     && UCHARAT(locinput) == '\n') locinput++;
4459             }
4460             else {
4461
4462                 /* Utf8: See if is ( CR LF ); already know that locinput <
4463                  * PL_regeol, so locinput+1 is in bounds */
4464                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4465                         && UCHARAT(locinput + 1) == '\n')
4466                 {
4467                     locinput += 2;
4468                 }
4469                 else {
4470                     STRLEN len;
4471
4472                     /* In case have to backtrack to beginning, then match '.' */
4473                     char *starting = locinput;
4474
4475                     /* In case have to backtrack the last prepend */
4476                     char *previous_prepend = 0;
4477
4478                     LOAD_UTF8_CHARCLASS_GCB();
4479
4480                     /* Match (prepend)*   */
4481                     while (locinput < PL_regeol
4482                            && (len = is_GCB_Prepend_utf8(locinput)))
4483                     {
4484                         previous_prepend = locinput;
4485                         locinput += len;
4486                     }
4487
4488                     /* As noted above, if we matched a prepend character, but
4489                      * the next thing won't match, back off the last prepend we
4490                      * matched, as it is guaranteed to match the begin */
4491                     if (previous_prepend
4492                         && (locinput >=  PL_regeol
4493                             || (! swash_fetch(PL_utf8_X_regular_begin,
4494                                              (U8*)locinput, utf8_target)
4495                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4496                         )
4497                     {
4498                         locinput = previous_prepend;
4499                     }
4500
4501                     /* Note that here we know PL_regeol > locinput, as we
4502                      * tested that upon input to this switch case, and if we
4503                      * moved locinput forward, we tested the result just above
4504                      * and it either passed, or we backed off so that it will
4505                      * now pass */
4506                     if (swash_fetch(PL_utf8_X_regular_begin,
4507                                     (U8*)locinput, utf8_target)) {
4508                         locinput += UTF8SKIP(locinput);
4509                     }
4510                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4511
4512                         /* Here did not match the required 'Begin' in the
4513                          * second term.  So just match the very first
4514                          * character, the '.' of the final term of the regex */
4515                         locinput = starting + UTF8SKIP(starting);
4516                         goto exit_utf8;
4517                     } else {
4518
4519                         /* Here is a special begin.  It can be composed of
4520                          * several individual characters.  One possibility is
4521                          * RI+ */
4522                         if ((len = is_GCB_RI_utf8(locinput))) {
4523                             locinput += len;
4524                             while (locinput < PL_regeol
4525                                    && (len = is_GCB_RI_utf8(locinput)))
4526                             {
4527                                 locinput += len;
4528                             }
4529                         } else if ((len = is_GCB_T_utf8(locinput))) {
4530                             /* Another possibility is T+ */
4531                             locinput += len;
4532                             while (locinput < PL_regeol
4533                                 && (len = is_GCB_T_utf8(locinput)))
4534                             {
4535                                 locinput += len;
4536                             }
4537                         } else {
4538
4539                             /* Here, neither RI+ nor T+; must be some other
4540                              * Hangul.  That means it is one of the others: L,
4541                              * LV, LVT or V, and matches:
4542                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4543
4544                             /* Match L*           */
4545                             while (locinput < PL_regeol
4546                                    && (len = is_GCB_L_utf8(locinput)))
4547                             {
4548                                 locinput += len;
4549                             }
4550
4551                             /* Here, have exhausted L*.  If the next character
4552                              * is not an LV, LVT nor V, it means we had to have
4553                              * at least one L, so matches L+ in the original
4554                              * equation, we have a complete hangul syllable.
4555                              * Are done. */
4556
4557                             if (locinput < PL_regeol
4558                                 && is_GCB_LV_LVT_V_utf8(locinput))
4559                             {
4560
4561                                 /* Otherwise keep going.  Must be LV, LVT or V.
4562                                  * See if LVT */
4563                                 if (is_utf8_X_LVT((U8*)locinput)) {
4564                                     locinput += UTF8SKIP(locinput);
4565                                 } else {
4566
4567                                     /* Must be  V or LV.  Take it, then match
4568                                      * V*     */
4569                                     locinput += UTF8SKIP(locinput);
4570                                     while (locinput < PL_regeol
4571                                            && (len = is_GCB_V_utf8(locinput)))
4572                                     {
4573                                         locinput += len;
4574                                     }
4575                                 }
4576
4577                                 /* And any of LV, LVT, or V can be followed
4578                                  * by T*            */
4579                                 while (locinput < PL_regeol
4580                                        && (len = is_GCB_T_utf8(locinput)))
4581                                 {
4582                                     locinput += len;
4583                                 }
4584                             }
4585                         }
4586                     }
4587
4588                     /* Match any extender */
4589                     while (locinput < PL_regeol
4590                             && swash_fetch(PL_utf8_X_extend,
4591                                             (U8*)locinput, utf8_target))
4592                     {
4593                         locinput += UTF8SKIP(locinput);
4594                     }
4595                 }
4596             exit_utf8:
4597                 if (locinput > PL_regeol) sayNO;
4598             }
4599             break;
4600
4601         case NREFFL:  /*  /\g{name}/il  */
4602         {   /* The capture buffer cases.  The ones beginning with N for the
4603                named buffers just convert to the equivalent numbered and
4604                pretend they were called as the corresponding numbered buffer
4605                op.  */
4606             /* don't initialize these in the declaration, it makes C++
4607                unhappy */
4608             char *s;
4609             char type;
4610             re_fold_t folder;
4611             const U8 *fold_array;
4612             UV utf8_fold_flags;
4613
4614             PL_reg_flags |= RF_tainted;
4615             folder = foldEQ_locale;
4616             fold_array = PL_fold_locale;
4617             type = REFFL;
4618             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4619             goto do_nref;
4620
4621         case NREFFA:  /*  /\g{name}/iaa  */
4622             folder = foldEQ_latin1;
4623             fold_array = PL_fold_latin1;
4624             type = REFFA;
4625             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4626             goto do_nref;
4627
4628         case NREFFU:  /*  /\g{name}/iu  */
4629             folder = foldEQ_latin1;
4630             fold_array = PL_fold_latin1;
4631             type = REFFU;
4632             utf8_fold_flags = 0;
4633             goto do_nref;
4634
4635         case NREFF:  /*  /\g{name}/i  */
4636             folder = foldEQ;
4637             fold_array = PL_fold;
4638             type = REFF;
4639             utf8_fold_flags = 0;
4640             goto do_nref;
4641
4642         case NREF:  /*  /\g{name}/   */
4643             type = REF;
4644             folder = NULL;
4645             fold_array = NULL;
4646             utf8_fold_flags = 0;
4647           do_nref:
4648
4649             /* For the named back references, find the corresponding buffer
4650              * number */
4651             n = reg_check_named_buff_matched(rex,scan);
4652
4653             if ( ! n ) {
4654                 sayNO;
4655             }
4656             goto do_nref_ref_common;
4657
4658         case REFFL:  /*  /\1/il  */
4659             PL_reg_flags |= RF_tainted;
4660             folder = foldEQ_locale;
4661             fold_array = PL_fold_locale;
4662             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4663             goto do_ref;
4664
4665         case REFFA:  /*  /\1/iaa  */
4666             folder = foldEQ_latin1;
4667             fold_array = PL_fold_latin1;
4668             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4669             goto do_ref;
4670
4671         case REFFU:  /*  /\1/iu  */
4672             folder = foldEQ_latin1;
4673             fold_array = PL_fold_latin1;
4674             utf8_fold_flags = 0;
4675             goto do_ref;
4676
4677         case REFF:  /*  /\1/i  */
4678             folder = foldEQ;
4679             fold_array = PL_fold;
4680             utf8_fold_flags = 0;
4681             goto do_ref;
4682
4683         case REF:  /*  /\1/    */
4684             folder = NULL;
4685             fold_array = NULL;
4686             utf8_fold_flags = 0;
4687
4688           do_ref:
4689             type = OP(scan);
4690             n = ARG(scan);  /* which paren pair */
4691
4692           do_nref_ref_common:
4693             ln = rex->offs[n].start;
4694             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4695             if (rex->lastparen < n || ln == -1)
4696                 sayNO;                  /* Do not match unless seen CLOSEn. */
4697             if (ln == rex->offs[n].end)
4698                 break;
4699
4700             s = PL_bostr + ln;
4701             if (type != REF     /* REF can do byte comparison */
4702                 && (utf8_target || type == REFFU))
4703             { /* XXX handle REFFL better */
4704                 char * limit = PL_regeol;
4705
4706                 /* This call case insensitively compares the entire buffer
4707                     * at s, with the current input starting at locinput, but
4708                     * not going off the end given by PL_regeol, and returns in
4709                     * <limit> upon success, how much of the current input was
4710                     * matched */
4711                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4712                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4713                 {
4714                     sayNO;
4715                 }
4716                 locinput = limit;
4717                 break;
4718             }
4719
4720             /* Not utf8:  Inline the first character, for speed. */
4721             if (!NEXTCHR_IS_EOS &&
4722                 UCHARAT(s) != nextchr &&
4723                 (type == REF ||
4724                  UCHARAT(s) != fold_array[nextchr]))
4725                 sayNO;
4726             ln = rex->offs[n].end - ln;
4727             if (locinput + ln > PL_regeol)
4728                 sayNO;
4729             if (ln > 1 && (type == REF
4730                            ? memNE(s, locinput, ln)
4731                            : ! folder(s, locinput, ln)))
4732                 sayNO;
4733             locinput += ln;
4734             break;
4735         }
4736
4737         case NOTHING: /* null op; e.g. the 'nothing' following
4738                        * the '*' in m{(a+|b)*}' */
4739             break;
4740         case TAIL: /* placeholder while compiling (A|B|C) */
4741             break;
4742
4743         case BACK: /* ??? doesn't appear to be used ??? */
4744             break;
4745
4746 #undef  ST
4747 #define ST st->u.eval
4748         {
4749             SV *ret;
4750             REGEXP *re_sv;
4751             regexp *re;
4752             regexp_internal *rei;
4753             regnode *startpoint;
4754
4755         case GOSTART: /*  (?R)  */
4756         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4757             if (cur_eval && cur_eval->locinput==locinput) {
4758                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4759                     Perl_croak(aTHX_ "Infinite recursion in regex");
4760                 if ( ++nochange_depth > max_nochange_depth )
4761                     Perl_croak(aTHX_
4762                         "Pattern subroutine nesting without pos change"
4763                         " exceeded limit in regex");
4764             } else {
4765                 nochange_depth = 0;
4766             }
4767             re_sv = rex_sv;
4768             re = rex;
4769             rei = rexi;
4770             if (OP(scan)==GOSUB) {
4771                 startpoint = scan + ARG2L(scan);
4772                 ST.close_paren = ARG(scan);
4773             } else {
4774                 startpoint = rei->program+1;
4775                 ST.close_paren = 0;
4776             }
4777             goto eval_recurse_doit;
4778             assert(0); /* NOTREACHED */
4779
4780         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4781             if (cur_eval && cur_eval->locinput==locinput) {
4782                 if ( ++nochange_depth > max_nochange_depth )
4783                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4784             } else {
4785                 nochange_depth = 0;
4786             }
4787             {
4788                 /* execute the code in the {...} */
4789
4790                 dSP;
4791                 PADOFFSET before;
4792                 OP * const oop = PL_op;
4793                 COP * const ocurcop = PL_curcop;
4794                 OP *nop;
4795                 char *saved_regeol = PL_regeol;
4796                 struct re_save_state saved_state;
4797                 CV *newcv;
4798
4799                 /* save *all* paren positions */
4800                 regcppush(rex, 0);
4801                 REGCP_SET(runops_cp);
4802
4803                 /* To not corrupt the existing regex state while executing the
4804                  * eval we would normally put it on the save stack, like with
4805                  * save_re_context. However, re-evals have a weird scoping so we
4806                  * can't just add ENTER/LEAVE here. With that, things like
4807                  *
4808                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4809                  *
4810                  * would break, as they expect the localisation to be unwound
4811                  * only when the re-engine backtracks through the bit that
4812                  * localised it.
4813                  *
4814                  * What we do instead is just saving the state in a local c
4815                  * variable.
4816                  */
4817                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4818
4819                 PL_reg_state.re_reparsing = FALSE;
4820
4821                 if (!caller_cv)
4822                     caller_cv = find_runcv(NULL);
4823
4824                 n = ARG(scan);
4825
4826                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4827                     newcv = (ReANY(
4828                                                 (REGEXP*)(rexi->data->data[n])
4829                                             ))->qr_anoncv
4830                                         ;
4831                     nop = (OP*)rexi->data->data[n+1];
4832                 }
4833                 else if (rexi->data->what[n] == 'l') { /* literal code */
4834                     newcv = caller_cv;
4835                     nop = (OP*)rexi->data->data[n];
4836                     assert(CvDEPTH(newcv));
4837                 }
4838                 else {
4839                     /* literal with own CV */
4840                     assert(rexi->data->what[n] == 'L');
4841                     newcv = rex->qr_anoncv;
4842                     nop = (OP*)rexi->data->data[n];
4843                 }
4844
4845                 /* normally if we're about to execute code from the same
4846                  * CV that we used previously, we just use the existing
4847                  * CX stack entry. However, its possible that in the
4848                  * meantime we may have backtracked, popped from the save
4849                  * stack, and undone the SAVECOMPPAD(s) associated with
4850                  * PUSH_MULTICALL; in which case PL_comppad no longer
4851                  * points to newcv's pad. */
4852                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4853                 {
4854                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4855                     if (last_pushed_cv) {
4856                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4857                     }
4858                     else {
4859                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4860                     }
4861                     last_pushed_cv = newcv;
4862                 }
4863                 else {
4864                     /* these assignments are just to silence compiler
4865                      * warnings */
4866                     multicall_cop = NULL;
4867                     newsp = NULL;
4868                 }
4869                 last_pad = PL_comppad;
4870
4871                 /* the initial nextstate you would normally execute
4872                  * at the start of an eval (which would cause error
4873                  * messages to come from the eval), may be optimised
4874                  * away from the execution path in the regex code blocks;
4875                  * so manually set PL_curcop to it initially */
4876                 {
4877                     OP *o = cUNOPx(nop)->op_first;
4878                     assert(o->op_type == OP_NULL);
4879                     if (o->op_targ == OP_SCOPE) {
4880                         o = cUNOPo->op_first;
4881                     }
4882                     else {
4883                         assert(o->op_targ == OP_LEAVE);
4884                         o = cUNOPo->op_first;
4885                         assert(o->op_type == OP_ENTER);
4886                         o = o->op_sibling;
4887                     }
4888
4889                     if (o->op_type != OP_STUB) {
4890                         assert(    o->op_type == OP_NEXTSTATE
4891                                 || o->op_type == OP_DBSTATE
4892                                 || (o->op_type == OP_NULL
4893                                     &&  (  o->op_targ == OP_NEXTSTATE
4894                                         || o->op_targ == OP_DBSTATE
4895                                         )
4896                                     )
4897                         );
4898                         PL_curcop = (COP*)o;
4899                     }
4900                 }
4901                 nop = nop->op_next;
4902
4903                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4904                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4905
4906                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4907
4908                 if (sv_yes_mark) {
4909                     SV *sv_mrk = get_sv("REGMARK", 1);
4910                     sv_setsv(sv_mrk, sv_yes_mark);
4911                 }
4912
4913                 /* we don't use MULTICALL here as we want to call the
4914                  * first op of the block of interest, rather than the
4915                  * first op of the sub */
4916                 before = SP-PL_stack_base;
4917                 PL_op = nop;
4918                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4919                 SPAGAIN;
4920                 if (SP-PL_stack_base == before)
4921                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4922                 else {
4923                     ret = POPs;
4924                     PUTBACK;
4925                 }
4926
4927                 /* before restoring everything, evaluate the returned
4928                  * value, so that 'uninit' warnings don't use the wrong
4929                  * PL_op or pad. Also need to process any magic vars
4930                  * (e.g. $1) *before* parentheses are restored */
4931
4932                 PL_op = NULL;
4933
4934                 re_sv = NULL;
4935                 if (logical == 0)        /*   (?{})/   */
4936                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4937                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4938                     sw = cBOOL(SvTRUE(ret));
4939                     logical = 0;
4940                 }
4941                 else {                   /*  /(??{})  */
4942                     /*  if its overloaded, let the regex compiler handle
4943                      *  it; otherwise extract regex, or stringify  */
4944                     if (!SvAMAGIC(ret)) {
4945                         SV *sv = ret;
4946                         if (SvROK(sv))
4947                             sv = SvRV(sv);
4948                         if (SvTYPE(sv) == SVt_REGEXP)
4949                             re_sv = (REGEXP*) sv;
4950                         else if (SvSMAGICAL(sv)) {
4951                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4952                             if (mg)
4953                                 re_sv = (REGEXP *) mg->mg_obj;
4954                         }
4955
4956                         /* force any magic, undef warnings here */
4957                         if (!re_sv) {
4958                             ret = sv_mortalcopy(ret);
4959                             (void) SvPV_force_nolen(ret);
4960                         }
4961                     }
4962
4963                 }
4964
4965                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4966
4967                 /* *** Note that at this point we don't restore
4968                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4969                  * be used again soon. This is safe as long as nothing
4970                  * in the regexp code uses the pad ! */
4971                 PL_op = oop;
4972                 PL_curcop = ocurcop;
4973                 PL_regeol = saved_regeol;
4974                 S_regcp_restore(aTHX_ rex, runops_cp);
4975
4976                 if (logical != 2)
4977                     break;
4978             }
4979
4980                 /* only /(??{})/  from now on */
4981                 logical = 0;
4982                 {
4983                     /* extract RE object from returned value; compiling if
4984                      * necessary */
4985
4986                     if (re_sv) {
4987                         re_sv = reg_temp_copy(NULL, re_sv);
4988                     }
4989                     else {
4990                         U32 pm_flags = 0;
4991                         const I32 osize = PL_regsize;
4992
4993                         if (SvUTF8(ret) && IN_BYTES) {
4994                             /* In use 'bytes': make a copy of the octet
4995                              * sequence, but without the flag on */
4996                             STRLEN len;
4997                             const char *const p = SvPV(ret, len);
4998                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4999                         }
5000                         if (rex->intflags & PREGf_USE_RE_EVAL)
5001                             pm_flags |= PMf_USE_RE_EVAL;
5002
5003                         /* if we got here, it should be an engine which
5004                          * supports compiling code blocks and stuff */
5005                         assert(rex->engine && rex->engine->op_comp);
5006                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5007                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5008                                     rex->engine, NULL, NULL,
5009                                     /* copy /msix etc to inner pattern */
5010                                     scan->flags,
5011                                     pm_flags);
5012
5013                         if (!(SvFLAGS(ret)
5014                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5015                                  | SVs_GMG))) {
5016                             /* This isn't a first class regexp. Instead, it's
5017                                caching a regexp onto an existing, Perl visible
5018                                scalar.  */
5019                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5020                         }
5021                         PL_regsize = osize;
5022                         /* safe to do now that any $1 etc has been
5023                          * interpolated into the new pattern string and
5024                          * compiled */
5025                         S_regcp_restore(aTHX_ rex, runops_cp);
5026                     }
5027                     SAVEFREESV(re_sv);
5028                     re = ReANY(re_sv);
5029                 }
5030                 RXp_MATCH_COPIED_off(re);
5031                 re->subbeg = rex->subbeg;
5032                 re->sublen = rex->sublen;
5033                 re->suboffset = rex->suboffset;
5034                 re->subcoffset = rex->subcoffset;
5035                 rei = RXi_GET(re);
5036                 DEBUG_EXECUTE_r(
5037                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5038                         "Matching embedded");
5039                 );
5040                 startpoint = rei->program + 1;
5041                 ST.close_paren = 0; /* only used for GOSUB */
5042
5043         eval_recurse_doit: /* Share code with GOSUB below this line */
5044                 /* run the pattern returned from (??{...}) */
5045                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
5046                 REGCP_SET(ST.lastcp);
5047
5048                 re->lastparen = 0;
5049                 re->lastcloseparen = 0;
5050
5051                 PL_regsize = 0;
5052
5053                 /* XXXX This is too dramatic a measure... */
5054                 PL_reg_maxiter = 0;
5055
5056                 ST.toggle_reg_flags = PL_reg_flags;
5057                 if (RX_UTF8(re_sv))
5058                     PL_reg_flags |= RF_utf8;
5059                 else
5060                     PL_reg_flags &= ~RF_utf8;
5061                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5062
5063                 ST.prev_rex = rex_sv;
5064                 ST.prev_curlyx = cur_curlyx;
5065                 rex_sv = re_sv;
5066                 SET_reg_curpm(rex_sv);
5067                 rex = re;
5068                 rexi = rei;
5069                 cur_curlyx = NULL;
5070                 ST.B = next;
5071                 ST.prev_eval = cur_eval;
5072                 cur_eval = st;
5073                 /* now continue from first node in postoned RE */
5074                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5075                 assert(0); /* NOTREACHED */
5076         }
5077
5078         case EVAL_AB: /* cleanup after a successful (??{A})B */
5079             /* note: this is called twice; first after popping B, then A */
5080             PL_reg_flags ^= ST.toggle_reg_flags;
5081             rex_sv = ST.prev_rex;
5082             SET_reg_curpm(rex_sv);
5083             rex = ReANY(rex_sv);
5084             rexi = RXi_GET(rex);
5085             regcpblow(ST.cp);
5086             cur_eval = ST.prev_eval;
5087             cur_curlyx = ST.prev_curlyx;
5088
5089             /* XXXX This is too dramatic a measure... */
5090             PL_reg_maxiter = 0;
5091             if ( nochange_depth )
5092                 nochange_depth--;
5093             sayYES;
5094
5095
5096         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5097             /* note: this is called twice; first after popping B, then A */
5098             PL_reg_flags ^= ST.toggle_reg_flags;
5099             rex_sv = ST.prev_rex;
5100             SET_reg_curpm(rex_sv);
5101             rex = ReANY(rex_sv);
5102             rexi = RXi_GET(rex);
5103
5104             REGCP_UNWIND(ST.lastcp);
5105             regcppop(rex);
5106             cur_eval = ST.prev_eval;
5107             cur_curlyx = ST.prev_curlyx;
5108             /* XXXX This is too dramatic a measure... */
5109             PL_reg_maxiter = 0;
5110             if ( nochange_depth )
5111                 nochange_depth--;
5112             sayNO_SILENT;
5113 #undef ST
5114
5115         case OPEN: /*  (  */
5116             n = ARG(scan);  /* which paren pair */
5117             rex->offs[n].start_tmp = locinput - PL_bostr;
5118             if (n > PL_regsize)
5119                 PL_regsize = n;
5120             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5121                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5122                 PTR2UV(rex),
5123                 PTR2UV(rex->offs),
5124                 (UV)n,
5125                 (IV)rex->offs[n].start_tmp,
5126                 (UV)PL_regsize
5127             ));
5128             lastopen = n;
5129             break;
5130
5131 /* XXX really need to log other places start/end are set too */
5132 #define CLOSE_CAPTURE \
5133     rex->offs[n].start = rex->offs[n].start_tmp; \
5134     rex->offs[n].end = locinput - PL_bostr; \
5135     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5136         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5137         PTR2UV(rex), \
5138         PTR2UV(rex->offs), \
5139         (UV)n, \
5140         (IV)rex->offs[n].start, \
5141         (IV)rex->offs[n].end \
5142     ))
5143
5144         case CLOSE:  /*  )  */
5145             n = ARG(scan);  /* which paren pair */
5146             CLOSE_CAPTURE;
5147             /*if (n > PL_regsize)
5148                 PL_regsize = n;*/
5149             if (n > rex->lastparen)
5150                 rex->lastparen = n;
5151             rex->lastcloseparen = n;
5152             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5153                 goto fake_end;
5154             }
5155             break;
5156
5157         case ACCEPT:  /*  (*ACCEPT)  */
5158             if (ARG(scan)){
5159                 regnode *cursor;
5160                 for (cursor=scan;
5161                      cursor && OP(cursor)!=END;
5162                      cursor=regnext(cursor))
5163                 {
5164                     if ( OP(cursor)==CLOSE ){
5165                         n = ARG(cursor);
5166                         if ( n <= lastopen ) {
5167                             CLOSE_CAPTURE;
5168                             /*if (n > PL_regsize)
5169                             PL_regsize = n;*/
5170                             if (n > rex->lastparen)
5171                                 rex->lastparen = n;
5172                             rex->lastcloseparen = n;
5173                             if ( n == ARG(scan) || (cur_eval &&
5174                                 cur_eval->u.eval.close_paren == n))
5175                                 break;
5176                         }
5177                     }
5178                 }
5179             }
5180             goto fake_end;
5181             /*NOTREACHED*/
5182
5183         case GROUPP:  /*  (?(1))  */
5184             n = ARG(scan);  /* which paren pair */
5185             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5186             break;
5187
5188         case NGROUPP:  /*  (?(<name>))  */
5189             /* reg_check_named_buff_matched returns 0 for no match */
5190             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5191             break;
5192
5193         case INSUBP:   /*  (?(R))  */
5194             n = ARG(scan);
5195             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5196             break;
5197
5198         case DEFINEP:  /*  (?(DEFINE))  */
5199             sw = 0;
5200             break;
5201
5202         case IFTHEN:   /*  (?(cond)A|B)  */
5203             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5204             if (sw)
5205                 next = NEXTOPER(NEXTOPER(scan));
5206             else {
5207                 next = scan + ARG(scan);
5208                 if (OP(next) == IFTHEN) /* Fake one. */
5209                     next = NEXTOPER(NEXTOPER(next));
5210             }
5211             break;
5212
5213         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5214             logical = scan->flags;
5215             break;
5216
5217 /*******************************************************************
5218
5219 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5220 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5221 STAR/PLUS/CURLY/CURLYN are used instead.)
5222
5223 A*B is compiled as <CURLYX><A><WHILEM><B>
5224
5225 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5226 state, which contains the current count, initialised to -1. It also sets
5227 cur_curlyx to point to this state, with any previous value saved in the
5228 state block.
5229
5230 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5231 since the pattern may possibly match zero times (i.e. it's a while {} loop
5232 rather than a do {} while loop).
5233
5234 Each entry to WHILEM represents a successful match of A. The count in the
5235 CURLYX block is incremented, another WHILEM state is pushed, and execution
5236 passes to A or B depending on greediness and the current count.
5237
5238 For example, if matching against the string a1a2a3b (where the aN are
5239 substrings that match /A/), then the match progresses as follows: (the
5240 pushed states are interspersed with the bits of strings matched so far):
5241
5242     <CURLYX cnt=-1>
5243     <CURLYX cnt=0><WHILEM>
5244     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5245     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5246     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5247     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5248
5249 (Contrast this with something like CURLYM, which maintains only a single
5250 backtrack state:
5251
5252     <CURLYM cnt=0> a1
5253     a1 <CURLYM cnt=1> a2
5254     a1 a2 <CURLYM cnt=2> a3
5255     a1 a2 a3 <CURLYM cnt=3> b
5256 )
5257
5258 Each WHILEM state block marks a point to backtrack to upon partial failure
5259 of A or B, and also contains some minor state data related to that
5260 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5261 overall state, such as the count, and pointers to the A and B ops.
5262
5263 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5264 must always point to the *current* CURLYX block, the rules are:
5265
5266 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5267 and set cur_curlyx to point the new block.
5268
5269 When popping the CURLYX block after a successful or unsuccessful match,
5270 restore the previous cur_curlyx.
5271
5272 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5273 to the outer one saved in the CURLYX block.
5274
5275 When popping the WHILEM block after a successful or unsuccessful B match,
5276 restore the previous cur_curlyx.
5277
5278 Here's an example for the pattern (AI* BI)*BO
5279 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5280
5281 cur_
5282 curlyx backtrack stack
5283 ------ ---------------
5284 NULL
5285 CO     <CO prev=NULL> <WO>
5286 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5287 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5288 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5289
5290 At this point the pattern succeeds, and we work back down the stack to
5291 clean up, restoring as we go:
5292
5293 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5294 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5295 CO     <CO prev=NULL> <WO>
5296 NULL
5297
5298 *******************************************************************/
5299
5300 #define ST st->u.curlyx
5301
5302         case CURLYX:    /* start of /A*B/  (for complex A) */
5303         {
5304             /* No need to save/restore up to this paren */
5305             I32 parenfloor = scan->flags;
5306
5307             assert(next); /* keep Coverity happy */
5308             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5309                 next += ARG(next);
5310
5311             /* XXXX Probably it is better to teach regpush to support
5312                parenfloor > PL_regsize... */
5313             if (parenfloor > (I32)rex->lastparen)
5314                 parenfloor = rex->lastparen; /* Pessimization... */
5315
5316             ST.prev_curlyx= cur_curlyx;
5317             cur_curlyx = st;
5318             ST.cp = PL_savestack_ix;
5319
5320             /* these fields contain the state of the current curly.
5321              * they are accessed by subsequent WHILEMs */
5322             ST.parenfloor = parenfloor;
5323             ST.me = scan;
5324             ST.B = next;
5325             ST.minmod = minmod;
5326             minmod = 0;
5327             ST.count = -1;      /* this will be updated by WHILEM */
5328             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5329
5330             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5331             assert(0); /* NOTREACHED */
5332         }
5333
5334         case CURLYX_end: /* just finished matching all of A*B */
5335             cur_curlyx = ST.prev_curlyx;
5336             sayYES;
5337             assert(0); /* NOTREACHED */
5338
5339         case CURLYX_end_fail: /* just failed to match all of A*B */
5340             regcpblow(ST.cp);
5341             cur_curlyx = ST.prev_curlyx;
5342             sayNO;
5343             assert(0); /* NOTREACHED */
5344
5345
5346 #undef ST
5347 #define ST st->u.whilem
5348
5349         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5350         {
5351             /* see the discussion above about CURLYX/WHILEM */
5352             I32 n;
5353             int min = ARG1(cur_curlyx->u.curlyx.me);
5354             int max = ARG2(cur_curlyx->u.curlyx.me);
5355             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5356
5357             assert(cur_curlyx); /* keep Coverity happy */
5358             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5359             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5360             ST.cache_offset = 0;
5361             ST.cache_mask = 0;
5362
5363
5364             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5365                   "%*s  whilem: matched %ld out of %d..%d\n",
5366                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5367             );
5368
5369             /* First just match a string of min A's. */
5370
5371             if (n < min) {
5372                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5373                 cur_curlyx->u.curlyx.lastloc = locinput;
5374                 REGCP_SET(ST.lastcp);
5375
5376                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5377                 assert(0); /* NOTREACHED */
5378             }
5379
5380             /* If degenerate A matches "", assume A done. */
5381
5382             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5383                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5384                    "%*s  whilem: empty match detected, trying continuation...\n",
5385                    REPORT_CODE_OFF+depth*2, "")
5386                 );
5387                 goto do_whilem_B_max;
5388             }
5389
5390             /* super-linear cache processing */
5391
5392             if (scan->flags) {
5393
5394                 if (!PL_reg_maxiter) {
5395                     /* start the countdown: Postpone detection until we
5396                      * know the match is not *that* much linear. */
5397                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5398                     /* possible overflow for long strings and many CURLYX's */
5399                     if (PL_reg_maxiter < 0)
5400                         PL_reg_maxiter = I32_MAX;
5401                     PL_reg_leftiter = PL_reg_maxiter;
5402                 }
5403
5404                 if (PL_reg_leftiter-- == 0) {
5405                     /* initialise cache */
5406                     const I32 size = (PL_reg_maxiter + 7)/8;
5407                     if (PL_reg_poscache) {
5408                         if ((I32)PL_reg_poscache_size < size) {
5409                             Renew(PL_reg_poscache, size, char);
5410                             PL_reg_poscache_size = size;
5411                         }
5412                         Zero(PL_reg_poscache, size, char);
5413                     }
5414                     else {
5415                         PL_reg_poscache_size = size;
5416                         Newxz(PL_reg_poscache, size, char);
5417                     }
5418                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5419       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5420                               PL_colors[4], PL_colors[5])
5421                     );
5422                 }
5423
5424                 if (PL_reg_leftiter < 0) {
5425                     /* have we already failed at this position? */
5426                     I32 offset, mask;
5427                     offset  = (scan->flags & 0xf) - 1
5428                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5429                     mask    = 1 << (offset % 8);
5430                     offset /= 8;
5431                     if (PL_reg_poscache[offset] & mask) {
5432                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5433                             "%*s  whilem: (cache) already tried at this position...\n",
5434                             REPORT_CODE_OFF+depth*2, "")
5435                         );
5436                         sayNO; /* cache records failure */
5437                     }
5438                     ST.cache_offset = offset;
5439                     ST.cache_mask   = mask;
5440                 }
5441             }
5442
5443             /* Prefer B over A for minimal matching. */
5444
5445             if (cur_curlyx->u.curlyx.minmod) {
5446                 ST.save_curlyx = cur_curlyx;
5447                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5448                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5449                 REGCP_SET(ST.lastcp);
5450                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5451                                     locinput);
5452                 assert(0); /* NOTREACHED */
5453             }
5454
5455             /* Prefer A over B for maximal matching. */
5456
5457             if (n < max) { /* More greed allowed? */
5458                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5459                 cur_curlyx->u.curlyx.lastloc = locinput;
5460                 REGCP_SET(ST.lastcp);
5461                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5462                 assert(0); /* NOTREACHED */
5463             }
5464             goto do_whilem_B_max;
5465         }
5466         assert(0); /* NOTREACHED */
5467
5468         case WHILEM_B_min: /* just matched B in a minimal match */
5469         case WHILEM_B_max: /* just matched B in a maximal match */
5470             cur_curlyx = ST.save_curlyx;
5471             sayYES;
5472             assert(0); /* NOTREACHED */
5473
5474         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5475             cur_curlyx = ST.save_curlyx;
5476             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5477             cur_curlyx->u.curlyx.count--;
5478             CACHEsayNO;
5479             assert(0); /* NOTREACHED */
5480
5481         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5482             /* FALL THROUGH */
5483         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5484             REGCP_UNWIND(ST.lastcp);
5485             regcppop(rex);
5486             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5487             cur_curlyx->u.curlyx.count--;
5488             CACHEsayNO;
5489             assert(0); /* NOTREACHED */
5490
5491         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5492             REGCP_UNWIND(ST.lastcp);
5493             regcppop(rex);      /* Restore some previous $<digit>s? */
5494             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5495                 "%*s  whilem: failed, trying continuation...\n",
5496                 REPORT_CODE_OFF+depth*2, "")
5497             );
5498           do_whilem_B_max:
5499             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5500                 && ckWARN(WARN_REGEXP)
5501                 && !(PL_reg_flags & RF_warned))
5502             {
5503                 PL_reg_flags |= RF_warned;
5504                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5505                      "Complex regular subexpression recursion limit (%d) "
5506                      "exceeded",
5507                      REG_INFTY - 1);
5508             }
5509
5510             /* now try B */
5511             ST.save_curlyx = cur_curlyx;
5512             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5513             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5514                                 locinput);
5515             assert(0); /* NOTREACHED */
5516
5517         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5518             cur_curlyx = ST.save_curlyx;
5519             REGCP_UNWIND(ST.lastcp);
5520             regcppop(rex);
5521
5522             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5523                 /* Maximum greed exceeded */
5524                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5525                     && ckWARN(WARN_REGEXP)
5526                     && !(PL_reg_flags & RF_warned))
5527                 {
5528                     PL_reg_flags |= RF_warned;
5529                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5530                         "Complex regular subexpression recursion "
5531                         "limit (%d) exceeded",
5532                         REG_INFTY - 1);
5533                 }
5534                 cur_curlyx->u.curlyx.count--;
5535                 CACHEsayNO;
5536             }
5537
5538             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5539                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5540             );
5541             /* Try grabbing another A and see if it helps. */
5542             cur_curlyx->u.curlyx.lastloc = locinput;
5543             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5544             REGCP_SET(ST.lastcp);
5545             PUSH_STATE_GOTO(WHILEM_A_min,
5546                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5547                 locinput);
5548             assert(0); /* NOTREACHED */
5549
5550 #undef  ST
5551 #define ST st->u.branch
5552
5553         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5554             next = scan + ARG(scan);
5555             if (next == scan)
5556                 next = NULL;
5557             scan = NEXTOPER(scan);
5558             /* FALL THROUGH */
5559
5560         case BRANCH:        /*  /(...|A|...)/ */
5561             scan = NEXTOPER(scan); /* scan now points to inner node */
5562             ST.lastparen = rex->lastparen;
5563             ST.lastcloseparen = rex->lastcloseparen;
5564             ST.next_branch = next;
5565             REGCP_SET(ST.cp);
5566
5567             /* Now go into the branch */
5568             if (has_cutgroup) {
5569                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5570             } else {
5571                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5572             }
5573             assert(0); /* NOTREACHED */
5574
5575         case CUTGROUP:  /*  /(*THEN)/  */
5576             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5577                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5578             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5579             assert(0); /* NOTREACHED */
5580
5581         case CUTGROUP_next_fail:
5582             do_cutgroup = 1;
5583             no_final = 1;
5584             if (st->u.mark.mark_name)
5585                 sv_commit = st->u.mark.mark_name;
5586             sayNO;
5587             assert(0); /* NOTREACHED */
5588
5589         case BRANCH_next:
5590             sayYES;
5591             assert(0); /* NOTREACHED */
5592
5593         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5594             if (do_cutgroup) {
5595                 do_cutgroup = 0;
5596                 no_final = 0;
5597             }
5598             REGCP_UNWIND(ST.cp);
5599             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5600             scan = ST.next_branch;
5601             /* no more branches? */
5602             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5603                 DEBUG_EXECUTE_r({
5604                     PerlIO_printf( Perl_debug_log,
5605                         "%*s  %sBRANCH failed...%s\n",
5606                         REPORT_CODE_OFF+depth*2, "",
5607                         PL_colors[4],
5608                         PL_colors[5] );
5609                 });
5610                 sayNO_SILENT;
5611             }
5612             continue; /* execute next BRANCH[J] op */
5613             assert(0); /* NOTREACHED */
5614
5615         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5616             minmod = 1;
5617             break;
5618
5619 #undef  ST
5620 #define ST st->u.curlym
5621
5622         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5623
5624             /* This is an optimisation of CURLYX that enables us to push
5625              * only a single backtracking state, no matter how many matches
5626              * there are in {m,n}. It relies on the pattern being constant
5627              * length, with no parens to influence future backrefs
5628              */
5629
5630             ST.me = scan;
5631             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5632
5633             ST.lastparen      = rex->lastparen;
5634             ST.lastcloseparen = rex->lastcloseparen;
5635
5636             /* if paren positive, emulate an OPEN/CLOSE around A */
5637             if (ST.me->flags) {
5638                 U32 paren = ST.me->flags;
5639                 if (paren > PL_regsize)
5640                     PL_regsize = paren;
5641                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5642             }
5643             ST.A = scan;
5644             ST.B = next;
5645             ST.alen = 0;
5646             ST.count = 0;
5647             ST.minmod = minmod;
5648             minmod = 0;
5649             ST.c1 = CHRTEST_UNINIT;
5650             REGCP_SET(ST.cp);
5651
5652             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5653                 goto curlym_do_B;
5654
5655           curlym_do_A: /* execute the A in /A{m,n}B/  */
5656             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5657             assert(0); /* NOTREACHED */
5658
5659         case CURLYM_A: /* we've just matched an A */
5660             ST.count++;
5661             /* after first match, determine A's length: u.curlym.alen */
5662             if (ST.count == 1) {
5663                 if (PL_reg_match_utf8) {
5664                     char *s = st->locinput;
5665                     while (s < locinput) {
5666                         ST.alen++;
5667                         s += UTF8SKIP(s);
5668                     }
5669                 }
5670                 else {
5671                     ST.alen = locinput - st->locinput;
5672                 }
5673                 if (ST.alen == 0)
5674                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5675             }
5676             DEBUG_EXECUTE_r(
5677                 PerlIO_printf(Perl_debug_log,
5678                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5679                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5680                           (IV) ST.count, (IV)ST.alen)
5681             );
5682
5683             if (cur_eval && cur_eval->u.eval.close_paren &&
5684                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5685                 goto fake_end;
5686
5687             {
5688                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5689                 if ( max == REG_INFTY || ST.count < max )
5690                     goto curlym_do_A; /* try to match another A */
5691             }
5692             goto curlym_do_B; /* try to match B */
5693
5694         case CURLYM_A_fail: /* just failed to match an A */
5695             REGCP_UNWIND(ST.cp);
5696
5697             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5698                 || (cur_eval && cur_eval->u.eval.close_paren &&
5699                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5700                 sayNO;
5701
5702           curlym_do_B: /* execute the B in /A{m,n}B/  */
5703             if (ST.c1 == CHRTEST_UNINIT) {
5704                 /* calculate c1 and c2 for possible match of 1st char
5705                  * following curly */
5706                 ST.c1 = ST.c2 = CHRTEST_VOID;
5707                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5708                     regnode *text_node = ST.B;
5709                     if (! HAS_TEXT(text_node))
5710                         FIND_NEXT_IMPT(text_node);
5711                     /* this used to be
5712
5713                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5714
5715                         But the former is redundant in light of the latter.
5716
5717                         if this changes back then the macro for
5718                         IS_TEXT and friends need to change.
5719                      */
5720                     if (PL_regkind[OP(text_node)] == EXACT) {
5721                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5722                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5723                         {
5724                             sayNO;
5725                         }
5726                     }
5727                 }
5728             }
5729
5730             DEBUG_EXECUTE_r(
5731                 PerlIO_printf(Perl_debug_log,
5732                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5733                     (int)(REPORT_CODE_OFF+(depth*2)),
5734                     "", (IV)ST.count)
5735                 );
5736             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5737                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5738                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5739                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5740                     {
5741                         /* simulate B failing */
5742                         DEBUG_OPTIMISE_r(
5743                             PerlIO_printf(Perl_debug_log,
5744                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5745                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5746                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5747                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5748                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5749                         );
5750                         state_num = CURLYM_B_fail;
5751                         goto reenter_switch;
5752                     }
5753                 }
5754                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5755                     /* simulate B failing */
5756                     DEBUG_OPTIMISE_r(
5757                         PerlIO_printf(Perl_debug_log,
5758                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5759                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5760                             (int) nextchr, ST.c1, ST.c2)
5761                     );
5762                     state_num = CURLYM_B_fail;
5763                     goto reenter_switch;
5764                 }
5765             }
5766
5767             if (ST.me->flags) {
5768                 /* emulate CLOSE: mark current A as captured */
5769                 I32 paren = ST.me->flags;
5770                 if (ST.count) {
5771                     rex->offs[paren].start
5772                         = HOPc(locinput, -ST.alen) - PL_bostr;
5773                     rex->offs[paren].end = locinput - PL_bostr;
5774                     if ((U32)paren > rex->lastparen)
5775                         rex->lastparen = paren;
5776                     rex->lastcloseparen = paren;
5777                 }
5778                 else
5779                     rex->offs[paren].end = -1;
5780                 if (cur_eval && cur_eval->u.eval.close_paren &&
5781                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5782                 {
5783                     if (ST.count)
5784                         goto fake_end;
5785                     else
5786                         sayNO;
5787                 }
5788             }
5789
5790             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5791             assert(0); /* NOTREACHED */
5792
5793         case CURLYM_B_fail: /* just failed to match a B */
5794             REGCP_UNWIND(ST.cp);
5795             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5796             if (ST.minmod) {
5797                 I32 max = ARG2(ST.me);
5798                 if (max != REG_INFTY && ST.count == max)
5799                     sayNO;
5800                 goto curlym_do_A; /* try to match a further A */
5801             }
5802             /* backtrack one A */
5803             if (ST.count == ARG1(ST.me) /* min */)
5804                 sayNO;
5805             ST.count--;
5806             SET_locinput(HOPc(locinput, -ST.alen));
5807             goto curlym_do_B; /* try to match B */
5808
5809 #undef ST
5810 #define ST st->u.curly
5811
5812 #define CURLY_SETPAREN(paren, success) \
5813     if (paren) { \
5814         if (success) { \
5815             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5816             rex->offs[paren].end = locinput - PL_bostr; \
5817             if (paren > rex->lastparen) \
5818                 rex->lastparen = paren; \
5819             rex->lastcloseparen = paren; \
5820         } \
5821         else { \
5822             rex->offs[paren].end = -1; \
5823             rex->lastparen      = ST.lastparen; \
5824             rex->lastcloseparen = ST.lastcloseparen; \
5825         } \
5826     }
5827
5828         case STAR:              /*  /A*B/ where A is width 1 char */
5829             ST.paren = 0;
5830             ST.min = 0;
5831             ST.max = REG_INFTY;
5832             scan = NEXTOPER(scan);
5833             goto repeat;
5834
5835         case PLUS:              /*  /A+B/ where A is width 1 char */
5836             ST.paren = 0;
5837             ST.min = 1;
5838             ST.max = REG_INFTY;
5839             scan = NEXTOPER(scan);
5840             goto repeat;
5841
5842         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5843             ST.paren = scan->flags;     /* Which paren to set */
5844             ST.lastparen      = rex->lastparen;
5845             ST.lastcloseparen = rex->lastcloseparen;
5846             if (ST.paren > PL_regsize)
5847                 PL_regsize = ST.paren;
5848             ST.min = ARG1(scan);  /* min to match */
5849             ST.max = ARG2(scan);  /* max to match */
5850             if (cur_eval && cur_eval->u.eval.close_paren &&
5851                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5852                 ST.min=1;
5853                 ST.max=1;
5854             }
5855             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5856             goto repeat;
5857
5858         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5859             ST.paren = 0;
5860             ST.min = ARG1(scan);  /* min to match */
5861             ST.max = ARG2(scan);  /* max to match */
5862             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5863           repeat:
5864             /*
5865             * Lookahead to avoid useless match attempts
5866             * when we know what character comes next.
5867             *
5868             * Used to only do .*x and .*?x, but now it allows
5869             * for )'s, ('s and (?{ ... })'s to be in the way
5870             * of the quantifier and the EXACT-like node.  -- japhy
5871             */
5872
5873             assert(ST.min <= ST.max);
5874             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5875                 ST.c1 = ST.c2 = CHRTEST_VOID;
5876             }
5877             else {
5878                 regnode *text_node = next;
5879
5880                 if (! HAS_TEXT(text_node))
5881                     FIND_NEXT_IMPT(text_node);
5882
5883                 if (! HAS_TEXT(text_node))
5884                     ST.c1 = ST.c2 = CHRTEST_VOID;
5885                 else {
5886                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5887                         ST.c1 = ST.c2 = CHRTEST_VOID;
5888                     }
5889                     else {
5890
5891                     /*  Currently we only get here when
5892
5893                         PL_rekind[OP(text_node)] == EXACT
5894
5895                         if this changes back then the macro for IS_TEXT and
5896                         friends need to change. */
5897                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5898                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5899                         {
5900                             sayNO;
5901                         }
5902                     }
5903                 }
5904             }
5905
5906             ST.A = scan;
5907             ST.B = next;
5908             if (minmod) {
5909                 char *li = locinput;
5910                 minmod = 0;
5911                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5912                     sayNO;
5913                 SET_locinput(li);
5914                 ST.count = ST.min;
5915                 REGCP_SET(ST.cp);
5916                 if (ST.c1 == CHRTEST_VOID)
5917                     goto curly_try_B_min;
5918
5919                 ST.oldloc = locinput;
5920
5921                 /* set ST.maxpos to the furthest point along the
5922                  * string that could possibly match */
5923                 if  (ST.max == REG_INFTY) {
5924                     ST.maxpos = PL_regeol - 1;
5925                     if (utf8_target)
5926                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5927                             ST.maxpos--;
5928                 }
5929                 else if (utf8_target) {
5930                     int m = ST.max - ST.min;
5931                     for (ST.maxpos = locinput;
5932                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5933                         ST.maxpos += UTF8SKIP(ST.maxpos);
5934                 }
5935                 else {
5936                     ST.maxpos = locinput + ST.max - ST.min;
5937                     if (ST.maxpos >= PL_regeol)
5938                         ST.maxpos = PL_regeol - 1;
5939                 }
5940                 goto curly_try_B_min_known;
5941
5942             }
5943             else {
5944                 /* avoid taking address of locinput, so it can remain
5945                  * a register var */
5946                 char *li = locinput;
5947                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5948                 if (ST.count < ST.min)
5949                     sayNO;
5950                 SET_locinput(li);
5951                 if ((ST.count > ST.min)
5952                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5953                 {
5954                     /* A{m,n} must come at the end of the string, there's
5955                      * no point in backing off ... */
5956                     ST.min = ST.count;
5957                     /* ...except that $ and \Z can match before *and* after
5958                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5959                        We may back off by one in this case. */
5960                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5961                         ST.min--;
5962                 }
5963                 REGCP_SET(ST.cp);
5964                 goto curly_try_B_max;
5965             }
5966             assert(0); /* NOTREACHED */
5967
5968
5969         case CURLY_B_min_known_fail:
5970             /* failed to find B in a non-greedy match where c1,c2 valid */
5971
5972             REGCP_UNWIND(ST.cp);
5973             if (ST.paren) {
5974                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5975             }
5976             /* Couldn't or didn't -- move forward. */
5977             ST.oldloc = locinput;
5978             if (utf8_target)
5979                 locinput += UTF8SKIP(locinput);
5980             else
5981                 locinput++;
5982             ST.count++;
5983           curly_try_B_min_known:
5984              /* find the next place where 'B' could work, then call B */
5985             {
5986                 int n;
5987                 if (utf8_target) {
5988                     n = (ST.oldloc == locinput) ? 0 : 1;
5989                     if (ST.c1 == ST.c2) {
5990                         /* set n to utf8_distance(oldloc, locinput) */
5991                         while (locinput <= ST.maxpos
5992                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5993                         {
5994                             locinput += UTF8SKIP(locinput);
5995                             n++;
5996                         }
5997                     }
5998                     else {
5999                         /* set n to utf8_distance(oldloc, locinput) */
6000                         while (locinput <= ST.maxpos
6001                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6002                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6003                         {
6004                             locinput += UTF8SKIP(locinput);
6005                             n++;
6006                         }
6007                     }
6008                 }
6009                 else {  /* Not utf8_target */
6010                     if (ST.c1 == ST.c2) {
6011                         while (locinput <= ST.maxpos &&
6012                                UCHARAT(locinput) != ST.c1)
6013                             locinput++;
6014                     }
6015                     else {
6016                         while (locinput <= ST.maxpos
6017                                && UCHARAT(locinput) != ST.c1
6018                                && UCHARAT(locinput) != ST.c2)
6019                             locinput++;
6020                     }
6021                     n = locinput - ST.oldloc;
6022                 }
6023                 if (locinput > ST.maxpos)
6024                     sayNO;
6025                 if (n) {
6026                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6027                      * at b; check that everything between oldloc and
6028                      * locinput matches */
6029                     char *li = ST.oldloc;
6030                     ST.count += n;
6031                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
6032                         sayNO;
6033                     assert(n == REG_INFTY || locinput == li);
6034                 }
6035                 CURLY_SETPAREN(ST.paren, ST.count);
6036                 if (cur_eval && cur_eval->u.eval.close_paren &&
6037                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6038                     goto fake_end;
6039                 }
6040                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6041             }
6042             assert(0); /* NOTREACHED */
6043
6044
6045         case CURLY_B_min_fail:
6046             /* failed to find B in a non-greedy match where c1,c2 invalid */
6047
6048             REGCP_UNWIND(ST.cp);
6049             if (ST.paren) {
6050                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6051             }
6052             /* failed -- move forward one */
6053             {
6054                 char *li = locinput;
6055                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6056                     sayNO;
6057                 }
6058                 locinput = li;
6059             }
6060             {
6061                 ST.count++;
6062                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6063                         ST.count > 0)) /* count overflow ? */
6064                 {
6065                   curly_try_B_min:
6066                     CURLY_SETPAREN(ST.paren, ST.count);
6067                     if (cur_eval && cur_eval->u.eval.close_paren &&
6068                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6069                         goto fake_end;
6070                     }
6071                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6072                 }
6073             }
6074             sayNO;
6075             assert(0); /* NOTREACHED */
6076
6077
6078         curly_try_B_max:
6079             /* a successful greedy match: now try to match B */
6080             if (cur_eval && cur_eval->u.eval.close_paren &&
6081                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6082                 goto fake_end;
6083             }
6084             {
6085                 bool could_match = locinput < PL_regeol;
6086
6087                 /* If it could work, try it. */
6088                 if (ST.c1 != CHRTEST_VOID && could_match) {
6089                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6090                     {
6091                         could_match = memEQ(locinput,
6092                                             ST.c1_utf8,
6093                                             UTF8SKIP(locinput))
6094                                     || memEQ(locinput,
6095                                              ST.c2_utf8,
6096                                              UTF8SKIP(locinput));
6097                     }
6098                     else {
6099                         could_match = UCHARAT(locinput) == ST.c1
6100                                       || UCHARAT(locinput) == ST.c2;
6101                     }
6102                 }
6103                 if (ST.c1 == CHRTEST_VOID || could_match) {
6104                     CURLY_SETPAREN(ST.paren, ST.count);
6105                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6106                     assert(0); /* NOTREACHED */
6107                 }
6108             }
6109             /* FALL THROUGH */
6110
6111         case CURLY_B_max_fail:
6112             /* failed to find B in a greedy match */
6113
6114             REGCP_UNWIND(ST.cp);
6115             if (ST.paren) {
6116                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6117             }
6118             /*  back up. */
6119             if (--ST.count < ST.min)
6120                 sayNO;
6121             locinput = HOPc(locinput, -1);
6122             goto curly_try_B_max;
6123
6124 #undef ST
6125
6126         case END: /*  last op of main pattern  */
6127             fake_end:
6128             if (cur_eval) {
6129                 /* we've just finished A in /(??{A})B/; now continue with B */
6130                 st->u.eval.toggle_reg_flags
6131                             = cur_eval->u.eval.toggle_reg_flags;
6132                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6133
6134                 st->u.eval.prev_rex = rex_sv;           /* inner */
6135                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6136                 rex_sv = cur_eval->u.eval.prev_rex;
6137                 SET_reg_curpm(rex_sv);
6138                 rex = ReANY(rex_sv);
6139                 rexi = RXi_GET(rex);
6140                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6141
6142                 REGCP_SET(st->u.eval.lastcp);
6143
6144                 /* Restore parens of the outer rex without popping the
6145                  * savestack */
6146                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6147
6148                 st->u.eval.prev_eval = cur_eval;
6149                 cur_eval = cur_eval->u.eval.prev_eval;
6150                 DEBUG_EXECUTE_r(
6151                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6152                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6153                 if ( nochange_depth )
6154                     nochange_depth--;
6155
6156                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6157                                     locinput); /* match B */
6158             }
6159
6160             if (locinput < reginfo->till) {
6161                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6162                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6163                                       PL_colors[4],
6164                                       (long)(locinput - PL_reg_starttry),
6165                                       (long)(reginfo->till - PL_reg_starttry),
6166                                       PL_colors[5]));
6167
6168                 sayNO_SILENT;           /* Cannot match: too short. */
6169             }
6170             sayYES;                     /* Success! */
6171
6172         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6173             DEBUG_EXECUTE_r(
6174             PerlIO_printf(Perl_debug_log,
6175                 "%*s  %ssubpattern success...%s\n",
6176                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6177             sayYES;                     /* Success! */
6178
6179 #undef  ST
6180 #define ST st->u.ifmatch
6181
6182         {
6183             char *newstart;
6184
6185         case SUSPEND:   /* (?>A) */
6186             ST.wanted = 1;
6187             newstart = locinput;
6188             goto do_ifmatch;
6189
6190         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6191             ST.wanted = 0;
6192             goto ifmatch_trivial_fail_test;
6193
6194         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6195             ST.wanted = 1;
6196           ifmatch_trivial_fail_test:
6197             if (scan->flags) {
6198                 char * const s = HOPBACKc(locinput, scan->flags);
6199                 if (!s) {
6200                     /* trivial fail */
6201                     if (logical) {
6202                         logical = 0;
6203                         sw = 1 - cBOOL(ST.wanted);
6204                     }
6205                     else if (ST.wanted)
6206                         sayNO;
6207                     next = scan + ARG(scan);
6208                     if (next == scan)
6209                         next = NULL;
6210                     break;
6211                 }
6212                 newstart = s;
6213             }
6214             else
6215                 newstart = locinput;
6216
6217           do_ifmatch:
6218             ST.me = scan;
6219             ST.logical = logical;
6220             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6221
6222             /* execute body of (?...A) */
6223             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6224             assert(0); /* NOTREACHED */
6225         }
6226
6227         case IFMATCH_A_fail: /* body of (?...A) failed */
6228             ST.wanted = !ST.wanted;
6229             /* FALL THROUGH */
6230
6231         case IFMATCH_A: /* body of (?...A) succeeded */
6232             if (ST.logical) {
6233                 sw = cBOOL(ST.wanted);
6234             }
6235             else if (!ST.wanted)
6236                 sayNO;
6237
6238             if (OP(ST.me) != SUSPEND) {
6239                 /* restore old position except for (?>...) */
6240                 locinput = st->locinput;
6241             }
6242             scan = ST.me + ARG(ST.me);
6243             if (scan == ST.me)
6244                 scan = NULL;
6245             continue; /* execute B */
6246
6247 #undef ST
6248
6249         case LONGJMP: /*  alternative with many branches compiles to
6250                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6251             next = scan + ARG(scan);
6252             if (next == scan)
6253                 next = NULL;
6254             break;
6255
6256         case COMMIT:  /*  (*COMMIT)  */
6257             reginfo->cutpoint = PL_regeol;
6258             /* FALLTHROUGH */
6259
6260         case PRUNE:   /*  (*PRUNE)   */
6261             if (!scan->flags)
6262                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6263             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6264             assert(0); /* NOTREACHED */
6265
6266         case COMMIT_next_fail:
6267             no_final = 1;
6268             /* FALLTHROUGH */
6269
6270         case OPFAIL:   /* (*FAIL)  */
6271             sayNO;
6272             assert(0); /* NOTREACHED */
6273
6274 #define ST st->u.mark
6275         case MARKPOINT: /*  (*MARK:foo)  */
6276             ST.prev_mark = mark_state;
6277             ST.mark_name = sv_commit = sv_yes_mark
6278                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6279             mark_state = st;
6280             ST.mark_loc = locinput;
6281             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6282             assert(0); /* NOTREACHED */
6283
6284         case MARKPOINT_next:
6285             mark_state = ST.prev_mark;
6286             sayYES;
6287             assert(0); /* NOTREACHED */
6288
6289         case MARKPOINT_next_fail:
6290             if (popmark && sv_eq(ST.mark_name,popmark))
6291             {
6292                 if (ST.mark_loc > startpoint)
6293                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6294                 popmark = NULL; /* we found our mark */
6295                 sv_commit = ST.mark_name;
6296
6297                 DEBUG_EXECUTE_r({
6298                         PerlIO_printf(Perl_debug_log,
6299                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6300                             REPORT_CODE_OFF+depth*2, "",
6301                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6302                 });
6303             }
6304             mark_state = ST.prev_mark;
6305             sv_yes_mark = mark_state ?
6306                 mark_state->u.mark.mark_name : NULL;
6307             sayNO;
6308             assert(0); /* NOTREACHED */
6309
6310         case SKIP:  /*  (*SKIP)  */
6311             if (scan->flags) {
6312                 /* (*SKIP) : if we fail we cut here*/
6313                 ST.mark_name = NULL;
6314                 ST.mark_loc = locinput;
6315                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6316             } else {
6317                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6318                    otherwise do nothing.  Meaning we need to scan
6319                  */
6320                 regmatch_state *cur = mark_state;
6321                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6322
6323                 while (cur) {
6324                     if ( sv_eq( cur->u.mark.mark_name,
6325                                 find ) )
6326                     {
6327                         ST.mark_name = find;
6328                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6329                     }
6330                     cur = cur->u.mark.prev_mark;
6331                 }
6332             }
6333             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6334             break;
6335
6336         case SKIP_next_fail:
6337             if (ST.mark_name) {
6338                 /* (*CUT:NAME) - Set up to search for the name as we
6339                    collapse the stack*/
6340                 popmark = ST.mark_name;
6341             } else {
6342                 /* (*CUT) - No name, we cut here.*/
6343                 if (ST.mark_loc > startpoint)
6344                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6345                 /* but we set sv_commit to latest mark_name if there
6346                    is one so they can test to see how things lead to this
6347                    cut */
6348                 if (mark_state)
6349                     sv_commit=mark_state->u.mark.mark_name;
6350             }
6351             no_final = 1;
6352             sayNO;
6353             assert(0); /* NOTREACHED */
6354 #undef ST
6355
6356         case LNBREAK: /* \R */
6357             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6358                 locinput += n;
6359             } else
6360                 sayNO;
6361             break;
6362
6363 #define CASE_CLASS(nAmE)                              \
6364         case nAmE:                                    \
6365             if (NEXTCHR_IS_EOS)                       \
6366                 sayNO;                                \
6367             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6368                 locinput += n;                        \
6369             } else                                    \
6370                 sayNO;                                \
6371             break;                                    \
6372         case N##nAmE:                                 \
6373             if (NEXTCHR_IS_EOS)                       \
6374                 sayNO;                                \
6375             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6376                 sayNO;                                \
6377             } else {                                  \
6378                 locinput += UTF8SKIP(locinput);       \
6379             }                                         \
6380             break
6381
6382         CASE_CLASS(VERTWS);  /*  \v \V  */
6383         CASE_CLASS(HORIZWS); /*  \h \H  */
6384 #undef CASE_CLASS
6385
6386         default:
6387             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6388                           PTR2UV(scan), OP(scan));
6389             Perl_croak(aTHX_ "regexp memory corruption");
6390
6391         /* this is a point to jump to in order to increment
6392          * locinput by one character */
6393         increment_locinput:
6394             assert(!NEXTCHR_IS_EOS);
6395             if (utf8_target) {
6396                 locinput += PL_utf8skip[nextchr];
6397                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6398                 if (locinput > PL_regeol)
6399                     sayNO;
6400             }
6401             else
6402                 locinput++;
6403             break;
6404
6405         } /* end switch */
6406
6407         /* switch break jumps here */
6408         scan = next; /* prepare to execute the next op and ... */
6409         continue;    /* ... jump back to the top, reusing st */
6410         assert(0); /* NOTREACHED */
6411
6412       push_yes_state:
6413         /* push a state that backtracks on success */
6414         st->u.yes.prev_yes_state = yes_state;
6415         yes_state = st;
6416         /* FALL THROUGH */
6417       push_state:
6418         /* push a new regex state, then continue at scan  */
6419         {
6420             regmatch_state *newst;
6421
6422             DEBUG_STACK_r({
6423                 regmatch_state *cur = st;
6424                 regmatch_state *curyes = yes_state;
6425                 int curd = depth;
6426                 regmatch_slab *slab = PL_regmatch_slab;
6427                 for (;curd > -1;cur--,curd--) {
6428                     if (cur < SLAB_FIRST(slab)) {
6429                         slab = slab->prev;
6430                         cur = SLAB_LAST(slab);
6431                     }
6432                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6433                         REPORT_CODE_OFF + 2 + depth * 2,"",
6434                         curd, PL_reg_name[cur->resume_state],
6435                         (curyes == cur) ? "yes" : ""
6436                     );
6437                     if (curyes == cur)
6438                         curyes = cur->u.yes.prev_yes_state;
6439                 }
6440             } else
6441                 DEBUG_STATE_pp("push")
6442             );
6443             depth++;
6444             st->locinput = locinput;
6445             newst = st+1;
6446             if (newst >  SLAB_LAST(PL_regmatch_slab))
6447                 newst = S_push_slab(aTHX);
6448             PL_regmatch_state = newst;
6449
6450             locinput = pushinput;
6451             st = newst;
6452             continue;
6453             assert(0); /* NOTREACHED */
6454         }
6455     }
6456
6457     /*
6458     * We get here only if there's trouble -- normally "case END" is
6459     * the terminating point.
6460     */
6461     Perl_croak(aTHX_ "corrupted regexp pointers");
6462     /*NOTREACHED*/
6463     sayNO;
6464
6465 yes:
6466     if (yes_state) {
6467         /* we have successfully completed a subexpression, but we must now
6468          * pop to the state marked by yes_state and continue from there */
6469         assert(st != yes_state);
6470 #ifdef DEBUGGING
6471         while (st != yes_state) {
6472             st--;
6473             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6474                 PL_regmatch_slab = PL_regmatch_slab->prev;
6475                 st = SLAB_LAST(PL_regmatch_slab);
6476             }
6477             DEBUG_STATE_r({
6478                 if (no_final) {
6479                     DEBUG_STATE_pp("pop (no final)");
6480                 } else {
6481                     DEBUG_STATE_pp("pop (yes)");
6482                 }
6483             });
6484             depth--;
6485         }
6486 #else
6487         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6488             || yes_state > SLAB_LAST(PL_regmatch_slab))
6489         {
6490             /* not in this slab, pop slab */
6491             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6492             PL_regmatch_slab = PL_regmatch_slab->prev;
6493             st = SLAB_LAST(PL_regmatch_slab);
6494         }
6495         depth -= (st - yes_state);
6496 #endif
6497         st = yes_state;
6498         yes_state = st->u.yes.prev_yes_state;
6499         PL_regmatch_state = st;
6500
6501         if (no_final)
6502             locinput= st->locinput;
6503         state_num = st->resume_state + no_final;
6504         goto reenter_switch;
6505     }
6506
6507     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6508                           PL_colors[4], PL_colors[5]));
6509
6510     if (PL_reg_state.re_state_eval_setup_done) {
6511         /* each successfully executed (?{...}) block does the equivalent of
6512          *   local $^R = do {...}
6513          * When popping the save stack, all these locals would be undone;
6514          * bypass this by setting the outermost saved $^R to the latest
6515          * value */
6516         if (oreplsv != GvSV(PL_replgv))
6517             sv_setsv(oreplsv, GvSV(PL_replgv));
6518     }
6519     result = 1;
6520     goto final_exit;
6521
6522 no:
6523     DEBUG_EXECUTE_r(
6524         PerlIO_printf(Perl_debug_log,
6525             "%*s  %sfailed...%s\n",
6526             REPORT_CODE_OFF+depth*2, "",
6527             PL_colors[4], PL_colors[5])
6528         );
6529
6530 no_silent:
6531     if (no_final) {
6532         if (yes_state) {
6533             goto yes;
6534         } else {
6535             goto final_exit;
6536         }
6537     }
6538     if (depth) {
6539         /* there's a previous state to backtrack to */
6540         st--;
6541         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6542             PL_regmatch_slab = PL_regmatch_slab->prev;
6543             st = SLAB_LAST(PL_regmatch_slab);
6544         }
6545         PL_regmatch_state = st;
6546         locinput= st->locinput;
6547
6548         DEBUG_STATE_pp("pop");
6549         depth--;
6550         if (yes_state == st)
6551             yes_state = st->u.yes.prev_yes_state;
6552
6553         state_num = st->resume_state + 1; /* failure = success + 1 */
6554         goto reenter_switch;
6555     }
6556     result = 0;
6557
6558   final_exit:
6559     if (rex->intflags & PREGf_VERBARG_SEEN) {
6560         SV *sv_err = get_sv("REGERROR", 1);
6561         SV *sv_mrk = get_sv("REGMARK", 1);
6562         if (result) {
6563             sv_commit = &PL_sv_no;
6564             if (!sv_yes_mark)
6565                 sv_yes_mark = &PL_sv_yes;
6566         } else {
6567             if (!sv_commit)
6568                 sv_commit = &PL_sv_yes;
6569             sv_yes_mark = &PL_sv_no;
6570         }
6571         sv_setsv(sv_err, sv_commit);
6572         sv_setsv(sv_mrk, sv_yes_mark);
6573     }
6574
6575
6576     if (last_pushed_cv) {
6577         dSP;
6578         POP_MULTICALL;
6579         PERL_UNUSED_VAR(SP);
6580     }
6581
6582     /* clean up; in particular, free all slabs above current one */
6583     LEAVE_SCOPE(oldsave);
6584
6585     assert(!result ||  locinput - PL_bostr >= 0);
6586     return result ?  locinput - PL_bostr : -1;
6587 }
6588
6589 /*
6590  - regrepeat - repeatedly match something simple, report how many
6591  *
6592  * What 'simple' means is a node which can be the operand of a quantifier like
6593  * '+', or {1,3}
6594  *
6595  * startposp - pointer a pointer to the start position.  This is updated
6596  *             to point to the byte following the highest successful
6597  *             match.
6598  * p         - the regnode to be repeatedly matched against.
6599  * max       - maximum number of things to match.
6600  * depth     - (for debugging) backtracking depth.
6601  */
6602 STATIC I32
6603 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6604 {
6605     dVAR;
6606     char *scan;     /* Pointer to current position in target string */
6607     I32 c;
6608     char *loceol = PL_regeol;   /* local version */
6609     I32 hardcount = 0;  /* How many matches so far */
6610     bool utf8_target = PL_reg_match_utf8;
6611     UV utf8_flags;
6612 #ifndef DEBUGGING
6613     PERL_UNUSED_ARG(depth);
6614 #endif
6615
6616     PERL_ARGS_ASSERT_REGREPEAT;
6617
6618     scan = *startposp;
6619     if (max == REG_INFTY)
6620         max = I32_MAX;
6621     else if (! utf8_target && scan + max < loceol)
6622         loceol = scan + max;
6623
6624     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6625      * to the maximum of how far we should go in it (leaving it set to the real
6626      * end, if the maximum permissible would take us beyond that).  This allows
6627      * us to make the loop exit condition that we haven't gone past <loceol> to
6628      * also mean that we haven't exceeded the max permissible count, saving a
6629      * test each time through the loop.  But it assumes that the OP matches a
6630      * single byte, which is true for most of the OPs below when applied to a
6631      * non-UTF-8 target.  Those relatively few OPs that don't have this
6632      * characteristic will have to compensate.
6633      *
6634      * There is no adjustment for UTF-8 targets, as the number of bytes per
6635      * character varies.  OPs will have to test both that the count is less
6636      * than the max permissible (using <hardcount> to keep track), and that we
6637      * are still within the bounds of the string (using <loceol>.  A few OPs
6638      * match a single byte no matter what the encoding.  They can omit the max
6639      * test if, for the UTF-8 case, they do the adjustment that was skipped
6640      * above.
6641      *
6642      * Thus, the code above sets things up for the common case; and exceptional
6643      * cases need extra work; the common case is to make sure <scan> doesn't
6644      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6645      * count doesn't exceed the maximum permissible */
6646
6647     switch (OP(p)) {
6648     case REG_ANY:
6649         if (utf8_target) {
6650             while (scan < loceol && hardcount < max && *scan != '\n') {
6651                 scan += UTF8SKIP(scan);
6652                 hardcount++;
6653             }
6654         } else {
6655             while (scan < loceol && *scan != '\n')
6656                 scan++;
6657         }
6658         break;
6659     case SANY:
6660         if (utf8_target) {
6661             while (scan < loceol && hardcount < max) {
6662                 scan += UTF8SKIP(scan);
6663                 hardcount++;
6664             }
6665         }
6666         else
6667             scan = loceol;
6668         break;
6669     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6670         if (utf8_target && scan + max < loceol) {
6671
6672             /* <loceol> hadn't been adjusted in the UTF-8 case */
6673             scan +=  max;
6674         }
6675         else {
6676             scan = loceol;
6677         }
6678         break;
6679     case EXACT:
6680         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6681
6682         c = (U8)*STRING(p);
6683
6684         /* Can use a simple loop if the pattern char to match on is invariant
6685          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6686          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6687          * true iff it doesn't matter if the argument is in UTF-8 or not */
6688         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6689             if (utf8_target && scan + max < loceol) {
6690                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6691                  * since here, to match at all, 1 char == 1 byte */
6692                 loceol = scan + max;
6693             }
6694             while (scan < loceol && UCHARAT(scan) == c) {
6695                 scan++;
6696             }
6697         }
6698         else if (UTF_PATTERN) {
6699             if (utf8_target) {
6700                 STRLEN scan_char_len;
6701
6702                 /* When both target and pattern are UTF-8, we have to do
6703                  * string EQ */
6704                 while (hardcount < max
6705                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6706                        && scan_char_len <= STR_LEN(p)
6707                        && memEQ(scan, STRING(p), scan_char_len))
6708                 {
6709                     scan += scan_char_len;
6710                     hardcount++;
6711                 }
6712             }
6713             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6714
6715                 /* Target isn't utf8; convert the character in the UTF-8
6716                  * pattern to non-UTF8, and do a simple loop */
6717                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6718                 while (scan < loceol && UCHARAT(scan) == c) {
6719                     scan++;
6720                 }
6721             } /* else pattern char is above Latin1, can't possibly match the
6722                  non-UTF-8 target */
6723         }
6724         else {
6725
6726             /* Here, the string must be utf8; pattern isn't, and <c> is
6727              * different in utf8 than not, so can't compare them directly.
6728              * Outside the loop, find the two utf8 bytes that represent c, and
6729              * then look for those in sequence in the utf8 string */
6730             U8 high = UTF8_TWO_BYTE_HI(c);
6731             U8 low = UTF8_TWO_BYTE_LO(c);
6732
6733             while (hardcount < max
6734                     && scan + 1 < loceol
6735                     && UCHARAT(scan) == high
6736                     && UCHARAT(scan + 1) == low)
6737             {
6738                 scan += 2;
6739                 hardcount++;
6740             }
6741         }
6742         break;
6743
6744     case EXACTFA:
6745         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6746         goto do_exactf;
6747
6748     case EXACTFL:
6749         PL_reg_flags |= RF_tainted;
6750         utf8_flags = FOLDEQ_UTF8_LOCALE;
6751         goto do_exactf;
6752
6753     case EXACTF:
6754             utf8_flags = 0;
6755             goto do_exactf;
6756
6757     case EXACTFU_SS:
6758     case EXACTFU_TRICKYFOLD:
6759     case EXACTFU:
6760         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6761
6762     do_exactf: {
6763         int c1, c2;
6764         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6765
6766         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6767
6768         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6769             if (c1 == CHRTEST_VOID) {
6770                 /* Use full Unicode fold matching */
6771                 char *tmpeol = PL_regeol;
6772                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6773                 while (hardcount < max
6774                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6775                                              STRING(p), NULL, pat_len,
6776                                              cBOOL(UTF_PATTERN), utf8_flags))
6777                 {
6778                     scan = tmpeol;
6779                     tmpeol = PL_regeol;
6780                     hardcount++;
6781                 }
6782             }
6783             else if (utf8_target) {
6784                 if (c1 == c2) {
6785                     while (scan < loceol
6786                            && hardcount < max
6787                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6788                     {
6789                         scan += UTF8SKIP(scan);
6790                         hardcount++;
6791                     }
6792                 }
6793                 else {
6794                     while (scan < loceol
6795                            && hardcount < max
6796                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6797                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6798                     {
6799                         scan += UTF8SKIP(scan);
6800                         hardcount++;
6801                     }
6802                 }
6803             }
6804             else if (c1 == c2) {
6805                 while (scan < loceol && UCHARAT(scan) == c1) {
6806                     scan++;
6807                 }
6808             }
6809             else {
6810                 while (scan < loceol &&
6811                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6812                 {
6813                     scan++;
6814                 }
6815             }
6816         }
6817         break;
6818     }
6819     case ANYOF:
6820         if (utf8_target) {
6821             STRLEN inclasslen;
6822             while (hardcount < max
6823                    && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6824                    && reginclass(prog, p, (U8*)scan, utf8_target))
6825             {
6826                 scan += inclasslen;
6827                 hardcount++;
6828             }
6829         } else {
6830             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6831                 scan++;
6832         }
6833         break;
6834     case ALNUMU:
6835         if (utf8_target) {
6836     utf8_wordchar:
6837             LOAD_UTF8_CHARCLASS_ALNUM();
6838             while (hardcount < max && scan < loceol &&
6839                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6840             {
6841                 scan += UTF8SKIP(scan);
6842                 hardcount++;
6843             }
6844         } else {
6845             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6846                 scan++;
6847             }
6848         }
6849         break;
6850     case ALNUM:
6851         if (utf8_target)
6852             goto utf8_wordchar;
6853         while (scan < loceol && isALNUM((U8) *scan)) {
6854             scan++;
6855         }
6856         break;
6857     case ALNUMA:
6858         if (utf8_target && scan + max < loceol) {
6859
6860             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6861              * since here, to match, 1 char == 1 byte */
6862             loceol = scan + max;
6863         }
6864         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6865             scan++;
6866         }
6867         break;
6868     case ALNUML:
6869         PL_reg_flags |= RF_tainted;
6870         if (utf8_target) {
6871             while (hardcount < max && scan < loceol &&
6872                    isALNUM_LC_utf8((U8*)scan)) {
6873                 scan += UTF8SKIP(scan);
6874                 hardcount++;
6875             }
6876         } else {
6877             while (scan < loceol && isALNUM_LC(*scan))
6878                 scan++;
6879         }
6880         break;
6881     case NALNUMU:
6882         if (utf8_target) {
6883
6884     utf8_Nwordchar:
6885
6886             LOAD_UTF8_CHARCLASS_ALNUM();
6887             while (hardcount < max && scan < loceol &&
6888                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6889             {
6890                 scan += UTF8SKIP(scan);
6891                 hardcount++;
6892             }
6893         } else {
6894             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6895                 scan++;
6896             }
6897         }
6898         break;
6899     case NALNUM:
6900         if (utf8_target)
6901             goto utf8_Nwordchar;
6902         while (scan < loceol && ! isALNUM((U8) *scan)) {
6903             scan++;
6904         }
6905         break;
6906
6907     case POSIXA:
6908         if (utf8_target && scan + max < loceol) {
6909
6910             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6911              * since here, to match, 1 char == 1 byte */
6912             loceol = scan + max;
6913         }
6914         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6915             scan++;
6916         }
6917         break;
6918     case NPOSIXA:
6919         if (utf8_target) {
6920             while (scan < loceol && hardcount < max
6921                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6922             {
6923                 scan += UTF8SKIP(scan);
6924                 hardcount++;
6925             }
6926         }
6927         else {
6928             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6929                 scan++;
6930             }
6931         }
6932         break;
6933     case NALNUMA:
6934         if (utf8_target) {
6935             while (scan < loceol && hardcount < max
6936                    && ! isWORDCHAR_A((U8) *scan))
6937             {
6938                 scan += UTF8SKIP(scan);
6939                 hardcount++;
6940             }
6941         }
6942         else {
6943             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6944                 scan++;
6945             }
6946         }
6947         break;
6948     case NALNUML:
6949         PL_reg_flags |= RF_tainted;
6950         if (utf8_target) {
6951             while (hardcount < max && scan < loceol &&
6952                    !isALNUM_LC_utf8((U8*)scan)) {
6953                 scan += UTF8SKIP(scan);
6954                 hardcount++;
6955             }
6956         } else {
6957             while (scan < loceol && !isALNUM_LC(*scan))
6958                 scan++;
6959         }
6960         break;
6961     case SPACEU:
6962         if (utf8_target) {
6963
6964     utf8_space:
6965
6966             while (hardcount < max && scan < loceol
6967                    && is_XPERLSPACE_utf8((U8*)scan))
6968             {
6969                 scan += UTF8SKIP(scan);
6970                 hardcount++;
6971             }
6972             break;
6973         }
6974         else {
6975             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6976                 scan++;
6977             }
6978             break;
6979         }
6980     case SPACE:
6981         if (utf8_target)
6982             goto utf8_space;
6983
6984         while (scan < loceol && isSPACE((U8) *scan)) {
6985             scan++;
6986         }
6987         break;
6988     case SPACEA:
6989         if (utf8_target && scan + max < loceol) {
6990
6991             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6992              * since here, to match, 1 char == 1 byte */
6993             loceol = scan + max;
6994         }
6995         while (scan < loceol && isSPACE_A((U8) *scan)) {
6996             scan++;
6997         }
6998         break;
6999     case SPACEL:
7000         PL_reg_flags |= RF_tainted;
7001         if (utf8_target) {
7002             while (hardcount < max && scan < loceol &&
7003                    isSPACE_LC_utf8((U8*)scan)) {
7004                 scan += UTF8SKIP(scan);
7005                 hardcount++;
7006             }
7007         } else {
7008             while (scan < loceol && isSPACE_LC(*scan))
7009                 scan++;
7010         }
7011         break;
7012     case NSPACEU:
7013         if (utf8_target) {
7014
7015     utf8_Nspace:
7016
7017             while (hardcount < max && scan < loceol
7018                    && ! is_XPERLSPACE_utf8((U8*)scan))
7019             {
7020                 scan += UTF8SKIP(scan);
7021                 hardcount++;
7022             }
7023             break;
7024         }
7025         else {
7026             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7027                 scan++;
7028             }
7029         }
7030         break;
7031     case NSPACE:
7032         if (utf8_target)
7033             goto utf8_Nspace;
7034
7035         while (scan < loceol && ! isSPACE((U8) *scan)) {
7036             scan++;
7037         }
7038         break;
7039     case NSPACEA:
7040         if (utf8_target) {
7041             while (hardcount < max && scan < loceol
7042                    && ! isSPACE_A((U8) *scan))
7043             {
7044                 scan += UTF8SKIP(scan);
7045                 hardcount++;
7046             }
7047         }
7048         else {
7049             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7050                 scan++;
7051             }
7052         }
7053         break;
7054     case NSPACEL:
7055         PL_reg_flags |= RF_tainted;
7056         if (utf8_target) {
7057             while (hardcount < max && scan < loceol &&
7058                    !isSPACE_LC_utf8((U8*)scan)) {
7059                 scan += UTF8SKIP(scan);
7060                 hardcount++;
7061             }
7062         } else {
7063             while (scan < loceol && !isSPACE_LC(*scan))
7064                 scan++;
7065         }
7066         break;
7067     case DIGIT:
7068         if (utf8_target) {
7069             LOAD_UTF8_CHARCLASS_DIGIT();
7070             while (hardcount < max && scan < loceol &&
7071                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7072                 scan += UTF8SKIP(scan);
7073                 hardcount++;
7074             }
7075         } else {
7076             while (scan < loceol && isDIGIT(*scan))
7077                 scan++;
7078         }
7079         break;
7080     case DIGITA:
7081         if (utf8_target && scan + max < loceol) {
7082
7083             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7084              * since here, to match, 1 char == 1 byte */
7085             loceol = scan + max;
7086         }
7087         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7088             scan++;
7089         }
7090         break;
7091     case DIGITL:
7092         PL_reg_flags |= RF_tainted;
7093         if (utf8_target) {
7094             while (hardcount < max && scan < loceol &&
7095                    isDIGIT_LC_utf8((U8*)scan)) {
7096                 scan += UTF8SKIP(scan);
7097                 hardcount++;
7098             }
7099         } else {
7100             while (scan < loceol && isDIGIT_LC(*scan))
7101                 scan++;
7102         }
7103         break;
7104     case NDIGIT:
7105         if (utf8_target) {
7106             LOAD_UTF8_CHARCLASS_DIGIT();
7107             while (hardcount < max && scan < loceol &&
7108                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7109                 scan += UTF8SKIP(scan);
7110                 hardcount++;
7111             }
7112         } else {
7113             while (scan < loceol && !isDIGIT(*scan))
7114                 scan++;
7115         }
7116         break;
7117     case NDIGITA:
7118         if (utf8_target) {
7119             while (hardcount < max && scan < loceol
7120                    && ! isDIGIT_A((U8) *scan)) {
7121                 scan += UTF8SKIP(scan);
7122                 hardcount++;
7123             }
7124         }
7125         else {
7126             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7127                 scan++;
7128             }
7129         }
7130         break;
7131     case NDIGITL:
7132         PL_reg_flags |= RF_tainted;
7133         if (utf8_target) {
7134             while (hardcount < max && scan < loceol &&
7135                    !isDIGIT_LC_utf8((U8*)scan)) {
7136                 scan += UTF8SKIP(scan);
7137                 hardcount++;
7138             }
7139         } else {
7140             while (scan < loceol && !isDIGIT_LC(*scan))
7141                 scan++;
7142         }
7143         break;
7144     case LNBREAK:
7145         if (utf8_target) {
7146             while (hardcount < max && scan < loceol &&
7147                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7148                 scan += c;
7149                 hardcount++;
7150             }
7151         } else {
7152             /* LNBREAK can match one or two latin chars, which is ok, but we
7153              * have to use hardcount in this situation, and throw away the
7154              * adjustment to <loceol> done before the switch statement */
7155             loceol = PL_regeol;
7156             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7157                 scan+=c;
7158                 hardcount++;
7159             }
7160         }
7161         break;
7162     case HORIZWS:
7163         if (utf8_target) {
7164             while (hardcount < max && scan < loceol &&
7165                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7166             {
7167                 scan += c;
7168                 hardcount++;
7169             }
7170         } else {
7171             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7172                 scan++;
7173         }
7174         break;
7175     case NHORIZWS:
7176         if (utf8_target) {
7177             while (hardcount < max && scan < loceol &&
7178                         !is_HORIZWS_utf8_safe(scan, loceol))
7179             {
7180                 scan += UTF8SKIP(scan);
7181                 hardcount++;
7182             }
7183         } else {
7184             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7185                 scan++;
7186
7187         }
7188         break;
7189     case VERTWS:
7190         if (utf8_target) {
7191             while (hardcount < max && scan < loceol &&
7192                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7193             {
7194                 scan += c;
7195                 hardcount++;
7196             }
7197         } else {
7198             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7199                 scan++;
7200
7201         }
7202         break;
7203     case NVERTWS:
7204         if (utf8_target) {
7205             while (hardcount < max && scan < loceol &&
7206                                 !is_VERTWS_utf8_safe(scan, loceol))
7207             {
7208                 scan += UTF8SKIP(scan);
7209                 hardcount++;
7210             }
7211         } else {
7212             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7213                 scan++;
7214
7215         }
7216         break;
7217
7218     case BOUND:
7219     case BOUNDA:
7220     case BOUNDL:
7221     case BOUNDU:
7222     case EOS:
7223     case GPOS:
7224     case KEEPS:
7225     case NBOUND:
7226     case NBOUNDA:
7227     case NBOUNDL:
7228     case NBOUNDU:
7229     case OPFAIL:
7230     case SBOL:
7231     case SEOL:
7232         /* These are all 0 width, so match right here or not at all. */
7233         break;
7234
7235     default:
7236         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7237         assert(0); /* NOTREACHED */
7238
7239     }
7240
7241     if (hardcount)
7242         c = hardcount;
7243     else
7244         c = scan - *startposp;
7245     *startposp = scan;
7246
7247     DEBUG_r({
7248         GET_RE_DEBUG_FLAGS_DECL;
7249         DEBUG_EXECUTE_r({
7250             SV * const prop = sv_newmortal();
7251             regprop(prog, prop, p);
7252             PerlIO_printf(Perl_debug_log,
7253                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7254                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7255         });
7256     });
7257
7258     return(c);
7259 }
7260
7261
7262 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7263 /*
7264 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7265 create a copy so that changes the caller makes won't change the shared one.
7266 If <altsvp> is non-null, will return NULL in it, for back-compat.
7267  */
7268 SV *
7269 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7270 {
7271     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7272
7273     if (altsvp) {
7274         *altsvp = NULL;
7275     }
7276
7277     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7278 }
7279 #endif
7280
7281 STATIC SV *
7282 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7283 {
7284     /* Returns the swash for the input 'node' in the regex 'prog'.
7285      * If <doinit> is true, will attempt to create the swash if not already
7286      *    done.
7287      * If <listsvp> is non-null, will return the swash initialization string in
7288      *    it.
7289      * Tied intimately to how regcomp.c sets up the data structure */
7290
7291     dVAR;
7292     SV *sw  = NULL;
7293     SV *si  = NULL;
7294     SV*  invlist = NULL;
7295
7296     RXi_GET_DECL(prog,progi);
7297     const struct reg_data * const data = prog ? progi->data : NULL;
7298
7299     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7300
7301     assert(ANYOF_NONBITMAP(node));
7302
7303     if (data && data->count) {
7304         const U32 n = ARG(node);
7305
7306         if (data->what[n] == 's') {
7307             SV * const rv = MUTABLE_SV(data->data[n]);
7308             AV * const av = MUTABLE_AV(SvRV(rv));
7309             SV **const ary = AvARRAY(av);
7310             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7311
7312             si = *ary;  /* ary[0] = the string to initialize the swash with */
7313
7314             /* Elements 2 and 3 are either both present or both absent. [2] is
7315              * any inversion list generated at compile time; [3] indicates if
7316              * that inversion list has any user-defined properties in it. */
7317             if (av_len(av) >= 2) {
7318                 invlist = ary[2];
7319                 if (SvUV(ary[3])) {
7320                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7321                 }
7322             }
7323             else {
7324                 invlist = NULL;
7325             }
7326
7327             /* Element [1] is reserved for the set-up swash.  If already there,
7328              * return it; if not, create it and store it there */
7329             if (SvROK(ary[1])) {
7330                 sw = ary[1];
7331             }
7332             else if (si && doinit) {
7333
7334                 sw = _core_swash_init("utf8", /* the utf8 package */
7335                                       "", /* nameless */
7336                                       si,
7337                                       1, /* binary */
7338                                       0, /* not from tr/// */
7339                                       invlist,
7340                                       &swash_init_flags);
7341                 (void)av_store(av, 1, sw);
7342             }
7343         }
7344     }
7345
7346     if (listsvp) {
7347         SV* matches_string = newSVpvn("", 0);
7348
7349         /* Use the swash, if any, which has to have incorporated into it all
7350          * possibilities */
7351         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7352             && (si && si != &PL_sv_undef))
7353         {
7354
7355             /* If no swash, use the input initialization string, if available */
7356             sv_catsv(matches_string, si);
7357         }
7358
7359         /* Add the inversion list to whatever we have.  This may have come from
7360          * the swash, or from an input parameter */
7361         if (invlist) {
7362             sv_catsv(matches_string, _invlist_contents(invlist));
7363         }
7364         *listsvp = matches_string;
7365     }
7366
7367     return sw;
7368 }
7369
7370 /*
7371  - reginclass - determine if a character falls into a character class
7372
7373   n is the ANYOF regnode
7374   p is the target string
7375   utf8_target tells whether p is in UTF-8.
7376
7377   Returns true if matched; false otherwise.
7378
7379   Note that this can be a synthetic start class, a combination of various
7380   nodes, so things you think might be mutually exclusive, such as locale,
7381   aren't.  It can match both locale and non-locale
7382
7383  */
7384
7385 STATIC bool
7386 S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7387 {
7388     dVAR;
7389     const char flags = ANYOF_FLAGS(n);
7390     bool match = FALSE;
7391     UV c = *p;
7392
7393     PERL_ARGS_ASSERT_REGINCLASS;
7394
7395     /* If c is not already the code point, get it.  Note that
7396      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7397     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7398         STRLEN c_len = 0;
7399         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7400                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7401                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7402                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7403                  * UTF8_ALLOW_FFFF */
7404         if (c_len == (STRLEN)-1)
7405             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7406     }
7407
7408     /* If this character is potentially in the bitmap, check it */
7409     if (c < 256) {
7410         if (ANYOF_BITMAP_TEST(n, c))
7411             match = TRUE;
7412         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7413                 && ! utf8_target
7414                 && ! isASCII(c))
7415         {
7416             match = TRUE;
7417         }
7418         else if (flags & ANYOF_LOCALE) {
7419             PL_reg_flags |= RF_tainted;
7420
7421             if ((flags & ANYOF_LOC_FOLD)
7422                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7423             {
7424                 match = TRUE;
7425             }
7426             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7427                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7428                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7429                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7430                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7431                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7432                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7433                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7434                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7435                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7436                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7437                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7438                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7439                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7440                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7441                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7442                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7443                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7444                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7445                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7446                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7447                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7448                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7449                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7450                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7451                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7452                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7453                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7454                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7455                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7456                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7457                      ) /* How's that for a conditional? */
7458             ) {
7459                 match = TRUE;
7460             }
7461         }
7462     }
7463
7464     /* If the bitmap didn't (or couldn't) match, and something outside the
7465      * bitmap could match, try that.  Locale nodes specify completely the
7466      * behavior of code points in the bit map (otherwise, a utf8 target would
7467      * cause them to be treated as Unicode and not locale), except in
7468      * the very unlikely event when this node is a synthetic start class, which
7469      * could be a combination of locale and non-locale nodes.  So allow locale
7470      * to match for the synthetic start class, which will give a false
7471      * positive that will be resolved when the match is done again as not part
7472      * of the synthetic start class */
7473     if (!match) {
7474         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7475             match = TRUE;       /* Everything above 255 matches */
7476         }
7477         else if (ANYOF_NONBITMAP(n)
7478                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7479                      || (utf8_target
7480                          && (c >=256
7481                              || (! (flags & ANYOF_LOCALE))
7482                              || (flags & ANYOF_IS_SYNTHETIC)))))
7483         {
7484             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7485             if (sw) {
7486                 U8 * utf8_p;
7487                 if (utf8_target) {
7488                     utf8_p = (U8 *) p;
7489                 } else { /* Convert to utf8 */
7490                     STRLEN len = 1;
7491                     utf8_p = bytes_to_utf8(p, &len);
7492                 }
7493
7494                 if (swash_fetch(sw, utf8_p, TRUE)) {
7495                     match = TRUE;
7496                 }
7497
7498                 /* If we allocated a string above, free it */
7499                 if (! utf8_target) Safefree(utf8_p);
7500             }
7501         }
7502
7503         if (UNICODE_IS_SUPER(c)
7504             && (flags & ANYOF_WARN_SUPER)
7505             && ckWARN_d(WARN_NON_UNICODE))
7506         {
7507             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7508                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7509         }
7510     }
7511
7512     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7513     return cBOOL(flags & ANYOF_INVERT) ^ match;
7514 }
7515
7516 STATIC U8 *
7517 S_reghop3(U8 *s, I32 off, const U8* lim)
7518 {
7519     /* return the position 'off' UTF-8 characters away from 's', forward if
7520      * 'off' >= 0, backwards if negative.  But don't go outside of position
7521      * 'lim', which better be < s  if off < 0 */
7522
7523     dVAR;
7524
7525     PERL_ARGS_ASSERT_REGHOP3;
7526
7527     if (off >= 0) {
7528         while (off-- && s < lim) {
7529             /* XXX could check well-formedness here */
7530             s += UTF8SKIP(s);
7531         }
7532     }
7533     else {
7534         while (off++ && s > lim) {
7535             s--;
7536             if (UTF8_IS_CONTINUED(*s)) {
7537                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7538                     s--;
7539             }
7540             /* XXX could check well-formedness here */
7541         }
7542     }
7543     return s;
7544 }
7545
7546 #ifdef XXX_dmq
7547 /* there are a bunch of places where we use two reghop3's that should
7548    be replaced with this routine. but since thats not done yet
7549    we ifdef it out - dmq
7550 */
7551 STATIC U8 *
7552 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7553 {
7554     dVAR;
7555
7556     PERL_ARGS_ASSERT_REGHOP4;
7557
7558     if (off >= 0) {
7559         while (off-- && s < rlim) {
7560             /* XXX could check well-formedness here */
7561             s += UTF8SKIP(s);
7562         }
7563     }
7564     else {
7565         while (off++ && s > llim) {
7566             s--;
7567             if (UTF8_IS_CONTINUED(*s)) {
7568                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7569                     s--;
7570             }
7571             /* XXX could check well-formedness here */
7572         }
7573     }
7574     return s;
7575 }
7576 #endif
7577
7578 STATIC U8 *
7579 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7580 {
7581     dVAR;
7582
7583     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7584
7585     if (off >= 0) {
7586         while (off-- && s < lim) {
7587             /* XXX could check well-formedness here */
7588             s += UTF8SKIP(s);
7589         }
7590         if (off >= 0)
7591             return NULL;
7592     }
7593     else {
7594         while (off++ && s > lim) {
7595             s--;
7596             if (UTF8_IS_CONTINUED(*s)) {
7597                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7598                     s--;
7599             }
7600             /* XXX could check well-formedness here */
7601         }
7602         if (off <= 0)
7603             return NULL;
7604     }
7605     return s;
7606 }
7607
7608 static void
7609 restore_pos(pTHX_ void *arg)
7610 {
7611     dVAR;
7612     regexp * const rex = (regexp *)arg;
7613     if (PL_reg_state.re_state_eval_setup_done) {
7614         if (PL_reg_oldsaved) {
7615             rex->subbeg = PL_reg_oldsaved;
7616             rex->sublen = PL_reg_oldsavedlen;
7617             rex->suboffset = PL_reg_oldsavedoffset;
7618             rex->subcoffset = PL_reg_oldsavedcoffset;
7619 #ifdef PERL_ANY_COW
7620             rex->saved_copy = PL_nrs;
7621 #endif
7622             RXp_MATCH_COPIED_on(rex);
7623         }
7624         PL_reg_magic->mg_len = PL_reg_oldpos;
7625         PL_reg_state.re_state_eval_setup_done = FALSE;
7626         PL_curpm = PL_reg_oldcurpm;
7627     }
7628 }
7629
7630 STATIC void
7631 S_to_utf8_substr(pTHX_ regexp *prog)
7632 {
7633     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7634      * on the converted value */
7635
7636     int i = 1;
7637
7638     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7639
7640     do {
7641         if (prog->substrs->data[i].substr
7642             && !prog->substrs->data[i].utf8_substr) {
7643             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7644             prog->substrs->data[i].utf8_substr = sv;
7645             sv_utf8_upgrade(sv);
7646             if (SvVALID(prog->substrs->data[i].substr)) {
7647                 if (SvTAIL(prog->substrs->data[i].substr)) {
7648                     /* Trim the trailing \n that fbm_compile added last
7649                        time.  */
7650                     SvCUR_set(sv, SvCUR(sv) - 1);
7651                     /* Whilst this makes the SV technically "invalid" (as its
7652                        buffer is no longer followed by "\0") when fbm_compile()
7653                        adds the "\n" back, a "\0" is restored.  */
7654                     fbm_compile(sv, FBMcf_TAIL);
7655                 } else
7656                     fbm_compile(sv, 0);
7657             }
7658             if (prog->substrs->data[i].substr == prog->check_substr)
7659                 prog->check_utf8 = sv;
7660         }
7661     } while (i--);
7662 }
7663
7664 STATIC bool
7665 S_to_byte_substr(pTHX_ regexp *prog)
7666 {
7667     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7668      * on the converted value; returns FALSE if can't be converted. */
7669
7670     dVAR;
7671     int i = 1;
7672
7673     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7674
7675     do {
7676         if (prog->substrs->data[i].utf8_substr
7677             && !prog->substrs->data[i].substr) {
7678             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7679             if (! sv_utf8_downgrade(sv, TRUE)) {
7680                 return FALSE;
7681             }
7682             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7683                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7684                     /* Trim the trailing \n that fbm_compile added last
7685                         time.  */
7686                     SvCUR_set(sv, SvCUR(sv) - 1);
7687                     fbm_compile(sv, FBMcf_TAIL);
7688                 } else
7689                     fbm_compile(sv, 0);
7690             }
7691             prog->substrs->data[i].substr = sv;
7692             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7693                 prog->check_substr = sv;
7694         }
7695     } while (i--);
7696
7697     return TRUE;
7698 }
7699
7700 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7701  * pre-composed Hangul syllables, which are all in a contiguous block and
7702  * arranged there in such a way so as to facilitate alorithmic determination of
7703  * their characteristics.  As such, they don't need a swash, but can be
7704  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7705  * is a GCB=LV */
7706 #define SBASE 0xAC00    /* Start of block */
7707 #define SCount 11172    /* Length of block */
7708 #define TCount 28
7709
7710 #if 0   /* This routine is not currently used */
7711 PERL_STATIC_INLINE bool
7712 S_is_utf8_X_LV(pTHX_ const U8 *p)
7713 {
7714     /* Unlike most other similarly named routines here, this does not create a
7715      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7716
7717     dVAR;
7718
7719     UV cp = valid_utf8_to_uvchr(p, NULL);
7720
7721     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7722
7723     /* The earliest Unicode releases did not have these precomposed Hangul
7724      * syllables.  Set to point to undef in that case, so will return false on
7725      * every call */
7726     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7727         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7728         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7729             SvREFCNT_dec(PL_utf8_X_LV);
7730             PL_utf8_X_LV = &PL_sv_undef;
7731         }
7732     }
7733
7734     return (PL_utf8_X_LV != &PL_sv_undef
7735             && cp >= SBASE && cp < SBASE + SCount
7736             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7737 }
7738 #endif
7739
7740 PERL_STATIC_INLINE bool
7741 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7742 {
7743     /* Unlike most other similarly named routines here, this does not create a
7744      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7745
7746     dVAR;
7747
7748     UV cp = valid_utf8_to_uvchr(p, NULL);
7749
7750     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7751
7752     /* The earliest Unicode releases did not have these precomposed Hangul
7753      * syllables.  Set to point to undef in that case, so will return false on
7754      * every call */
7755     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7756         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7757         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7758             SvREFCNT_dec(PL_utf8_X_LVT);
7759             PL_utf8_X_LVT = &PL_sv_undef;
7760         }
7761     }
7762
7763     return (PL_utf8_X_LVT != &PL_sv_undef
7764             && cp >= SBASE && cp < SBASE + SCount
7765             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7766 }
7767
7768 /*
7769  * Local variables:
7770  * c-indentation-style: bsd
7771  * c-basic-offset: 4
7772  * indent-tabs-mode: nil
7773  * End:
7774  *
7775  * ex: set ts=8 sts=4 sw=4 et:
7776  */