regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define RF_tainted      1       /* tainted information used? e.g. locale */
  97 #define RF_warned       2               /* warned about big count? */
  98
  99 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 100
 101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 102
 103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 104
 105 #ifndef STATIC
 106 #define STATIC  static
 107 #endif
 108
 109 /* Valid for non-utf8 strings: avoids the reginclass
 110  * call if there are no complications: i.e., if everything matchable is
 111  * straight forward in the bitmap */
 112 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 113                                               : ANYOF_BITMAP_TEST(p,*(c)))
 114
 115 /*
 116  * Forwards.
 117  */
 118
 119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 121
 122 #define HOPc(pos,off) \
 123         (char *)(PL_reg_match_utf8 \
 124             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 125             : (U8*)(pos + off))
 126 #define HOPBACKc(pos, off) \
 127         (char*)(PL_reg_match_utf8\
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 129             : (pos - off >= PL_bostr)           \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136
 137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 138 #define NEXTCHR_IS_EOS (nextchr < 0)
 139
 140 #define SET_nextchr \
 141     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 142
 143 #define SET_locinput(p) \
 144     locinput = (p);  \
 145     SET_nextchr
 146
 147
 148 /* these are unrolled below in the CCC_TRY_XXX defined */
 149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 150     if (!CAT2(PL_utf8_,class)) { \
 151         bool ok; \
 152         ENTER; save_re_context(); \
 153         ok=CAT2(is_utf8_,class)((const U8*)str); \
 154         PERL_UNUSED_VAR(ok); \
 155         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 156 /* Doesn't do an assert to verify that is correct */
 157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 158     if (!CAT2(PL_utf8_,class)) { \
 159         bool throw_away PERL_UNUSED_DECL; \
 160         ENTER; save_re_context(); \
 161         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 162         PERL_UNUSED_VAR(throw_away); \
 163         LEAVE; } } STMT_END
 164
 165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 166 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 167
 168 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 169         /* No asserts are done for some of these, in case called on a   */  \
 170         /* Unicode version in which they map to nothing */                  \
 171         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                  \
 172         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 173
 174 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 175
 176 /* The actual code for CCC_TRY, which uses several variables from the routine
 177  * it's callable from.  It is designed to be the bulk of a case statement.
 178  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 179  *      nextchr matches the class.
 180  * UTF8_TEST is the whole test string to use for utf8 targets
 181  * LOAD is what to use to test, and if not present to load in the swash for the
 182  *      class
 183  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 184  *      UTF8_TEST test.
 185  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 186  * utf8 and a variant, load the swash if necessary and test using the utf8
 187  * test.  Advance to the next character if test is ok, otherwise fail; If not
 188  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 189  * fails, or advance to the next character */
 190
 191 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 192     if (NEXTCHR_IS_EOS) {                                                     \
 193         sayNO;                                                                \
 194     }                                                                         \
 195     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 196         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 197         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 198             sayNO;                                                            \
 199         }                                                                     \
 200     }                                                                         \
 201     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 202             sayNO;                                                            \
 203     }                                                                         \
 204     goto increment_locinput;
 205
 206 /* Handle the non-locale cases for a character class and its complement.  It
 207  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 208  * This is because that code fails when the test succeeds, so we want to have
 209  * the test fail so that the code succeeds.  The swash is stored in a
 210  * predictable PL_ place */
 211 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 212                            CLASS, STR)                                        \
 213     case NAME:                                                                \
 214         _CCC_TRY_CODE( !, FUNC,                                               \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218     case NNAME:                                                               \
 219         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 220                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 221                                             (U8*)locinput, TRUE)),            \
 222                           CLASS, STR)
 223 /* Generate the case statements for both locale and non-locale character
 224  * classes in regmatch for classes that don't have special unicode semantics.
 225  * Locales don't use an immediate swash, but an intermediary special locale
 226  * function that is called on the pointer to the current place in the input
 227  * string.  That function will resolve to needing the same swash.  One might
 228  * think that because we don't know what the locale will match, we shouldn't
 229  * check with the swash loading function that it loaded properly; ie, that we
 230  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 231  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 232  * irrelevant here */
 233 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 234                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 235                 NAMEA, NNAMEA, FUNCA,                                         \
 236                 CLASS, STR)                                                   \
 237     case NAMEL:                                                               \
 238         PL_reg_flags |= RF_tainted;                                           \
 239         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 240     case NNAMEL:                                                              \
 241         PL_reg_flags |= RF_tainted;                                           \
 242         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 243                        CLASS, STR)                                            \
 244     case NAMEA:                                                               \
 245         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 246             sayNO;                                                            \
 247         }                                                                     \
 248         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 249         locinput++;                                        \
 250         break;                                                                \
 251     case NNAMEA:                                                              \
 252         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 253             sayNO;                                                            \
 254         }                                                                     \
 255         goto increment_locinput;                                              \
 256     /* Generate the non-locale cases */                                       \
 257     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 258
 259 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 260  * statements to handle separate Unicode semantics nodes */
 261 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 262                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 263                   NAMEU, NNAMEU, FUNCU,                                        \
 264                   NAMEA, NNAMEA, FUNCA,                                        \
 265                   CLASS, STR)                                                  \
 266     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 267             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 268             NAMEA, NNAMEA, FUNCA,                                              \
 269             CLASS, STR)                                                        \
 270     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 271
 272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 273
 274 /* for use after a quantifier and before an EXACT-like node -- japhy */
 275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 276  *
 277  * NOTE that *nothing* that affects backtracking should be in here, specifically
 278  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 279  * node that is in between two EXACT like nodes when ascertaining what the required
 280  * "follow" character is. This should probably be moved to regex compile time
 281  * although it may be done at run time beause of the REF possibility - more
 282  * investigation required. -- demerphq
 283 */
 284 #define JUMPABLE(rn) (      \
 285     OP(rn) == OPEN ||       \
 286     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 287     OP(rn) == EVAL ||   \
 288     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 289     OP(rn) == PLUS || OP(rn) == MINMOD || \
 290     OP(rn) == KEEPS || \
 291     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 292 )
 293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 294
 295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 296
 297 #if 0
 298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 299    we don't need this definition. */
 300 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 301 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 303
 304 #else
 305 /* ... so we use this as its faster. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 307 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 308 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 309 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 310
 311 #endif
 312
 313 /*
 314   Search for mandatory following text node; for lookahead, the text must
 315   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 316 */
 317 #define FIND_NEXT_IMPT(rn) STMT_START { \
 318     while (JUMPABLE(rn)) { \
 319         const OPCODE type = OP(rn); \
 320         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 321             rn = NEXTOPER(NEXTOPER(rn)); \
 322         else if (type == PLUS) \
 323             rn = NEXTOPER(rn); \
 324         else if (type == IFMATCH) \
 325             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 326         else rn += NEXT_OFF(rn); \
 327     } \
 328 } STMT_END
 329
 330
 331 static void restore_pos(pTHX_ void *arg);
 332
 333 #define REGCP_PAREN_ELEMS 3
 334 #define REGCP_OTHER_ELEMS 3
 335 #define REGCP_FRAME_ELEMS 1
 336 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 337  * are needed for the regexp context stack bookkeeping. */
 338
 339 STATIC CHECKPOINT
 340 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 341 {
 342     dVAR;
 343     const int retval = PL_savestack_ix;
 344     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 345     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 346     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 347     I32 p;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     PERL_ARGS_ASSERT_REGCPPUSH;
 351
 352     if (paren_elems_to_push < 0)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 354                    paren_elems_to_push);
 355
 356     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 357         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 358                    " out of range (%lu-%ld)",
 359                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 360
 361     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 362
 363     DEBUG_BUFFERS_r(
 364         if ((int)PL_regsize > (int)parenfloor)
 365             PerlIO_printf(Perl_debug_log,
 366                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 367                 PTR2UV(rex),
 368                 PTR2UV(rex->offs)
 369             );
 370     );
 371     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 372 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 373         SSPUSHINT(rex->offs[p].end);
 374         SSPUSHINT(rex->offs[p].start);
 375         SSPUSHINT(rex->offs[p].start_tmp);
 376         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 377             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 378             (UV)p,
 379             (IV)rex->offs[p].start,
 380             (IV)rex->offs[p].start_tmp,
 381             (IV)rex->offs[p].end
 382         ));
 383     }
 384 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 385     SSPUSHINT(PL_regsize);
 386     SSPUSHINT(rex->lastparen);
 387     SSPUSHINT(rex->lastcloseparen);
 388     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 389
 390     return retval;
 391 }
 392
 393 /* These are needed since we do not localize EVAL nodes: */
 394 #define REGCP_SET(cp)                                           \
 395     DEBUG_STATE_r(                                              \
 396             PerlIO_printf(Perl_debug_log,                       \
 397                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 398                 (IV)PL_savestack_ix));                          \
 399     cp = PL_savestack_ix
 400
 401 #define REGCP_UNWIND(cp)                                        \
 402     DEBUG_STATE_r(                                              \
 403         if (cp != PL_savestack_ix)                              \
 404             PerlIO_printf(Perl_debug_log,                       \
 405                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 406                 (IV)(cp), (IV)PL_savestack_ix));                \
 407     regcpblow(cp)
 408
 409 #define UNWIND_PAREN(lp, lcp)               \
 410     for (n = rex->lastparen; n > lp; n--)   \
 411         rex->offs[n].end = -1;              \
 412     rex->lastparen = n;                     \
 413     rex->lastcloseparen = lcp;
 414
 415
 416 STATIC void
 417 S_regcppop(pTHX_ regexp *rex)
 418 {
 419     dVAR;
 420     UV i;
 421     U32 paren;
 422     GET_RE_DEBUG_FLAGS_DECL;
 423
 424     PERL_ARGS_ASSERT_REGCPPOP;
 425
 426     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 427     i = SSPOPUV;
 428     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 429     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 430     rex->lastcloseparen = SSPOPINT;
 431     rex->lastparen = SSPOPINT;
 432     PL_regsize = SSPOPINT;
 433
 434     i -= REGCP_OTHER_ELEMS;
 435     /* Now restore the parentheses context. */
 436     DEBUG_BUFFERS_r(
 437         if (i || rex->lastparen + 1 <= rex->nparens)
 438             PerlIO_printf(Perl_debug_log,
 439                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 440                 PTR2UV(rex),
 441                 PTR2UV(rex->offs)
 442             );
 443     );
 444     paren = PL_regsize;
 445     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 446         I32 tmps;
 447         rex->offs[paren].start_tmp = SSPOPINT;
 448         rex->offs[paren].start = SSPOPINT;
 449         tmps = SSPOPINT;
 450         if (paren <= rex->lastparen)
 451             rex->offs[paren].end = tmps;
 452         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 453             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 454             (UV)paren,
 455             (IV)rex->offs[paren].start,
 456             (IV)rex->offs[paren].start_tmp,
 457             (IV)rex->offs[paren].end,
 458             (paren > rex->lastparen ? "(skipped)" : ""));
 459         );
 460         paren--;
 461     }
 462 #if 1
 463     /* It would seem that the similar code in regtry()
 464      * already takes care of this, and in fact it is in
 465      * a better location to since this code can #if 0-ed out
 466      * but the code in regtry() is needed or otherwise tests
 467      * requiring null fields (pat.t#187 and split.t#{13,14}
 468      * (as of patchlevel 7877)  will fail.  Then again,
 469      * this code seems to be necessary or otherwise
 470      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 471      * --jhi updated by dapm */
 472     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 473         if (i > PL_regsize)
 474             rex->offs[i].start = -1;
 475         rex->offs[i].end = -1;
 476         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 477             "    \\%"UVuf": %s   ..-1 undeffing\n",
 478             (UV)i,
 479             (i > PL_regsize) ? "-1" : "  "
 480         ));
 481     }
 482 #endif
 483 }
 484
 485 /* restore the parens and associated vars at savestack position ix,
 486  * but without popping the stack */
 487
 488 STATIC void
 489 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 490 {
 491     I32 tmpix = PL_savestack_ix;
 492     PL_savestack_ix = ix;
 493     regcppop(rex);
 494     PL_savestack_ix = tmpix;
 495 }
 496
 497 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 498
 499 /*
 500  * pregexec and friends
 501  */
 502
 503 #ifndef PERL_IN_XSUB_RE
 504 /*
 505  - pregexec - match a regexp against a string
 506  */
 507 I32
 508 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 509          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 510 /* stringarg: the point in the string at which to begin matching */
 511 /* strend:    pointer to null at end of string */
 512 /* strbeg:    real beginning of string */
 513 /* minend:    end of match must be >= minend bytes after stringarg. */
 514 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 515  *            itself is accessed via the pointers above */
 516 /* nosave:    For optimizations. */
 517 {
 518     PERL_ARGS_ASSERT_PREGEXEC;
 519
 520     return
 521         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 522                       nosave ? 0 : REXEC_COPY_STR);
 523 }
 524 #endif
 525
 526 /*
 527  * Need to implement the following flags for reg_anch:
 528  *
 529  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 530  * USE_INTUIT_ML
 531  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 532  * INTUIT_AUTORITATIVE_ML
 533  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 534  * INTUIT_ONCE_ML
 535  *
 536  * Another flag for this function: SECOND_TIME (so that float substrs
 537  * with giant delta may be not rechecked).
 538  */
 539
 540 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 541
 542 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 543    Otherwise, only SvCUR(sv) is used to get strbeg. */
 544
 545 /* XXXX We assume that strpos is strbeg unless sv. */
 546
 547 /* XXXX Some places assume that there is a fixed substring.
 548         An update may be needed if optimizer marks as "INTUITable"
 549         RExen without fixed substrings.  Similarly, it is assumed that
 550         lengths of all the strings are no more than minlen, thus they
 551         cannot come from lookahead.
 552         (Or minlen should take into account lookahead.)
 553   NOTE: Some of this comment is not correct. minlen does now take account
 554   of lookahead/behind. Further research is required. -- demerphq
 555
 556 */
 557
 558 /* A failure to find a constant substring means that there is no need to make
 559    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 560    finding a substring too deep into the string means that less calls to
 561    regtry() should be needed.
 562
 563    REx compiler's optimizer found 4 possible hints:
 564         a) Anchored substring;
 565         b) Fixed substring;
 566         c) Whether we are anchored (beginning-of-line or \G);
 567         d) First node (of those at offset 0) which may distinguish positions;
 568    We use a)b)d) and multiline-part of c), and try to find a position in the
 569    string which does not contradict any of them.
 570  */
 571
 572 /* Most of decisions we do here should have been done at compile time.
 573    The nodes of the REx which we used for the search should have been
 574    deleted from the finite automaton. */
 575
 576 char *
 577 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 578                      char *strend, const U32 flags, re_scream_pos_data *data)
 579 {
 580     dVAR;
 581     struct regexp *const prog = ReANY(rx);
 582     I32 start_shift = 0;
 583     /* Should be nonnegative! */
 584     I32 end_shift   = 0;
 585     char *s;
 586     SV *check;
 587     char *strbeg;
 588     char *t;
 589     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 590     I32 ml_anch;
 591     char *other_last = NULL;    /* other substr checked before this */
 592     char *check_at = NULL;              /* check substr found at this pos */
 593     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 594     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 595     RXi_GET_DECL(prog,progi);
 596 #ifdef DEBUGGING
 597     const char * const i_strpos = strpos;
 598 #endif
 599     GET_RE_DEBUG_FLAGS_DECL;
 600
 601     PERL_ARGS_ASSERT_RE_INTUIT_START;
 602     PERL_UNUSED_ARG(flags);
 603     PERL_UNUSED_ARG(data);
 604
 605     RX_MATCH_UTF8_set(rx,utf8_target);
 606
 607     if (RX_UTF8(rx)) {
 608         PL_reg_flags |= RF_utf8;
 609     }
 610     DEBUG_EXECUTE_r(
 611         debug_start_match(rx, utf8_target, strpos, strend,
 612             sv ? "Guessing start of match in sv for"
 613                : "Guessing start of match in string for");
 614               );
 615
 616     /* CHR_DIST() would be more correct here but it makes things slow. */
 617     if (prog->minlen > strend - strpos) {
 618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 619                               "String too short... [re_intuit_start]\n"));
 620         goto fail;
 621     }
 622
 623     /* XXX we need to pass strbeg as a separate arg: the following is
 624      * guesswork and can be wrong... */
 625     if (sv && SvPOK(sv)) {
 626         char * p   = SvPVX(sv);
 627         STRLEN cur = SvCUR(sv);
 628         if (p <= strpos && strpos < p + cur) {
 629             strbeg = p;
 630             assert(p <= strend && strend <= p + cur);
 631         }
 632         else
 633             strbeg = strend - cur;
 634     }
 635     else
 636         strbeg = strpos;
 637
 638     PL_regeol = strend;
 639     if (utf8_target) {
 640         if (!prog->check_utf8 && prog->check_substr)
 641             to_utf8_substr(prog);
 642         check = prog->check_utf8;
 643     } else {
 644         if (!prog->check_substr && prog->check_utf8) {
 645             if (! to_byte_substr(prog)) {
 646                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 647             }
 648         }
 649         check = prog->check_substr;
 650     }
 651     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 652         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 653                      || ( (prog->extflags & RXf_ANCH_BOL)
 654                           && !multiline ) );    /* Check after \n? */
 655
 656         if (!ml_anch) {
 657           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 658                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 659                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 660                && sv && !SvROK(sv)
 661                && (strpos != strbeg)) {
 662               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 663               goto fail;
 664           }
 665           if (prog->check_offset_min == prog->check_offset_max
 666               && !(prog->extflags & RXf_CANY_SEEN)
 667               && ! multiline)   /* /m can cause \n's to match that aren't
 668                                    accounted for in the string max length.
 669                                    See [perl #115242] */
 670           {
 671             /* Substring at constant offset from beg-of-str... */
 672             I32 slen;
 673
 674             s = HOP3c(strpos, prog->check_offset_min, strend);
 675
 676             if (SvTAIL(check)) {
 677                 slen = SvCUR(check);    /* >= 1 */
 678
 679                 if ( strend - s > slen || strend - s < slen - 1
 680                      || (strend - s == slen && strend[-1] != '\n')) {
 681                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 682                     goto fail_finish;
 683                 }
 684                 /* Now should match s[0..slen-2] */
 685                 slen--;
 686                 if (slen && (*SvPVX_const(check) != *s
 687                              || (slen > 1
 688                                  && memNE(SvPVX_const(check), s, slen)))) {
 689                   report_neq:
 690                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 691                     goto fail_finish;
 692                 }
 693             }
 694             else if (*SvPVX_const(check) != *s
 695                      || ((slen = SvCUR(check)) > 1
 696                          && memNE(SvPVX_const(check), s, slen)))
 697                 goto report_neq;
 698             check_at = s;
 699             goto success_at_start;
 700           }
 701         }
 702         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 703         s = strpos;
 704         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 705         end_shift = prog->check_end_shift;
 706
 707         if (!ml_anch) {
 708             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 709                                          - (SvTAIL(check) != 0);
 710             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 711
 712             if (end_shift < eshift)
 713                 end_shift = eshift;
 714         }
 715     }
 716     else {                              /* Can match at random position */
 717         ml_anch = 0;
 718         s = strpos;
 719         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 720         end_shift = prog->check_end_shift;
 721
 722         /* end shift should be non negative here */
 723     }
 724
 725 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 726     if (end_shift < 0)
 727         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 728                    (IV)end_shift, RX_PRECOMP(prog));
 729 #endif
 730
 731   restart:
 732     /* Find a possible match in the region s..strend by looking for
 733        the "check" substring in the region corrected by start/end_shift. */
 734
 735     {
 736         I32 srch_start_shift = start_shift;
 737         I32 srch_end_shift = end_shift;
 738         U8* start_point;
 739         U8* end_point;
 740         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 741             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 742             srch_start_shift = strbeg - s;
 743         }
 744     DEBUG_OPTIMISE_MORE_r({
 745         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 746             (IV)prog->check_offset_min,
 747             (IV)srch_start_shift,
 748             (IV)srch_end_shift,
 749             (IV)prog->check_end_shift);
 750     });
 751
 752         if (prog->extflags & RXf_CANY_SEEN) {
 753             start_point= (U8*)(s + srch_start_shift);
 754             end_point= (U8*)(strend - srch_end_shift);
 755         } else {
 756             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 757             end_point= HOP3(strend, -srch_end_shift, strbeg);
 758         }
 759         DEBUG_OPTIMISE_MORE_r({
 760             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 761                 (int)(end_point - start_point),
 762                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 763                 start_point);
 764         });
 765
 766         s = fbm_instr( start_point, end_point,
 767                       check, multiline ? FBMrf_MULTILINE : 0);
 768     }
 769     /* Update the count-of-usability, remove useless subpatterns,
 770         unshift s.  */
 771
 772     DEBUG_EXECUTE_r({
 773         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 774             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 775         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 776                           (s ? "Found" : "Did not find"),
 777             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 778                 ? "anchored" : "floating"),
 779             quoted,
 780             RE_SV_TAIL(check),
 781             (s ? " at offset " : "...\n") );
 782     });
 783
 784     if (!s)
 785         goto fail_finish;
 786     /* Finish the diagnostic message */
 787     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 788
 789     /* XXX dmq: first branch is for positive lookbehind...
 790        Our check string is offset from the beginning of the pattern.
 791        So we need to do any stclass tests offset forward from that
 792        point. I think. :-(
 793      */
 794
 795
 796
 797     check_at=s;
 798
 799
 800     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 801        Start with the other substr.
 802        XXXX no SCREAM optimization yet - and a very coarse implementation
 803        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 804                 *always* match.  Probably should be marked during compile...
 805        Probably it is right to do no SCREAM here...
 806      */
 807
 808     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 809                 : (prog->float_substr && prog->anchored_substr))
 810     {
 811         /* Take into account the "other" substring. */
 812         /* XXXX May be hopelessly wrong for UTF... */
 813         if (!other_last)
 814             other_last = strpos;
 815         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 816           do_other_anchored:
 817             {
 818                 char * const last = HOP3c(s, -start_shift, strbeg);
 819                 char *last1, *last2;
 820                 char * const saved_s = s;
 821                 SV* must;
 822
 823                 t = s - prog->check_offset_max;
 824                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 825                     && (!utf8_target
 826                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 827                             && t > strpos)))
 828                     NOOP;
 829                 else
 830                     t = strpos;
 831                 t = HOP3c(t, prog->anchored_offset, strend);
 832                 if (t < other_last)     /* These positions already checked */
 833                     t = other_last;
 834                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 835                 if (last < last1)
 836                     last1 = last;
 837                 /* XXXX It is not documented what units *_offsets are in.
 838                    We assume bytes, but this is clearly wrong.
 839                    Meaning this code needs to be carefully reviewed for errors.
 840                    dmq.
 841                   */
 842
 843                 /* On end-of-str: see comment below. */
 844                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 845                 if (must == &PL_sv_undef) {
 846                     s = (char*)NULL;
 847                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 848                 }
 849                 else
 850                     s = fbm_instr(
 851                         (unsigned char*)t,
 852                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 853                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 854                         must,
 855                         multiline ? FBMrf_MULTILINE : 0
 856                     );
 857                 DEBUG_EXECUTE_r({
 858                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 859                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 860                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 861                         (s ? "Found" : "Contradicts"),
 862                         quoted, RE_SV_TAIL(must));
 863                 });
 864
 865
 866                 if (!s) {
 867                     if (last1 >= last2) {
 868                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 869                                                 ", giving up...\n"));
 870                         goto fail_finish;
 871                     }
 872                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 873                         ", trying floating at offset %ld...\n",
 874                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 875                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 876                     s = HOP3c(last, 1, strend);
 877                     goto restart;
 878                 }
 879                 else {
 880                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 881                           (long)(s - i_strpos)));
 882                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 883                     other_last = HOP3c(s, 1, strend);
 884                     s = saved_s;
 885                     if (t == strpos)
 886                         goto try_at_start;
 887                     goto try_at_offset;
 888                 }
 889             }
 890         }
 891         else {          /* Take into account the floating substring. */
 892             char *last, *last1;
 893             char * const saved_s = s;
 894             SV* must;
 895
 896             t = HOP3c(s, -start_shift, strbeg);
 897             last1 = last =
 898                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 899             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 900                 last = HOP3c(t, prog->float_max_offset, strend);
 901             s = HOP3c(t, prog->float_min_offset, strend);
 902             if (s < other_last)
 903                 s = other_last;
 904  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 905             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 906             /* fbm_instr() takes into account exact value of end-of-str
 907                if the check is SvTAIL(ed).  Since false positives are OK,
 908                and end-of-str is not later than strend we are OK. */
 909             if (must == &PL_sv_undef) {
 910                 s = (char*)NULL;
 911                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 912             }
 913             else
 914                 s = fbm_instr((unsigned char*)s,
 915                               (unsigned char*)last + SvCUR(must)
 916                                   - (SvTAIL(must)!=0),
 917                               must, multiline ? FBMrf_MULTILINE : 0);
 918             DEBUG_EXECUTE_r({
 919                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 920                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 921                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 922                     (s ? "Found" : "Contradicts"),
 923                     quoted, RE_SV_TAIL(must));
 924             });
 925             if (!s) {
 926                 if (last1 == last) {
 927                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 928                                             ", giving up...\n"));
 929                     goto fail_finish;
 930                 }
 931                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 932                     ", trying anchored starting at offset %ld...\n",
 933                     (long)(saved_s + 1 - i_strpos)));
 934                 other_last = last;
 935                 s = HOP3c(t, 1, strend);
 936                 goto restart;
 937             }
 938             else {
 939                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 940                       (long)(s - i_strpos)));
 941                 other_last = s; /* Fix this later. --Hugo */
 942                 s = saved_s;
 943                 if (t == strpos)
 944                     goto try_at_start;
 945                 goto try_at_offset;
 946             }
 947         }
 948     }
 949
 950
 951     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 952
 953     DEBUG_OPTIMISE_MORE_r(
 954         PerlIO_printf(Perl_debug_log,
 955             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 956             (IV)prog->check_offset_min,
 957             (IV)prog->check_offset_max,
 958             (IV)(s-strpos),
 959             (IV)(t-strpos),
 960             (IV)(t-s),
 961             (IV)(strend-strpos)
 962         )
 963     );
 964
 965     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 966         && (!utf8_target
 967             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 968                  && t > strpos)))
 969     {
 970         /* Fixed substring is found far enough so that the match
 971            cannot start at strpos. */
 972       try_at_offset:
 973         if (ml_anch && t[-1] != '\n') {
 974             /* Eventually fbm_*() should handle this, but often
 975                anchored_offset is not 0, so this check will not be wasted. */
 976             /* XXXX In the code below we prefer to look for "^" even in
 977                presence of anchored substrings.  And we search even
 978                beyond the found float position.  These pessimizations
 979                are historical artefacts only.  */
 980           find_anchor:
 981             while (t < strend - prog->minlen) {
 982                 if (*t == '\n') {
 983                     if (t < check_at - prog->check_offset_min) {
 984                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 985                             /* Since we moved from the found position,
 986                                we definitely contradict the found anchored
 987                                substr.  Due to the above check we do not
 988                                contradict "check" substr.
 989                                Thus we can arrive here only if check substr
 990                                is float.  Redo checking for "other"=="fixed".
 991                              */
 992                             strpos = t + 1;
 993                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 994                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 995                             goto do_other_anchored;
 996                         }
 997                         /* We don't contradict the found floating substring. */
 998                         /* XXXX Why not check for STCLASS? */
 999                         s = t + 1;
1000                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1001                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1002                         goto set_useful;
1003                     }
1004                     /* Position contradicts check-string */
1005                     /* XXXX probably better to look for check-string
1006                        than for "\n", so one should lower the limit for t? */
1007                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1008                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1009                     other_last = strpos = s = t + 1;
1010                     goto restart;
1011                 }
1012                 t++;
1013             }
1014             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1015                         PL_colors[0], PL_colors[1]));
1016             goto fail_finish;
1017         }
1018         else {
1019             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1020                         PL_colors[0], PL_colors[1]));
1021         }
1022         s = t;
1023       set_useful:
1024         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1025     }
1026     else {
1027         /* The found string does not prohibit matching at strpos,
1028            - no optimization of calling REx engine can be performed,
1029            unless it was an MBOL and we are not after MBOL,
1030            or a future STCLASS check will fail this. */
1031       try_at_start:
1032         /* Even in this situation we may use MBOL flag if strpos is offset
1033            wrt the start of the string. */
1034         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1035             && (strpos != strbeg) && strpos[-1] != '\n'
1036             /* May be due to an implicit anchor of m{.*foo}  */
1037             && !(prog->intflags & PREGf_IMPLICIT))
1038         {
1039             t = strpos;
1040             goto find_anchor;
1041         }
1042         DEBUG_EXECUTE_r( if (ml_anch)
1043             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1044                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1045         );
1046       success_at_start:
1047         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1048             && (utf8_target ? (
1049                 prog->check_utf8                /* Could be deleted already */
1050                 && --BmUSEFUL(prog->check_utf8) < 0
1051                 && (prog->check_utf8 == prog->float_utf8)
1052             ) : (
1053                 prog->check_substr              /* Could be deleted already */
1054                 && --BmUSEFUL(prog->check_substr) < 0
1055                 && (prog->check_substr == prog->float_substr)
1056             )))
1057         {
1058             /* If flags & SOMETHING - do not do it many times on the same match */
1059             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1060             /* XXX Does the destruction order has to change with utf8_target? */
1061             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1062             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1063             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1064             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1065             check = NULL;                       /* abort */
1066             s = strpos;
1067             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1068                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1069             if (prog->intflags & PREGf_IMPLICIT)
1070                 prog->extflags &= ~RXf_ANCH_MBOL;
1071             /* XXXX This is a remnant of the old implementation.  It
1072                     looks wasteful, since now INTUIT can use many
1073                     other heuristics. */
1074             prog->extflags &= ~RXf_USE_INTUIT;
1075             /* XXXX What other flags might need to be cleared in this branch? */
1076         }
1077         else
1078             s = strpos;
1079     }
1080
1081     /* Last resort... */
1082     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1083     /* trie stclasses are too expensive to use here, we are better off to
1084        leave it to regmatch itself */
1085     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1086         /* minlen == 0 is possible if regstclass is \b or \B,
1087            and the fixed substr is ''$.
1088            Since minlen is already taken into account, s+1 is before strend;
1089            accidentally, minlen >= 1 guaranties no false positives at s + 1
1090            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1091            regstclass does not come from lookahead...  */
1092         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1093            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1094         const U8* const str = (U8*)STRING(progi->regstclass);
1095         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1096                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1097                     : 1);
1098         char * endpos;
1099         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1100             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1101         else if (prog->float_substr || prog->float_utf8)
1102             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1103         else
1104             endpos= strend;
1105
1106         if (checked_upto < s)
1107            checked_upto = s;
1108         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1109                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1110
1111         t = s;
1112         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1113         if (s) {
1114             checked_upto = s;
1115         } else {
1116 #ifdef DEBUGGING
1117             const char *what = NULL;
1118 #endif
1119             if (endpos == strend) {
1120                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1121                                 "Could not match STCLASS...\n") );
1122                 goto fail;
1123             }
1124             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125                                    "This position contradicts STCLASS...\n") );
1126             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1127                 goto fail;
1128             checked_upto = HOPBACKc(endpos, start_shift);
1129             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1131             /* Contradict one of substrings */
1132             if (prog->anchored_substr || prog->anchored_utf8) {
1133                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1134                     DEBUG_EXECUTE_r( what = "anchored" );
1135                   hop_and_restart:
1136                     s = HOP3c(t, 1, strend);
1137                     if (s + start_shift + end_shift > strend) {
1138                         /* XXXX Should be taken into account earlier? */
1139                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1140                                                "Could not match STCLASS...\n") );
1141                         goto fail;
1142                     }
1143                     if (!check)
1144                         goto giveup;
1145                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146                                 "Looking for %s substr starting at offset %ld...\n",
1147                                  what, (long)(s + start_shift - i_strpos)) );
1148                     goto restart;
1149                 }
1150                 /* Have both, check_string is floating */
1151                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1152                     goto retry_floating_check;
1153                 /* Recheck anchored substring, but not floating... */
1154                 s = check_at;
1155                 if (!check)
1156                     goto giveup;
1157                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1158                           "Looking for anchored substr starting at offset %ld...\n",
1159                           (long)(other_last - i_strpos)) );
1160                 goto do_other_anchored;
1161             }
1162             /* Another way we could have checked stclass at the
1163                current position only: */
1164             if (ml_anch) {
1165                 s = t = t + 1;
1166                 if (!check)
1167                     goto giveup;
1168                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1169                           "Looking for /%s^%s/m starting at offset %ld...\n",
1170                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1171                 goto try_at_offset;
1172             }
1173             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1174                 goto fail;
1175             /* Check is floating substring. */
1176           retry_floating_check:
1177             t = check_at - start_shift;
1178             DEBUG_EXECUTE_r( what = "floating" );
1179             goto hop_and_restart;
1180         }
1181         if (t != s) {
1182             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1183                         "By STCLASS: moving %ld --> %ld\n",
1184                                   (long)(t - i_strpos), (long)(s - i_strpos))
1185                    );
1186         }
1187         else {
1188             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1189                                   "Does not contradict STCLASS...\n");
1190                    );
1191         }
1192     }
1193   giveup:
1194     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1195                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1196                           PL_colors[5], (long)(s - i_strpos)) );
1197     return s;
1198
1199   fail_finish:                          /* Substring not found */
1200     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1201         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1202   fail:
1203     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1204                           PL_colors[4], PL_colors[5]));
1205     return NULL;
1206 }
1207
1208 #define DECL_TRIE_TYPE(scan) \
1209     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1210                     trie_type = ((scan->flags == EXACT) \
1211                               ? (utf8_target ? trie_utf8 : trie_plain) \
1212                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1213
1214 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1215 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1216     STRLEN skiplen;                                                                 \
1217     switch (trie_type) {                                                            \
1218     case trie_utf8_fold:                                                            \
1219         if ( foldlen>0 ) {                                                          \
1220             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1221             foldlen -= len;                                                         \
1222             uscan += len;                                                           \
1223             len=0;                                                                  \
1224         } else {                                                                    \
1225             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1226             len = UTF8SKIP(uc);                                                     \
1227             skiplen = UNISKIP( uvc );                                               \
1228             foldlen -= skiplen;                                                     \
1229             uscan = foldbuf + skiplen;                                              \
1230         }                                                                           \
1231         break;                                                                      \
1232     case trie_latin_utf8_fold:                                                      \
1233         if ( foldlen>0 ) {                                                          \
1234             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1235             foldlen -= len;                                                         \
1236             uscan += len;                                                           \
1237             len=0;                                                                  \
1238         } else {                                                                    \
1239             len = 1;                                                                \
1240             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1241             skiplen = UNISKIP( uvc );                                               \
1242             foldlen -= skiplen;                                                     \
1243             uscan = foldbuf + skiplen;                                              \
1244         }                                                                           \
1245         break;                                                                      \
1246     case trie_utf8:                                                                 \
1247         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1248         break;                                                                      \
1249     case trie_plain:                                                                \
1250         uvc = (UV)*uc;                                                              \
1251         len = 1;                                                                    \
1252     }                                                                               \
1253     if (uvc < 256) {                                                                \
1254         charid = trie->charmap[ uvc ];                                              \
1255     }                                                                               \
1256     else {                                                                          \
1257         charid = 0;                                                                 \
1258         if (widecharmap) {                                                          \
1259             SV** const svpp = hv_fetch(widecharmap,                                 \
1260                         (char*)&uvc, sizeof(UV), 0);                                \
1261             if (svpp)                                                               \
1262                 charid = (U16)SvIV(*svpp);                                          \
1263         }                                                                           \
1264     }                                                                               \
1265 } STMT_END
1266
1267 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1268 STMT_START {                                              \
1269     while (s <= e) {                                      \
1270         if ( (CoNd)                                       \
1271              && (ln == 1 || folder(s, pat_string, ln))    \
1272              && (!reginfo || regtry(reginfo, &s)) )       \
1273             goto got_it;                                  \
1274         s++;                                              \
1275     }                                                     \
1276 } STMT_END
1277
1278 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1279 STMT_START {                                          \
1280     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1281         CoDe                                          \
1282         s += uskip;                                   \
1283     }                                                 \
1284 } STMT_END
1285
1286 #define REXEC_FBC_SCAN(CoDe)                          \
1287 STMT_START {                                          \
1288     while (s < strend) {                              \
1289         CoDe                                          \
1290         s++;                                          \
1291     }                                                 \
1292 } STMT_END
1293
1294 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1295 REXEC_FBC_UTF8_SCAN(                                  \
1296     if (CoNd) {                                       \
1297         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1298             goto got_it;                              \
1299         else                                          \
1300             tmp = doevery;                            \
1301     }                                                 \
1302     else                                              \
1303         tmp = 1;                                      \
1304 )
1305
1306 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1307 REXEC_FBC_SCAN(                                       \
1308     if (CoNd) {                                       \
1309         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1310             goto got_it;                              \
1311         else                                          \
1312             tmp = doevery;                            \
1313     }                                                 \
1314     else                                              \
1315         tmp = 1;                                      \
1316 )
1317
1318 #define REXEC_FBC_TRYIT               \
1319 if ((!reginfo || regtry(reginfo, &s))) \
1320     goto got_it
1321
1322 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1323     if (utf8_target) {                                             \
1324         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1325     }                                                          \
1326     else {                                                     \
1327         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1328     }
1329
1330 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1331     if (utf8_target) {                                             \
1332         UtFpReLoAd;                                            \
1333         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1334     }                                                          \
1335     else {                                                     \
1336         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1337     }
1338
1339 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1340     PL_reg_flags |= RF_tainted;                                \
1341     if (utf8_target) {                                             \
1342         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1343     }                                                          \
1344     else {                                                     \
1345         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1346     }
1347
1348 #define DUMP_EXEC_POS(li,s,doutf8) \
1349     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1350
1351
1352 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1353         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1354         tmp = TEST_NON_UTF8(tmp);                                              \
1355         REXEC_FBC_UTF8_SCAN(                                                   \
1356             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1357                 tmp = !tmp;                                                    \
1358                 IF_SUCCESS;                                                    \
1359             }                                                                  \
1360             else {                                                             \
1361                 IF_FAIL;                                                       \
1362             }                                                                  \
1363         );                                                                     \
1364
1365 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1366         if (s == PL_bostr) {                                                   \
1367             tmp = '\n';                                                        \
1368         }                                                                      \
1369         else {                                                                 \
1370             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1371             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1372         }                                                                      \
1373         tmp = TeSt1_UtF8;                                                      \
1374         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1375         REXEC_FBC_UTF8_SCAN(                                                   \
1376             if (tmp == ! (TeSt2_UtF8)) { \
1377                 tmp = !tmp;                                                    \
1378                 IF_SUCCESS;                                                    \
1379             }                                                                  \
1380             else {                                                             \
1381                 IF_FAIL;                                                       \
1382             }                                                                  \
1383         );                                                                     \
1384
1385 /* The only difference between the BOUND and NBOUND cases is that
1386  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1387  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1388  * with the other one being empty */
1389 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1390     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1391
1392 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1393     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1394
1395 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1396     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1397
1398 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1400
1401
1402 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1403  * be passed in completely with the variable name being tested, which isn't
1404  * such a clean interface, but this is easier to read than it was before.  We
1405  * are looking for the boundary (or non-boundary between a word and non-word
1406  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1407  * must be different.  Find the "wordness" of the character just prior to this
1408  * one, and compare it with the wordness of this one.  If they differ, we have
1409  * a boundary.  At the beginning of the string, pretend that the previous
1410  * character was a new-line */
1411 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1412     if (utf8_target) {                                                         \
1413                 UTF8_CODE \
1414     }                                                                          \
1415     else {  /* Not utf8 */                                                     \
1416         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1417         tmp = TEST_NON_UTF8(tmp);                                              \
1418         REXEC_FBC_SCAN(                                                        \
1419             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1420                 tmp = !tmp;                                                    \
1421                 IF_SUCCESS;                                                    \
1422             }                                                                  \
1423             else {                                                             \
1424                 IF_FAIL;                                                       \
1425             }                                                                  \
1426         );                                                                     \
1427     }                                                                          \
1428     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1429         goto got_it;
1430
1431 /* We know what class REx starts with.  Try to find this position... */
1432 /* if reginfo is NULL, its a dryrun */
1433 /* annoyingly all the vars in this routine have different names from their counterparts
1434    in regmatch. /grrr */
1435
1436 STATIC char *
1437 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1438     const char *strend, regmatch_info *reginfo)
1439 {
1440         dVAR;
1441         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1442         char *pat_string;   /* The pattern's exactish string */
1443         char *pat_end;      /* ptr to end char of pat_string */
1444         re_fold_t folder;       /* Function for computing non-utf8 folds */
1445         const U8 *fold_array;   /* array for folding ords < 256 */
1446         STRLEN ln;
1447         STRLEN lnc;
1448         STRLEN uskip;
1449         U8 c1;
1450         U8 c2;
1451         char *e;
1452         I32 tmp = 1;    /* Scratch variable? */
1453         const bool utf8_target = PL_reg_match_utf8;
1454         UV utf8_fold_flags = 0;
1455         RXi_GET_DECL(prog,progi);
1456
1457         PERL_ARGS_ASSERT_FIND_BYCLASS;
1458
1459         /* We know what class it must start with. */
1460         switch (OP(c)) {
1461         case ANYOF:
1462             if (utf8_target) {
1463                 REXEC_FBC_UTF8_CLASS_SCAN(
1464                           reginclass(prog, c, (U8*)s, utf8_target));
1465             }
1466             else {
1467                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1468             }
1469             break;
1470         case CANY:
1471             REXEC_FBC_SCAN(
1472                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1473                     goto got_it;
1474                 else
1475                     tmp = doevery;
1476             );
1477             break;
1478
1479         case EXACTFA:
1480             if (UTF_PATTERN || utf8_target) {
1481                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1482                 goto do_exactf_utf8;
1483             }
1484             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1485             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1486             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1487
1488         case EXACTF:
1489             if (utf8_target) {
1490
1491                 /* regcomp.c already folded this if pattern is in UTF-8 */
1492                 utf8_fold_flags = 0;
1493                 goto do_exactf_utf8;
1494             }
1495             fold_array = PL_fold;
1496             folder = foldEQ;
1497             goto do_exactf_non_utf8;
1498
1499         case EXACTFL:
1500             if (UTF_PATTERN || utf8_target) {
1501                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1502                 goto do_exactf_utf8;
1503             }
1504             fold_array = PL_fold_locale;
1505             folder = foldEQ_locale;
1506             goto do_exactf_non_utf8;
1507
1508         case EXACTFU_SS:
1509             if (UTF_PATTERN) {
1510                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1511             }
1512             goto do_exactf_utf8;
1513
1514         case EXACTFU_TRICKYFOLD:
1515         case EXACTFU:
1516             if (UTF_PATTERN || utf8_target) {
1517                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1518                 goto do_exactf_utf8;
1519             }
1520
1521             /* Any 'ss' in the pattern should have been replaced by regcomp,
1522              * so we don't have to worry here about this single special case
1523              * in the Latin1 range */
1524             fold_array = PL_fold_latin1;
1525             folder = foldEQ_latin1;
1526
1527             /* FALL THROUGH */
1528
1529         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1530                                are no glitches with fold-length differences
1531                                between the target string and pattern */
1532
1533             /* The idea in the non-utf8 EXACTF* cases is to first find the
1534              * first character of the EXACTF* node and then, if necessary,
1535              * case-insensitively compare the full text of the node.  c1 is the
1536              * first character.  c2 is its fold.  This logic will not work for
1537              * Unicode semantics and the german sharp ss, which hence should
1538              * not be compiled into a node that gets here. */
1539             pat_string = STRING(c);
1540             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1541
1542             /* We know that we have to match at least 'ln' bytes (which is the
1543              * same as characters, since not utf8).  If we have to match 3
1544              * characters, and there are only 2 availabe, we know without
1545              * trying that it will fail; so don't start a match past the
1546              * required minimum number from the far end */
1547             e = HOP3c(strend, -((I32)ln), s);
1548
1549             if (!reginfo && e < s) {
1550                 e = s;                  /* Due to minlen logic of intuit() */
1551             }
1552
1553             c1 = *pat_string;
1554             c2 = fold_array[c1];
1555             if (c1 == c2) { /* If char and fold are the same */
1556                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1557             }
1558             else {
1559                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1560             }
1561             break;
1562
1563         do_exactf_utf8:
1564         {
1565             unsigned expansion;
1566
1567
1568             /* If one of the operands is in utf8, we can't use the simpler
1569              * folding above, due to the fact that many different characters
1570              * can have the same fold, or portion of a fold, or different-
1571              * length fold */
1572             pat_string = STRING(c);
1573             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1574             pat_end = pat_string + ln;
1575             lnc = (UTF_PATTERN) /* length to match in characters */
1576                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1577                     : ln;
1578
1579             /* We have 'lnc' characters to match in the pattern, but because of
1580              * multi-character folding, each character in the target can match
1581              * up to 3 characters (Unicode guarantees it will never exceed
1582              * this) if it is utf8-encoded; and up to 2 if not (based on the
1583              * fact that the Latin 1 folds are already determined, and the
1584              * only multi-char fold in that range is the sharp-s folding to
1585              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1586              * string character.  Adjust lnc accordingly, rounding up, so that
1587              * if we need to match at least 4+1/3 chars, that really is 5. */
1588             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1589             lnc = (lnc + expansion - 1) / expansion;
1590
1591             /* As in the non-UTF8 case, if we have to match 3 characters, and
1592              * only 2 are left, it's guaranteed to fail, so don't start a
1593              * match that would require us to go beyond the end of the string
1594              */
1595             e = HOP3c(strend, -((I32)lnc), s);
1596
1597             if (!reginfo && e < s) {
1598                 e = s;                  /* Due to minlen logic of intuit() */
1599             }
1600
1601             /* XXX Note that we could recalculate e to stop the loop earlier,
1602              * as the worst case expansion above will rarely be met, and as we
1603              * go along we would usually find that e moves further to the left.
1604              * This would happen only after we reached the point in the loop
1605              * where if there were no expansion we should fail.  Unclear if
1606              * worth the expense */
1607
1608             while (s <= e) {
1609                 char *my_strend= (char *)strend;
1610                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1611                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1612                     && (!reginfo || regtry(reginfo, &s)) )
1613                 {
1614                     goto got_it;
1615                 }
1616                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1617             }
1618             break;
1619         }
1620         case BOUNDL:
1621             PL_reg_flags |= RF_tainted;
1622             FBC_BOUND(isALNUM_LC,
1623                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1624                       isALNUM_LC_utf8((U8*)s));
1625             break;
1626         case NBOUNDL:
1627             PL_reg_flags |= RF_tainted;
1628             FBC_NBOUND(isALNUM_LC,
1629                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1630                        isALNUM_LC_utf8((U8*)s));
1631             break;
1632         case BOUND:
1633             FBC_BOUND(isWORDCHAR,
1634                       isALNUM_uni(tmp),
1635                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1636             break;
1637         case BOUNDA:
1638             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1639                              isWORDCHAR_A(tmp),
1640                              isWORDCHAR_A((U8*)s));
1641             break;
1642         case NBOUND:
1643             FBC_NBOUND(isWORDCHAR,
1644                        isALNUM_uni(tmp),
1645                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1646             break;
1647         case NBOUNDA:
1648             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1649                               isWORDCHAR_A(tmp),
1650                               isWORDCHAR_A((U8*)s));
1651             break;
1652         case BOUNDU:
1653             FBC_BOUND(isWORDCHAR_L1,
1654                       isALNUM_uni(tmp),
1655                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1656             break;
1657         case NBOUNDU:
1658             FBC_NBOUND(isWORDCHAR_L1,
1659                        isALNUM_uni(tmp),
1660                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1661             break;
1662         case ALNUML:
1663             REXEC_FBC_CSCAN_TAINT(
1664                 isALNUM_LC_utf8((U8*)s),
1665                 isALNUM_LC(*s)
1666             );
1667             break;
1668         case ALNUMU:
1669             REXEC_FBC_CSCAN_PRELOAD(
1670                 LOAD_UTF8_CHARCLASS_ALNUM(),
1671                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1672                 isWORDCHAR_L1((U8) *s)
1673             );
1674             break;
1675         case ALNUM:
1676             REXEC_FBC_CSCAN_PRELOAD(
1677                 LOAD_UTF8_CHARCLASS_ALNUM(),
1678                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1679                 isWORDCHAR((U8) *s)
1680             );
1681             break;
1682         case ALNUMA:
1683             /* Don't need to worry about utf8, as it can match only a single
1684              * byte invariant character */
1685             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1686             break;
1687         case NALNUMU:
1688             REXEC_FBC_CSCAN_PRELOAD(
1689                 LOAD_UTF8_CHARCLASS_ALNUM(),
1690                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1691                 ! isWORDCHAR_L1((U8) *s)
1692             );
1693             break;
1694         case NALNUM:
1695             REXEC_FBC_CSCAN_PRELOAD(
1696                 LOAD_UTF8_CHARCLASS_ALNUM(),
1697                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1698                 ! isALNUM(*s)
1699             );
1700             break;
1701         case NALNUMA:
1702             REXEC_FBC_CSCAN(
1703                 !isWORDCHAR_A(*s),
1704                 !isWORDCHAR_A(*s)
1705             );
1706             break;
1707         case NALNUML:
1708             REXEC_FBC_CSCAN_TAINT(
1709                 !isALNUM_LC_utf8((U8*)s),
1710                 !isALNUM_LC(*s)
1711             );
1712             break;
1713         case SPACEU:
1714             REXEC_FBC_CSCAN(
1715                 is_XPERLSPACE_utf8(s),
1716                 isSPACE_L1((U8) *s)
1717             );
1718             break;
1719         case SPACE:
1720             REXEC_FBC_CSCAN(
1721                 is_XPERLSPACE_utf8(s),
1722                 isSPACE((U8) *s)
1723             );
1724             break;
1725         case SPACEA:
1726             /* Don't need to worry about utf8, as it can match only a single
1727              * byte invariant character */
1728             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1729             break;
1730         case SPACEL:
1731             REXEC_FBC_CSCAN_TAINT(
1732                 isSPACE_LC_utf8((U8*)s),
1733                 isSPACE_LC(*s)
1734             );
1735             break;
1736         case NSPACEU:
1737             REXEC_FBC_CSCAN(
1738                 ! is_XPERLSPACE_utf8(s),
1739                 ! isSPACE_L1((U8) *s)
1740             );
1741             break;
1742         case NSPACE:
1743             REXEC_FBC_CSCAN(
1744                 ! is_XPERLSPACE_utf8(s),
1745                 ! isSPACE((U8) *s)
1746             );
1747             break;
1748         case NSPACEA:
1749             REXEC_FBC_CSCAN(
1750                 !isSPACE_A(*s),
1751                 !isSPACE_A(*s)
1752             );
1753             break;
1754         case NSPACEL:
1755             REXEC_FBC_CSCAN_TAINT(
1756                 !isSPACE_LC_utf8((U8*)s),
1757                 !isSPACE_LC(*s)
1758             );
1759             break;
1760         case DIGIT:
1761             REXEC_FBC_CSCAN_PRELOAD(
1762                 LOAD_UTF8_CHARCLASS_DIGIT(),
1763                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1764                 isDIGIT(*s)
1765             );
1766             break;
1767         case DIGITA:
1768             /* Don't need to worry about utf8, as it can match only a single
1769              * byte invariant character */
1770             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1771             break;
1772         case DIGITL:
1773             REXEC_FBC_CSCAN_TAINT(
1774                 isDIGIT_LC_utf8((U8*)s),
1775                 isDIGIT_LC(*s)
1776             );
1777             break;
1778         case NDIGIT:
1779             REXEC_FBC_CSCAN_PRELOAD(
1780                 LOAD_UTF8_CHARCLASS_DIGIT(),
1781                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1782                 !isDIGIT(*s)
1783             );
1784             break;
1785         case NDIGITA:
1786             REXEC_FBC_CSCAN(
1787                 !isDIGIT_A(*s),
1788                 !isDIGIT_A(*s)
1789             );
1790             break;
1791         case NDIGITL:
1792             REXEC_FBC_CSCAN_TAINT(
1793                 !isDIGIT_LC_utf8((U8*)s),
1794                 !isDIGIT_LC(*s)
1795             );
1796             break;
1797         case LNBREAK:
1798             REXEC_FBC_CSCAN(
1799                 is_LNBREAK_utf8_safe(s, strend),
1800                 is_LNBREAK_latin1_safe(s, strend)
1801             );
1802             break;
1803         case VERTWS:
1804             REXEC_FBC_CSCAN(
1805                 is_VERTWS_utf8_safe(s, strend),
1806                 is_VERTWS_latin1_safe(s, strend)
1807             );
1808             break;
1809         case NVERTWS:
1810             REXEC_FBC_CSCAN(
1811                 !is_VERTWS_utf8_safe(s, strend),
1812                 !is_VERTWS_latin1_safe(s, strend)
1813             );
1814             break;
1815         case HORIZWS:
1816             REXEC_FBC_CSCAN(
1817                 is_HORIZWS_utf8_safe(s, strend),
1818                 is_HORIZWS_latin1_safe(s, strend)
1819             );
1820             break;
1821         case NHORIZWS:
1822             REXEC_FBC_CSCAN(
1823                 !is_HORIZWS_utf8_safe(s, strend),
1824                 !is_HORIZWS_latin1_safe(s, strend)
1825             );
1826             break;
1827         case POSIXA:
1828             /* Don't need to worry about utf8, as it can match only a single
1829             * byte invariant character.  The flag in this node type is the
1830             * class number to pass to _generic_isCC() to build a mask for
1831             * searching in PL_charclass[] */
1832             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1833             break;
1834         case NPOSIXA:
1835             REXEC_FBC_CSCAN(
1836                 !_generic_isCC_A(*s, FLAGS(c)),
1837                 !_generic_isCC_A(*s, FLAGS(c))
1838             );
1839             break;
1840
1841         case AHOCORASICKC:
1842         case AHOCORASICK:
1843             {
1844                 DECL_TRIE_TYPE(c);
1845                 /* what trie are we using right now */
1846                 reg_ac_data *aho
1847                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1848                 reg_trie_data *trie
1849                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1850                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1851
1852                 const char *last_start = strend - trie->minlen;
1853 #ifdef DEBUGGING
1854                 const char *real_start = s;
1855 #endif
1856                 STRLEN maxlen = trie->maxlen;
1857                 SV *sv_points;
1858                 U8 **points; /* map of where we were in the input string
1859                                 when reading a given char. For ASCII this
1860                                 is unnecessary overhead as the relationship
1861                                 is always 1:1, but for Unicode, especially
1862                                 case folded Unicode this is not true. */
1863                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1864                 U8 *bitmap=NULL;
1865
1866
1867                 GET_RE_DEBUG_FLAGS_DECL;
1868
1869                 /* We can't just allocate points here. We need to wrap it in
1870                  * an SV so it gets freed properly if there is a croak while
1871                  * running the match */
1872                 ENTER;
1873                 SAVETMPS;
1874                 sv_points=newSV(maxlen * sizeof(U8 *));
1875                 SvCUR_set(sv_points,
1876                     maxlen * sizeof(U8 *));
1877                 SvPOK_on(sv_points);
1878                 sv_2mortal(sv_points);
1879                 points=(U8**)SvPV_nolen(sv_points );
1880                 if ( trie_type != trie_utf8_fold
1881                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882                 {
1883                     if (trie->bitmap)
1884                         bitmap=(U8*)trie->bitmap;
1885                     else
1886                         bitmap=(U8*)ANYOF_BITMAP(c);
1887                 }
1888                 /* this is the Aho-Corasick algorithm modified a touch
1889                    to include special handling for long "unknown char"
1890                    sequences. The basic idea being that we use AC as long
1891                    as we are dealing with a possible matching char, when
1892                    we encounter an unknown char (and we have not encountered
1893                    an accepting state) we scan forward until we find a legal
1894                    starting char.
1895                    AC matching is basically that of trie matching, except
1896                    that when we encounter a failing transition, we fall back
1897                    to the current states "fail state", and try the current char
1898                    again, a process we repeat until we reach the root state,
1899                    state 1, or a legal transition. If we fail on the root state
1900                    then we can either terminate if we have reached an accepting
1901                    state previously, or restart the entire process from the beginning
1902                    if we have not.
1903
1904                  */
1905                 while (s <= last_start) {
1906                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1907                     U8 *uc = (U8*)s;
1908                     U16 charid = 0;
1909                     U32 base = 1;
1910                     U32 state = 1;
1911                     UV uvc = 0;
1912                     STRLEN len = 0;
1913                     STRLEN foldlen = 0;
1914                     U8 *uscan = (U8*)NULL;
1915                     U8 *leftmost = NULL;
1916 #ifdef DEBUGGING
1917                     U32 accepted_word= 0;
1918 #endif
1919                     U32 pointpos = 0;
1920
1921                     while ( state && uc <= (U8*)strend ) {
1922                         int failed=0;
1923                         U32 word = aho->states[ state ].wordnum;
1924
1925                         if( state==1 ) {
1926                             if ( bitmap ) {
1927                                 DEBUG_TRIE_EXECUTE_r(
1928                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1930                                             (char *)uc, utf8_target );
1931                                         PerlIO_printf( Perl_debug_log,
1932                                             " Scanning for legal start char...\n");
1933                                     }
1934                                 );
1935                                 if (utf8_target) {
1936                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1937                                         uc += UTF8SKIP(uc);
1938                                     }
1939                                 } else {
1940                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1941                                         uc++;
1942                                     }
1943                                 }
1944                                 s= (char *)uc;
1945                             }
1946                             if (uc >(U8*)last_start) break;
1947                         }
1948
1949                         if ( word ) {
1950                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1951                             if (!leftmost || lpos < leftmost) {
1952                                 DEBUG_r(accepted_word=word);
1953                                 leftmost= lpos;
1954                             }
1955                             if (base==0) break;
1956
1957                         }
1958                         points[pointpos++ % maxlen]= uc;
1959                         if (foldlen || uc < (U8*)strend) {
1960                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1961                                              widecharmap, uc,
1962                                              uscan, len, uvc, charid, foldlen,
1963                                              foldbuf, uniflags);
1964                             DEBUG_TRIE_EXECUTE_r({
1965                                 dump_exec_pos( (char *)uc, c, strend,
1966                                             real_start, s, utf8_target);
1967                                 PerlIO_printf(Perl_debug_log,
1968                                     " Charid:%3u CP:%4"UVxf" ",
1969                                      charid, uvc);
1970                             });
1971                         }
1972                         else {
1973                             len = 0;
1974                             charid = 0;
1975                         }
1976
1977
1978                         do {
1979 #ifdef DEBUGGING
1980                             word = aho->states[ state ].wordnum;
1981 #endif
1982                             base = aho->states[ state ].trans.base;
1983
1984                             DEBUG_TRIE_EXECUTE_r({
1985                                 if (failed)
1986                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1987                                         s,   utf8_target );
1988                                 PerlIO_printf( Perl_debug_log,
1989                                     "%sState: %4"UVxf", word=%"UVxf,
1990                                     failed ? " Fail transition to " : "",
1991                                     (UV)state, (UV)word);
1992                             });
1993                             if ( base ) {
1994                                 U32 tmp;
1995                                 I32 offset;
1996                                 if (charid &&
1997                                      ( ((offset = base + charid
1998                                         - 1 - trie->uniquecharcount)) >= 0)
1999                                      && ((U32)offset < trie->lasttrans)
2000                                      && trie->trans[offset].check == state
2001                                      && (tmp=trie->trans[offset].next))
2002                                 {
2003                                     DEBUG_TRIE_EXECUTE_r(
2004                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2005                                     state = tmp;
2006                                     break;
2007                                 }
2008                                 else {
2009                                     DEBUG_TRIE_EXECUTE_r(
2010                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2011                                     failed = 1;
2012                                     state = aho->fail[state];
2013                                 }
2014                             }
2015                             else {
2016                                 /* we must be accepting here */
2017                                 DEBUG_TRIE_EXECUTE_r(
2018                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2019                                 failed = 1;
2020                                 break;
2021                             }
2022                         } while(state);
2023                         uc += len;
2024                         if (failed) {
2025                             if (leftmost)
2026                                 break;
2027                             if (!state) state = 1;
2028                         }
2029                     }
2030                     if ( aho->states[ state ].wordnum ) {
2031                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2032                         if (!leftmost || lpos < leftmost) {
2033                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2034                             leftmost = lpos;
2035                         }
2036                     }
2037                     if (leftmost) {
2038                         s = (char*)leftmost;
2039                         DEBUG_TRIE_EXECUTE_r({
2040                             PerlIO_printf(
2041                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2042                                 (UV)accepted_word, (IV)(s - real_start)
2043                             );
2044                         });
2045                         if (!reginfo || regtry(reginfo, &s)) {
2046                             FREETMPS;
2047                             LEAVE;
2048                             goto got_it;
2049                         }
2050                         s = HOPc(s,1);
2051                         DEBUG_TRIE_EXECUTE_r({
2052                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2053                         });
2054                     } else {
2055                         DEBUG_TRIE_EXECUTE_r(
2056                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2057                         break;
2058                     }
2059                 }
2060                 FREETMPS;
2061                 LEAVE;
2062             }
2063             break;
2064         default:
2065             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2066             break;
2067         }
2068         return 0;
2069       got_it:
2070         return s;
2071 }
2072
2073
2074 /*
2075  - regexec_flags - match a regexp against a string
2076  */
2077 I32
2078 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2079               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2080 /* stringarg: the point in the string at which to begin matching */
2081 /* strend:    pointer to null at end of string */
2082 /* strbeg:    real beginning of string */
2083 /* minend:    end of match must be >= minend bytes after stringarg. */
2084 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2085  *            itself is accessed via the pointers above */
2086 /* data:      May be used for some additional optimizations.
2087               Currently its only used, with a U32 cast, for transmitting
2088               the ganch offset when doing a /g match. This will change */
2089 /* nosave:    For optimizations. */
2090
2091 {
2092     dVAR;
2093     struct regexp *const prog = ReANY(rx);
2094     char *s;
2095     regnode *c;
2096     char *startpos = stringarg;
2097     I32 minlen;         /* must match at least this many chars */
2098     I32 dontbother = 0; /* how many characters not to try at end */
2099     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2100     I32 scream_pos = -1;                /* Internal iterator of scream. */
2101     char *scream_olds = NULL;
2102     const bool utf8_target = cBOOL(DO_UTF8(sv));
2103     I32 multiline;
2104     RXi_GET_DECL(prog,progi);
2105     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2106     regexp_paren_pair *swap = NULL;
2107     GET_RE_DEBUG_FLAGS_DECL;
2108
2109     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2110     PERL_UNUSED_ARG(data);
2111
2112     /* Be paranoid... */
2113     if (prog == NULL || startpos == NULL) {
2114         Perl_croak(aTHX_ "NULL regexp parameter");
2115         return 0;
2116     }
2117
2118     multiline = prog->extflags & RXf_PMf_MULTILINE;
2119     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2120
2121     RX_MATCH_UTF8_set(rx, utf8_target);
2122     DEBUG_EXECUTE_r(
2123         debug_start_match(rx, utf8_target, startpos, strend,
2124         "Matching");
2125     );
2126
2127     minlen = prog->minlen;
2128
2129     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2130         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2131                               "String too short [regexec_flags]...\n"));
2132         goto phooey;
2133     }
2134
2135
2136     /* Check validity of program. */
2137     if (UCHARAT(progi->program) != REG_MAGIC) {
2138         Perl_croak(aTHX_ "corrupted regexp program");
2139     }
2140
2141     PL_reg_flags = 0;
2142     PL_reg_state.re_state_eval_setup_done = FALSE;
2143     PL_reg_maxiter = 0;
2144
2145     if (RX_UTF8(rx))
2146         PL_reg_flags |= RF_utf8;
2147
2148     /* Mark beginning of line for ^ and lookbehind. */
2149     reginfo.bol = startpos; /* XXX not used ??? */
2150     PL_bostr  = strbeg;
2151     reginfo.sv = sv;
2152
2153     /* Mark end of line for $ (and such) */
2154     PL_regeol = strend;
2155
2156     /* see how far we have to get to not match where we matched before */
2157     reginfo.till = startpos+minend;
2158
2159     /* If there is a "must appear" string, look for it. */
2160     s = startpos;
2161
2162     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2163         MAGIC *mg;
2164         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2165             reginfo.ganch = startpos + prog->gofs;
2166             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2168         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2169                   && SvMAGIC(sv)
2170                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2171                   && mg->mg_len >= 0) {
2172             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2173             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2174                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2175
2176             if (prog->extflags & RXf_ANCH_GPOS) {
2177                 if (s > reginfo.ganch)
2178                     goto phooey;
2179                 s = reginfo.ganch - prog->gofs;
2180                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2181                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2182                 if (s < strbeg)
2183                     goto phooey;
2184             }
2185         }
2186         else if (data) {
2187             reginfo.ganch = strbeg + PTR2UV(data);
2188             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2190
2191         } else {                                /* pos() not defined */
2192             reginfo.ganch = strbeg;
2193             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194                  "GPOS: reginfo.ganch = strbeg\n"));
2195         }
2196     }
2197     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2198         /* We have to be careful. If the previous successful match
2199            was from this regex we don't want a subsequent partially
2200            successful match to clobber the old results.
2201            So when we detect this possibility we add a swap buffer
2202            to the re, and switch the buffer each match. If we fail
2203            we switch it back, otherwise we leave it swapped.
2204         */
2205         swap = prog->offs;
2206         /* do we need a save destructor here for eval dies? */
2207         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2208         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2209             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2210             PTR2UV(prog),
2211             PTR2UV(swap),
2212             PTR2UV(prog->offs)
2213         ));
2214     }
2215     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2216         re_scream_pos_data d;
2217
2218         d.scream_olds = &scream_olds;
2219         d.scream_pos = &scream_pos;
2220         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2221         if (!s) {
2222             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2223             goto phooey;        /* not present */
2224         }
2225     }
2226
2227
2228
2229     /* Simplest case:  anchored match need be tried only once. */
2230     /*  [unless only anchor is BOL and multiline is set] */
2231     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2232         if (s == startpos && regtry(&reginfo, &startpos))
2233             goto got_it;
2234         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2235                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2236         {
2237             char *end;
2238
2239             if (minlen)
2240                 dontbother = minlen - 1;
2241             end = HOP3c(strend, -dontbother, strbeg) - 1;
2242             /* for multiline we only have to try after newlines */
2243             if (prog->check_substr || prog->check_utf8) {
2244                 /* because of the goto we can not easily reuse the macros for bifurcating the
2245                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2246                 if (utf8_target) {
2247                     if (s == startpos)
2248                         goto after_try_utf8;
2249                     while (1) {
2250                         if (regtry(&reginfo, &s)) {
2251                             goto got_it;
2252                         }
2253                       after_try_utf8:
2254                         if (s > end) {
2255                             goto phooey;
2256                         }
2257                         if (prog->extflags & RXf_USE_INTUIT) {
2258                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2259                             if (!s) {
2260                                 goto phooey;
2261                             }
2262                         }
2263                         else {
2264                             s += UTF8SKIP(s);
2265                         }
2266                     }
2267                 } /* end search for check string in unicode */
2268                 else {
2269                     if (s == startpos) {
2270                         goto after_try_latin;
2271                     }
2272                     while (1) {
2273                         if (regtry(&reginfo, &s)) {
2274                             goto got_it;
2275                         }
2276                       after_try_latin:
2277                         if (s > end) {
2278                             goto phooey;
2279                         }
2280                         if (prog->extflags & RXf_USE_INTUIT) {
2281                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2282                             if (!s) {
2283                                 goto phooey;
2284                             }
2285                         }
2286                         else {
2287                             s++;
2288                         }
2289                     }
2290                 } /* end search for check string in latin*/
2291             } /* end search for check string */
2292             else { /* search for newline */
2293                 if (s > startpos) {
2294                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2295                     s--;
2296                 }
2297                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2298                 while (s <= end) { /* note it could be possible to match at the end of the string */
2299                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2300                         if (regtry(&reginfo, &s))
2301                             goto got_it;
2302                     }
2303                 }
2304             } /* end search for newline */
2305         } /* end anchored/multiline check string search */
2306         goto phooey;
2307     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2308     {
2309         /* the warning about reginfo.ganch being used without initialization
2310            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2311            and we only enter this block when the same bit is set. */
2312         char *tmp_s = reginfo.ganch - prog->gofs;
2313
2314         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2315             goto got_it;
2316         goto phooey;
2317     }
2318
2319     /* Messy cases:  unanchored match. */
2320     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2321         /* we have /x+whatever/ */
2322         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2323         char ch;
2324 #ifdef DEBUGGING
2325         int did_match = 0;
2326 #endif
2327         if (utf8_target) {
2328             if (! prog->anchored_utf8) {
2329                 to_utf8_substr(prog);
2330             }
2331             ch = SvPVX_const(prog->anchored_utf8)[0];
2332             REXEC_FBC_SCAN(
2333                 if (*s == ch) {
2334                     DEBUG_EXECUTE_r( did_match = 1 );
2335                     if (regtry(&reginfo, &s)) goto got_it;
2336                     s += UTF8SKIP(s);
2337                     while (s < strend && *s == ch)
2338                         s += UTF8SKIP(s);
2339                 }
2340             );
2341
2342         }
2343         else {
2344             if (! prog->anchored_substr) {
2345                 if (! to_byte_substr(prog)) {
2346                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2347                 }
2348             }
2349             ch = SvPVX_const(prog->anchored_substr)[0];
2350             REXEC_FBC_SCAN(
2351                 if (*s == ch) {
2352                     DEBUG_EXECUTE_r( did_match = 1 );
2353                     if (regtry(&reginfo, &s)) goto got_it;
2354                     s++;
2355                     while (s < strend && *s == ch)
2356                         s++;
2357                 }
2358             );
2359         }
2360         DEBUG_EXECUTE_r(if (!did_match)
2361                 PerlIO_printf(Perl_debug_log,
2362                                   "Did not find anchored character...\n")
2363                );
2364     }
2365     else if (prog->anchored_substr != NULL
2366               || prog->anchored_utf8 != NULL
2367               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2368                   && prog->float_max_offset < strend - s)) {
2369         SV *must;
2370         I32 back_max;
2371         I32 back_min;
2372         char *last;
2373         char *last1;            /* Last position checked before */
2374 #ifdef DEBUGGING
2375         int did_match = 0;
2376 #endif
2377         if (prog->anchored_substr || prog->anchored_utf8) {
2378             if (utf8_target) {
2379                 if (! prog->anchored_utf8) {
2380                     to_utf8_substr(prog);
2381                 }
2382                 must = prog->anchored_utf8;
2383             }
2384             else {
2385                 if (! prog->anchored_substr) {
2386                     if (! to_byte_substr(prog)) {
2387                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2388                     }
2389                 }
2390                 must = prog->anchored_substr;
2391             }
2392             back_max = back_min = prog->anchored_offset;
2393         } else {
2394             if (utf8_target) {
2395                 if (! prog->float_utf8) {
2396                     to_utf8_substr(prog);
2397                 }
2398                 must = prog->float_utf8;
2399             }
2400             else {
2401                 if (! prog->float_substr) {
2402                     if (! to_byte_substr(prog)) {
2403                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2404                     }
2405                 }
2406                 must = prog->float_substr;
2407             }
2408             back_max = prog->float_max_offset;
2409             back_min = prog->float_min_offset;
2410         }
2411
2412         if (back_min<0) {
2413             last = strend;
2414         } else {
2415             last = HOP3c(strend,        /* Cannot start after this */
2416                   -(I32)(CHR_SVLEN(must)
2417                          - (SvTAIL(must) != 0) + back_min), strbeg);
2418         }
2419         if (s > PL_bostr)
2420             last1 = HOPc(s, -1);
2421         else
2422             last1 = s - 1;      /* bogus */
2423
2424         /* XXXX check_substr already used to find "s", can optimize if
2425            check_substr==must. */
2426         scream_pos = -1;
2427         dontbother = end_shift;
2428         strend = HOPc(strend, -dontbother);
2429         while ( (s <= last) &&
2430                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2431                                   (unsigned char*)strend, must,
2432                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2433             DEBUG_EXECUTE_r( did_match = 1 );
2434             if (HOPc(s, -back_max) > last1) {
2435                 last1 = HOPc(s, -back_min);
2436                 s = HOPc(s, -back_max);
2437             }
2438             else {
2439                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2440
2441                 last1 = HOPc(s, -back_min);
2442                 s = t;
2443             }
2444             if (utf8_target) {
2445                 while (s <= last1) {
2446                     if (regtry(&reginfo, &s))
2447                         goto got_it;
2448                     if (s >= last1) {
2449                         s++; /* to break out of outer loop */
2450                         break;
2451                     }
2452                     s += UTF8SKIP(s);
2453                 }
2454             }
2455             else {
2456                 while (s <= last1) {
2457                     if (regtry(&reginfo, &s))
2458                         goto got_it;
2459                     s++;
2460                 }
2461             }
2462         }
2463         DEBUG_EXECUTE_r(if (!did_match) {
2464             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2465                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2466             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2467                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2468                                ? "anchored" : "floating"),
2469                 quoted, RE_SV_TAIL(must));
2470         });
2471         goto phooey;
2472     }
2473     else if ( (c = progi->regstclass) ) {
2474         if (minlen) {
2475             const OPCODE op = OP(progi->regstclass);
2476             /* don't bother with what can't match */
2477             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2478                 strend = HOPc(strend, -(minlen - 1));
2479         }
2480         DEBUG_EXECUTE_r({
2481             SV * const prop = sv_newmortal();
2482             regprop(prog, prop, c);
2483             {
2484                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2485                     s,strend-s,60);
2486                 PerlIO_printf(Perl_debug_log,
2487                     "Matching stclass %.*s against %s (%d bytes)\n",
2488                     (int)SvCUR(prop), SvPVX_const(prop),
2489                      quoted, (int)(strend - s));
2490             }
2491         });
2492         if (find_byclass(prog, c, s, strend, &reginfo))
2493             goto got_it;
2494         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2495     }
2496     else {
2497         dontbother = 0;
2498         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2499             /* Trim the end. */
2500             char *last= NULL;
2501             SV* float_real;
2502             STRLEN len;
2503             const char *little;
2504
2505             if (utf8_target) {
2506                 if (! prog->float_utf8) {
2507                     to_utf8_substr(prog);
2508                 }
2509                 float_real = prog->float_utf8;
2510             }
2511             else {
2512                 if (! prog->float_substr) {
2513                     if (! to_byte_substr(prog)) {
2514                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2515                     }
2516                 }
2517                 float_real = prog->float_substr;
2518             }
2519
2520             little = SvPV_const(float_real, len);
2521             if (SvTAIL(float_real)) {
2522                     /* This means that float_real contains an artificial \n on
2523                      * the end due to the presence of something like this:
2524                      * /foo$/ where we can match both "foo" and "foo\n" at the
2525                      * end of the string.  So we have to compare the end of the
2526                      * string first against the float_real without the \n and
2527                      * then against the full float_real with the string.  We
2528                      * have to watch out for cases where the string might be
2529                      * smaller than the float_real or the float_real without
2530                      * the \n. */
2531                     char *checkpos= strend - len;
2532                     DEBUG_OPTIMISE_r(
2533                         PerlIO_printf(Perl_debug_log,
2534                             "%sChecking for float_real.%s\n",
2535                             PL_colors[4], PL_colors[5]));
2536                     if (checkpos + 1 < strbeg) {
2537                         /* can't match, even if we remove the trailing \n
2538                          * string is too short to match */
2539                         DEBUG_EXECUTE_r(
2540                             PerlIO_printf(Perl_debug_log,
2541                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2542                                 PL_colors[4], PL_colors[5]));
2543                         goto phooey;
2544                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2545                         /* can match, the end of the string matches without the
2546                          * "\n" */
2547                         last = checkpos + 1;
2548                     } else if (checkpos < strbeg) {
2549                         /* cant match, string is too short when the "\n" is
2550                          * included */
2551                         DEBUG_EXECUTE_r(
2552                             PerlIO_printf(Perl_debug_log,
2553                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2554                                 PL_colors[4], PL_colors[5]));
2555                         goto phooey;
2556                     } else if (!multiline) {
2557                         /* non multiline match, so compare with the "\n" at the
2558                          * end of the string */
2559                         if (memEQ(checkpos, little, len)) {
2560                             last= checkpos;
2561                         } else {
2562                             DEBUG_EXECUTE_r(
2563                                 PerlIO_printf(Perl_debug_log,
2564                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2565                                     PL_colors[4], PL_colors[5]));
2566                             goto phooey;
2567                         }
2568                     } else {
2569                         /* multiline match, so we have to search for a place
2570                          * where the full string is located */
2571                         goto find_last;
2572                     }
2573             } else {
2574                   find_last:
2575                     if (len)
2576                         last = rninstr(s, strend, little, little + len);
2577                     else
2578                         last = strend;  /* matching "$" */
2579             }
2580             if (!last) {
2581                 /* at one point this block contained a comment which was
2582                  * probably incorrect, which said that this was a "should not
2583                  * happen" case.  Even if it was true when it was written I am
2584                  * pretty sure it is not anymore, so I have removed the comment
2585                  * and replaced it with this one. Yves */
2586                 DEBUG_EXECUTE_r(
2587                     PerlIO_printf(Perl_debug_log,
2588                         "String does not contain required substring, cannot match.\n"
2589                     ));
2590                 goto phooey;
2591             }
2592             dontbother = strend - last + prog->float_min_offset;
2593         }
2594         if (minlen && (dontbother < minlen))
2595             dontbother = minlen - 1;
2596         strend -= dontbother;              /* this one's always in bytes! */
2597         /* We don't know much -- general case. */
2598         if (utf8_target) {
2599             for (;;) {
2600                 if (regtry(&reginfo, &s))
2601                     goto got_it;
2602                 if (s >= strend)
2603                     break;
2604                 s += UTF8SKIP(s);
2605             };
2606         }
2607         else {
2608             do {
2609                 if (regtry(&reginfo, &s))
2610                     goto got_it;
2611             } while (s++ < strend);
2612         }
2613     }
2614
2615     /* Failure. */
2616     goto phooey;
2617
2618 got_it:
2619     DEBUG_BUFFERS_r(
2620         if (swap)
2621             PerlIO_printf(Perl_debug_log,
2622                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2623                 PTR2UV(prog),
2624                 PTR2UV(swap)
2625             );
2626     );
2627     Safefree(swap);
2628     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2629
2630     if (PL_reg_state.re_state_eval_setup_done)
2631         restore_pos(aTHX_ prog);
2632     if (RXp_PAREN_NAMES(prog))
2633         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2634
2635     /* make sure $`, $&, $', and $digit will work later */
2636     if ( !(flags & REXEC_NOT_FIRST) ) {
2637         if (flags & REXEC_COPY_STR) {
2638 #ifdef PERL_ANY_COW
2639             if (SvCANCOW(sv)) {
2640                 if (DEBUG_C_TEST) {
2641                     PerlIO_printf(Perl_debug_log,
2642                                   "Copy on write: regexp capture, type %d\n",
2643                                   (int) SvTYPE(sv));
2644                 }
2645                 RX_MATCH_COPY_FREE(rx);
2646                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2647                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2648                 assert (SvPOKp(prog->saved_copy));
2649                 prog->sublen  = PL_regeol - strbeg;
2650                 prog->suboffset = 0;
2651                 prog->subcoffset = 0;
2652             } else
2653 #endif
2654             {
2655                 I32 min = 0;
2656                 I32 max = PL_regeol - strbeg;
2657                 I32 sublen;
2658
2659                 if (    (flags & REXEC_COPY_SKIP_POST)
2660                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2661                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2662                 ) { /* don't copy $' part of string */
2663                     U32 n = 0;
2664                     max = -1;
2665                     /* calculate the right-most part of the string covered
2666                      * by a capture. Due to look-ahead, this may be to
2667                      * the right of $&, so we have to scan all captures */
2668                     while (n <= prog->lastparen) {
2669                         if (prog->offs[n].end > max)
2670                             max = prog->offs[n].end;
2671                         n++;
2672                     }
2673                     if (max == -1)
2674                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2675                                 ? prog->offs[0].start
2676                                 : 0;
2677                     assert(max >= 0 && max <= PL_regeol - strbeg);
2678                 }
2679
2680                 if (    (flags & REXEC_COPY_SKIP_PRE)
2681                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2682                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2683                 ) { /* don't copy $` part of string */
2684                     U32 n = 0;
2685                     min = max;
2686                     /* calculate the left-most part of the string covered
2687                      * by a capture. Due to look-behind, this may be to
2688                      * the left of $&, so we have to scan all captures */
2689                     while (min && n <= prog->lastparen) {
2690                         if (   prog->offs[n].start != -1
2691                             && prog->offs[n].start < min)
2692                         {
2693                             min = prog->offs[n].start;
2694                         }
2695                         n++;
2696                     }
2697                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2698                         && min >  prog->offs[0].end
2699                     )
2700                         min = prog->offs[0].end;
2701
2702                 }
2703
2704                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2705                 sublen = max - min;
2706
2707                 if (RX_MATCH_COPIED(rx)) {
2708                     if (sublen > prog->sublen)
2709                         prog->subbeg =
2710                                 (char*)saferealloc(prog->subbeg, sublen+1);
2711                 }
2712                 else
2713                     prog->subbeg = (char*)safemalloc(sublen+1);
2714                 Copy(strbeg + min, prog->subbeg, sublen, char);
2715                 prog->subbeg[sublen] = '\0';
2716                 prog->suboffset = min;
2717                 prog->sublen = sublen;
2718                 RX_MATCH_COPIED_on(rx);
2719             }
2720             prog->subcoffset = prog->suboffset;
2721             if (prog->suboffset && utf8_target) {
2722                 /* Convert byte offset to chars.
2723                  * XXX ideally should only compute this if @-/@+
2724                  * has been seen, a la PL_sawampersand ??? */
2725
2726                 /* If there's a direct correspondence between the
2727                  * string which we're matching and the original SV,
2728                  * then we can use the utf8 len cache associated with
2729                  * the SV. In particular, it means that under //g,
2730                  * sv_pos_b2u() will use the previously cached
2731                  * position to speed up working out the new length of
2732                  * subcoffset, rather than counting from the start of
2733                  * the string each time. This stops
2734                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2735                  * from going quadratic */
2736                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2737                     sv_pos_b2u(sv, &(prog->subcoffset));
2738                 else
2739                     prog->subcoffset = utf8_length((U8*)strbeg,
2740                                         (U8*)(strbeg+prog->suboffset));
2741             }
2742         }
2743         else {
2744             RX_MATCH_COPY_FREE(rx);
2745             prog->subbeg = strbeg;
2746             prog->suboffset = 0;
2747             prog->subcoffset = 0;
2748             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2749         }
2750     }
2751
2752     return 1;
2753
2754 phooey:
2755     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2756                           PL_colors[4], PL_colors[5]));
2757     if (PL_reg_state.re_state_eval_setup_done)
2758         restore_pos(aTHX_ prog);
2759     if (swap) {
2760         /* we failed :-( roll it back */
2761         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2762             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2763             PTR2UV(prog),
2764             PTR2UV(prog->offs),
2765             PTR2UV(swap)
2766         ));
2767         Safefree(prog->offs);
2768         prog->offs = swap;
2769     }
2770     return 0;
2771 }
2772
2773
2774 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2775  * Do inc before dec, in case old and new rex are the same */
2776 #define SET_reg_curpm(Re2) \
2777     if (PL_reg_state.re_state_eval_setup_done) {    \
2778         (void)ReREFCNT_inc(Re2);                    \
2779         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2780         PM_SETRE((PL_reg_curpm), (Re2));            \
2781     }
2782
2783
2784 /*
2785  - regtry - try match at specific point
2786  */
2787 STATIC I32                      /* 0 failure, 1 success */
2788 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2789 {
2790     dVAR;
2791     CHECKPOINT lastcp;
2792     REGEXP *const rx = reginfo->prog;
2793     regexp *const prog = ReANY(rx);
2794     I32 result;
2795     RXi_GET_DECL(prog,progi);
2796     GET_RE_DEBUG_FLAGS_DECL;
2797
2798     PERL_ARGS_ASSERT_REGTRY;
2799
2800     reginfo->cutpoint=NULL;
2801
2802     if ((prog->extflags & RXf_EVAL_SEEN)
2803         && !PL_reg_state.re_state_eval_setup_done)
2804     {
2805         MAGIC *mg;
2806
2807         PL_reg_state.re_state_eval_setup_done = TRUE;
2808         if (reginfo->sv) {
2809             /* Make $_ available to executed code. */
2810             if (reginfo->sv != DEFSV) {
2811                 SAVE_DEFSV;
2812                 DEFSV_set(reginfo->sv);
2813             }
2814
2815             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2816                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2817                 /* prepare for quick setting of pos */
2818 #ifdef PERL_OLD_COPY_ON_WRITE
2819                 if (SvIsCOW(reginfo->sv))
2820                     sv_force_normal_flags(reginfo->sv, 0);
2821 #endif
2822                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2823                                  &PL_vtbl_mglob, NULL, 0);
2824                 mg->mg_len = -1;
2825             }
2826             PL_reg_magic    = mg;
2827             PL_reg_oldpos   = mg->mg_len;
2828             SAVEDESTRUCTOR_X(restore_pos, prog);
2829         }
2830         if (!PL_reg_curpm) {
2831             Newxz(PL_reg_curpm, 1, PMOP);
2832 #ifdef USE_ITHREADS
2833             {
2834                 SV* const repointer = &PL_sv_undef;
2835                 /* this regexp is also owned by the new PL_reg_curpm, which
2836                    will try to free it.  */
2837                 av_push(PL_regex_padav, repointer);
2838                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2839                 PL_regex_pad = AvARRAY(PL_regex_padav);
2840             }
2841 #endif
2842         }
2843         SET_reg_curpm(rx);
2844         PL_reg_oldcurpm = PL_curpm;
2845         PL_curpm = PL_reg_curpm;
2846         if (RXp_MATCH_COPIED(prog)) {
2847             /*  Here is a serious problem: we cannot rewrite subbeg,
2848                 since it may be needed if this match fails.  Thus
2849                 $` inside (?{}) could fail... */
2850             PL_reg_oldsaved = prog->subbeg;
2851             PL_reg_oldsavedlen = prog->sublen;
2852             PL_reg_oldsavedoffset = prog->suboffset;
2853             PL_reg_oldsavedcoffset = prog->suboffset;
2854 #ifdef PERL_ANY_COW
2855             PL_nrs = prog->saved_copy;
2856 #endif
2857             RXp_MATCH_COPIED_off(prog);
2858         }
2859         else
2860             PL_reg_oldsaved = NULL;
2861         prog->subbeg = PL_bostr;
2862         prog->suboffset = 0;
2863         prog->subcoffset = 0;
2864         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2865     }
2866 #ifdef DEBUGGING
2867     PL_reg_starttry = *startposp;
2868 #endif
2869     prog->offs[0].start = *startposp - PL_bostr;
2870     prog->lastparen = 0;
2871     prog->lastcloseparen = 0;
2872     PL_regsize = 0;
2873
2874     /* XXXX What this code is doing here?!!!  There should be no need
2875        to do this again and again, prog->lastparen should take care of
2876        this!  --ilya*/
2877
2878     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2879      * Actually, the code in regcppop() (which Ilya may be meaning by
2880      * prog->lastparen), is not needed at all by the test suite
2881      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2882      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2883      * Meanwhile, this code *is* needed for the
2884      * above-mentioned test suite tests to succeed.  The common theme
2885      * on those tests seems to be returning null fields from matches.
2886      * --jhi updated by dapm */
2887 #if 1
2888     if (prog->nparens) {
2889         regexp_paren_pair *pp = prog->offs;
2890         I32 i;
2891         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2892             ++pp;
2893             pp->start = -1;
2894             pp->end = -1;
2895         }
2896     }
2897 #endif
2898     REGCP_SET(lastcp);
2899     result = regmatch(reginfo, *startposp, progi->program + 1);
2900     if (result != -1) {
2901         prog->offs[0].end = result;
2902         return 1;
2903     }
2904     if (reginfo->cutpoint)
2905         *startposp= reginfo->cutpoint;
2906     REGCP_UNWIND(lastcp);
2907     return 0;
2908 }
2909
2910
2911 #define sayYES goto yes
2912 #define sayNO goto no
2913 #define sayNO_SILENT goto no_silent
2914
2915 /* we dont use STMT_START/END here because it leads to
2916    "unreachable code" warnings, which are bogus, but distracting. */
2917 #define CACHEsayNO \
2918     if (ST.cache_mask) \
2919        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2920     sayNO
2921
2922 /* this is used to determine how far from the left messages like
2923    'failed...' are printed. It should be set such that messages
2924    are inline with the regop output that created them.
2925 */
2926 #define REPORT_CODE_OFF 32
2927
2928
2929 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2930 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2931 #define CHRTEST_NOT_A_CP_1 -999
2932 #define CHRTEST_NOT_A_CP_2 -998
2933
2934 #define SLAB_FIRST(s) (&(s)->states[0])
2935 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2936
2937 /* grab a new slab and return the first slot in it */
2938
2939 STATIC regmatch_state *
2940 S_push_slab(pTHX)
2941 {
2942 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2943     dMY_CXT;
2944 #endif
2945     regmatch_slab *s = PL_regmatch_slab->next;
2946     if (!s) {
2947         Newx(s, 1, regmatch_slab);
2948         s->prev = PL_regmatch_slab;
2949         s->next = NULL;
2950         PL_regmatch_slab->next = s;
2951     }
2952     PL_regmatch_slab = s;
2953     return SLAB_FIRST(s);
2954 }
2955
2956
2957 /* push a new state then goto it */
2958
2959 #define PUSH_STATE_GOTO(state, node, input) \
2960     pushinput = input; \
2961     scan = node; \
2962     st->resume_state = state; \
2963     goto push_state;
2964
2965 /* push a new state with success backtracking, then goto it */
2966
2967 #define PUSH_YES_STATE_GOTO(state, node, input) \
2968     pushinput = input; \
2969     scan = node; \
2970     st->resume_state = state; \
2971     goto push_yes_state;
2972
2973
2974
2975
2976 /*
2977
2978 regmatch() - main matching routine
2979
2980 This is basically one big switch statement in a loop. We execute an op,
2981 set 'next' to point the next op, and continue. If we come to a point which
2982 we may need to backtrack to on failure such as (A|B|C), we push a
2983 backtrack state onto the backtrack stack. On failure, we pop the top
2984 state, and re-enter the loop at the state indicated. If there are no more
2985 states to pop, we return failure.
2986
2987 Sometimes we also need to backtrack on success; for example /A+/, where
2988 after successfully matching one A, we need to go back and try to
2989 match another one; similarly for lookahead assertions: if the assertion
2990 completes successfully, we backtrack to the state just before the assertion
2991 and then carry on.  In these cases, the pushed state is marked as
2992 'backtrack on success too'. This marking is in fact done by a chain of
2993 pointers, each pointing to the previous 'yes' state. On success, we pop to
2994 the nearest yes state, discarding any intermediate failure-only states.
2995 Sometimes a yes state is pushed just to force some cleanup code to be
2996 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2997 it to free the inner regex.
2998
2999 Note that failure backtracking rewinds the cursor position, while
3000 success backtracking leaves it alone.
3001
3002 A pattern is complete when the END op is executed, while a subpattern
3003 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3004 ops trigger the "pop to last yes state if any, otherwise return true"
3005 behaviour.
3006
3007 A common convention in this function is to use A and B to refer to the two
3008 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3009 the subpattern to be matched possibly multiple times, while B is the entire
3010 rest of the pattern. Variable and state names reflect this convention.
3011
3012 The states in the main switch are the union of ops and failure/success of
3013 substates associated with with that op.  For example, IFMATCH is the op
3014 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3015 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3016 successfully matched A and IFMATCH_A_fail is a state saying that we have
3017 just failed to match A. Resume states always come in pairs. The backtrack
3018 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3019 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3020 on success or failure.
3021
3022 The struct that holds a backtracking state is actually a big union, with
3023 one variant for each major type of op. The variable st points to the
3024 top-most backtrack struct. To make the code clearer, within each
3025 block of code we #define ST to alias the relevant union.
3026
3027 Here's a concrete example of a (vastly oversimplified) IFMATCH
3028 implementation:
3029
3030     switch (state) {
3031     ....
3032
3033 #define ST st->u.ifmatch
3034
3035     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3036         ST.foo = ...; // some state we wish to save
3037         ...
3038         // push a yes backtrack state with a resume value of
3039         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3040         // first node of A:
3041         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3042         // NOTREACHED
3043
3044     case IFMATCH_A: // we have successfully executed A; now continue with B
3045         next = B;
3046         bar = ST.foo; // do something with the preserved value
3047         break;
3048
3049     case IFMATCH_A_fail: // A failed, so the assertion failed
3050         ...;   // do some housekeeping, then ...
3051         sayNO; // propagate the failure
3052
3053 #undef ST
3054
3055     ...
3056     }
3057
3058 For any old-timers reading this who are familiar with the old recursive
3059 approach, the code above is equivalent to:
3060
3061     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3062     {
3063         int foo = ...
3064         ...
3065         if (regmatch(A)) {
3066             next = B;
3067             bar = foo;
3068             break;
3069         }
3070         ...;   // do some housekeeping, then ...
3071         sayNO; // propagate the failure
3072     }
3073
3074 The topmost backtrack state, pointed to by st, is usually free. If you
3075 want to claim it, populate any ST.foo fields in it with values you wish to
3076 save, then do one of
3077
3078         PUSH_STATE_GOTO(resume_state, node, newinput);
3079         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3080
3081 which sets that backtrack state's resume value to 'resume_state', pushes a
3082 new free entry to the top of the backtrack stack, then goes to 'node'.
3083 On backtracking, the free slot is popped, and the saved state becomes the
3084 new free state. An ST.foo field in this new top state can be temporarily
3085 accessed to retrieve values, but once the main loop is re-entered, it
3086 becomes available for reuse.
3087
3088 Note that the depth of the backtrack stack constantly increases during the
3089 left-to-right execution of the pattern, rather than going up and down with
3090 the pattern nesting. For example the stack is at its maximum at Z at the
3091 end of the pattern, rather than at X in the following:
3092
3093     /(((X)+)+)+....(Y)+....Z/
3094
3095 The only exceptions to this are lookahead/behind assertions and the cut,
3096 (?>A), which pop all the backtrack states associated with A before
3097 continuing.
3098
3099 Backtrack state structs are allocated in slabs of about 4K in size.
3100 PL_regmatch_state and st always point to the currently active state,
3101 and PL_regmatch_slab points to the slab currently containing
3102 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3103 allocated, and is never freed until interpreter destruction. When the slab
3104 is full, a new one is allocated and chained to the end. At exit from
3105 regmatch(), slabs allocated since entry are freed.
3106
3107 */
3108
3109
3110 #define DEBUG_STATE_pp(pp)                                  \
3111     DEBUG_STATE_r({                                         \
3112         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3113         PerlIO_printf(Perl_debug_log,                       \
3114             "    %*s"pp" %s%s%s%s%s\n",                     \
3115             depth*2, "",                                    \
3116             PL_reg_name[st->resume_state],                     \
3117             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3118             ((st==yes_state) ? "Y" : ""),                   \
3119             ((st==mark_state) ? "M" : ""),                  \
3120             ((st==yes_state||st==mark_state) ? "]" : "")    \
3121         );                                                  \
3122     });
3123
3124
3125 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3126
3127 #ifdef DEBUGGING
3128
3129 STATIC void
3130 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3131     const char *start, const char *end, const char *blurb)
3132 {
3133     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3134
3135     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3136
3137     if (!PL_colorset)
3138             reginitcolors();
3139     {
3140         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3141             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3142
3143         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3144             start, end - start, 60);
3145
3146         PerlIO_printf(Perl_debug_log,
3147             "%s%s REx%s %s against %s\n",
3148                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3149
3150         if (utf8_target||utf8_pat)
3151             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3152                 utf8_pat ? "pattern" : "",
3153                 utf8_pat && utf8_target ? " and " : "",
3154                 utf8_target ? "string" : ""
3155             );
3156     }
3157 }
3158
3159 STATIC void
3160 S_dump_exec_pos(pTHX_ const char *locinput,
3161                       const regnode *scan,
3162                       const char *loc_regeol,
3163                       const char *loc_bostr,
3164                       const char *loc_reg_starttry,
3165                       const bool utf8_target)
3166 {
3167     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3168     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3169     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3170     /* The part of the string before starttry has one color
3171        (pref0_len chars), between starttry and current
3172        position another one (pref_len - pref0_len chars),
3173        after the current position the third one.
3174        We assume that pref0_len <= pref_len, otherwise we
3175        decrease pref0_len.  */
3176     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3177         ? (5 + taill) - l : locinput - loc_bostr;
3178     int pref0_len;
3179
3180     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3181
3182     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3183         pref_len++;
3184     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3185     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3186         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3187               ? (5 + taill) - pref_len : loc_regeol - locinput);
3188     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3189         l--;
3190     if (pref0_len < 0)
3191         pref0_len = 0;
3192     if (pref0_len > pref_len)
3193         pref0_len = pref_len;
3194     {
3195         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3196
3197         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3198             (locinput - pref_len),pref0_len, 60, 4, 5);
3199
3200         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3201                     (locinput - pref_len + pref0_len),
3202                     pref_len - pref0_len, 60, 2, 3);
3203
3204         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3205                     locinput, loc_regeol - locinput, 10, 0, 1);
3206
3207         const STRLEN tlen=len0+len1+len2;
3208         PerlIO_printf(Perl_debug_log,
3209                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3210                     (IV)(locinput - loc_bostr),
3211                     len0, s0,
3212                     len1, s1,
3213                     (docolor ? "" : "> <"),
3214                     len2, s2,
3215                     (int)(tlen > 19 ? 0 :  19 - tlen),
3216                     "");
3217     }
3218 }
3219
3220 #endif
3221
3222 /* reg_check_named_buff_matched()
3223  * Checks to see if a named buffer has matched. The data array of
3224  * buffer numbers corresponding to the buffer is expected to reside
3225  * in the regexp->data->data array in the slot stored in the ARG() of
3226  * node involved. Note that this routine doesn't actually care about the
3227  * name, that information is not preserved from compilation to execution.
3228  * Returns the index of the leftmost defined buffer with the given name
3229  * or 0 if non of the buffers matched.
3230  */
3231 STATIC I32
3232 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3233 {
3234     I32 n;
3235     RXi_GET_DECL(rex,rexi);
3236     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3237     I32 *nums=(I32*)SvPVX(sv_dat);
3238
3239     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3240
3241     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3242         if ((I32)rex->lastparen >= nums[n] &&
3243             rex->offs[nums[n]].end != -1)
3244         {
3245             return nums[n];
3246         }
3247     }
3248     return 0;
3249 }
3250
3251
3252 /* free all slabs above current one  - called during LEAVE_SCOPE */
3253
3254 STATIC void
3255 S_clear_backtrack_stack(pTHX_ void *p)
3256 {
3257     regmatch_slab *s = PL_regmatch_slab->next;
3258     PERL_UNUSED_ARG(p);
3259
3260     if (!s)
3261         return;
3262     PL_regmatch_slab->next = NULL;
3263     while (s) {
3264         regmatch_slab * const osl = s;
3265         s = s->next;
3266         Safefree(osl);
3267     }
3268 }
3269 static bool
3270 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3271 {
3272     /* This function determines if there are one or two characters that match
3273      * the first character of the passed-in EXACTish node <text_node>, and if
3274      * so, returns them in the passed-in pointers.
3275      *
3276      * If it determines that no possible character in the target string can
3277      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3278      * the first character in <text_node> requires UTF-8 to represent, and the
3279      * target string isn't in UTF-8.)
3280      *
3281      * If there are more than two characters that could match the beginning of
3282      * <text_node>, or if more context is required to determine a match or not,
3283      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3284      *
3285      * The motiviation behind this function is to allow the caller to set up
3286      * tight loops for matching.  If <text_node> is of type EXACT, there is
3287      * only one possible character that can match its first character, and so
3288      * the situation is quite simple.  But things get much more complicated if
3289      * folding is involved.  It may be that the first character of an EXACTFish
3290      * node doesn't participate in any possible fold, e.g., punctuation, so it
3291      * can be matched only by itself.  The vast majority of characters that are
3292      * in folds match just two things, their lower and upper-case equivalents.
3293      * But not all are like that; some have multiple possible matches, or match
3294      * sequences of more than one character.  This function sorts all that out.
3295      *
3296      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3297      * loop of trying to match A*, we know we can't exit where the thing
3298      * following it isn't a B.  And something can't be a B unless it is the
3299      * beginning of B.  By putting a quick test for that beginning in a tight
3300      * loop, we can rule out things that can't possibly be B without having to
3301      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3302      * character, we can make a tight loop matching A*, using the outputs of
3303      * this function.
3304      *
3305      * If the target string to match isn't in UTF-8, and there aren't
3306      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3307      * the one or two possible octets (which are characters in this situation)
3308      * that can match.  In all cases, if there is only one character that can
3309      * match, *<c1p> and *<c2p> will be identical.
3310      *
3311      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3312      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3313      * can match the beginning of <text_node>.  They should be declared with at
3314      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3315      * undefined what these contain.)  If one or both of the buffers are
3316      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3317      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3318      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3319      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3320      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3321
3322     const bool utf8_target = PL_reg_match_utf8;
3323
3324     UV c1 = CHRTEST_NOT_A_CP_1;
3325     UV c2 = CHRTEST_NOT_A_CP_2;
3326     bool use_chrtest_void = FALSE;
3327
3328     /* Used when we have both utf8 input and utf8 output, to avoid converting
3329      * to/from code points */
3330     bool utf8_has_been_setup = FALSE;
3331
3332     dVAR;
3333
3334     U8 *pat = (U8*)STRING(text_node);
3335
3336     if (OP(text_node) == EXACT) {
3337
3338         /* In an exact node, only one thing can be matched, that first
3339          * character.  If both the pat and the target are UTF-8, we can just
3340          * copy the input to the output, avoiding finding the code point of
3341          * that character */
3342         if (! UTF_PATTERN) {
3343             c2 = c1 = *pat;
3344         }
3345         else if (utf8_target) {
3346             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3347             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3348             utf8_has_been_setup = TRUE;
3349         }
3350         else {
3351             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3352         }
3353     }
3354     else /* an EXACTFish node */
3355          if ((UTF_PATTERN
3356                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3357                                                     pat + STR_LEN(text_node)))
3358              || (! UTF_PATTERN
3359                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3360                                                     pat + STR_LEN(text_node))))
3361     {
3362         /* Multi-character folds require more context to sort out.  Also
3363          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3364          * handled outside this routine */
3365         use_chrtest_void = TRUE;
3366     }
3367     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3368         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3369         if (c1 > 256) {
3370             /* Load the folds hash, if not already done */
3371             SV** listp;
3372             if (! PL_utf8_foldclosures) {
3373                 if (! PL_utf8_tofold) {
3374                     U8 dummy[UTF8_MAXBYTES+1];
3375
3376                     /* Force loading this by folding an above-Latin1 char */
3377                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3378                     assert(PL_utf8_tofold); /* Verify that worked */
3379                 }
3380                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3381             }
3382
3383             /* The fold closures data structure is a hash with the keys being
3384              * the UTF-8 of every character that is folded to, like 'k', and
3385              * the values each an array of all code points that fold to its
3386              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3387              * not included */
3388             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3389                                      (char *) pat,
3390                                      UTF8SKIP(pat),
3391                                      FALSE))))
3392             {
3393                 /* Not found in the hash, therefore there are no folds
3394                  * containing it, so there is only a single character that
3395                  * could match */
3396                 c2 = c1;
3397             }
3398             else {  /* Does participate in folds */
3399                 AV* list = (AV*) *listp;
3400                 if (av_len(list) != 1) {
3401
3402                     /* If there aren't exactly two folds to this, it is outside
3403                      * the scope of this function */
3404                     use_chrtest_void = TRUE;
3405                 }
3406                 else {  /* There are two.  Get them */
3407                     SV** c_p = av_fetch(list, 0, FALSE);
3408                     if (c_p == NULL) {
3409                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3410                     }
3411                     c1 = SvUV(*c_p);
3412
3413                     c_p = av_fetch(list, 1, FALSE);
3414                     if (c_p == NULL) {
3415                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3416                     }
3417                     c2 = SvUV(*c_p);
3418
3419                     /* Folds that cross the 255/256 boundary are forbidden if
3420                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3421                      * pattern character is above 256, and its only other match
3422                      * is below 256, the only legal match will be to itself.
3423                      * We have thrown away the original, so have to compute
3424                      * which is the one above 255 */
3425                     if ((c1 < 256) != (c2 < 256)) {
3426                         if (OP(text_node) == EXACTFL
3427                             || (OP(text_node) == EXACTFA
3428                                 && (isASCII(c1) || isASCII(c2))))
3429                         {
3430                             if (c1 < 256) {
3431                                 c1 = c2;
3432                             }
3433                             else {
3434                                 c2 = c1;
3435                             }
3436                         }
3437                     }
3438                 }
3439             }
3440         }
3441         else /* Here, c1 is < 255 */
3442              if (utf8_target
3443                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3444                  && OP(text_node) != EXACTFL
3445                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3446         {
3447             /* Here, there could be something above Latin1 in the target which
3448              * folds to this character in the pattern.  All such cases except
3449              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3450              * involved in their folds, so are outside the scope of this
3451              * function */
3452             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3453                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3454             }
3455             else {
3456                 use_chrtest_void = TRUE;
3457             }
3458         }
3459         else { /* Here nothing above Latin1 can fold to the pattern character */
3460             switch (OP(text_node)) {
3461
3462                 case EXACTFL:   /* /l rules */
3463                     c2 = PL_fold_locale[c1];
3464                     break;
3465
3466                 case EXACTF:
3467                     if (! utf8_target) {    /* /d rules */
3468                         c2 = PL_fold[c1];
3469                         break;
3470                     }
3471                     /* FALLTHROUGH */
3472                     /* /u rules for all these.  This happens to work for
3473                      * EXACTFA as nothing in Latin1 folds to ASCII */
3474                 case EXACTFA:
3475                 case EXACTFU_TRICKYFOLD:
3476                 case EXACTFU_SS:
3477                 case EXACTFU:
3478                     c2 = PL_fold_latin1[c1];
3479                     break;
3480
3481                 default:
3482                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3483                     assert(0); /* NOTREACHED */
3484             }
3485         }
3486     }
3487
3488     /* Here have figured things out.  Set up the returns */
3489     if (use_chrtest_void) {
3490         *c2p = *c1p = CHRTEST_VOID;
3491     }
3492     else if (utf8_target) {
3493         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3494             uvchr_to_utf8(c1_utf8, c1);
3495             uvchr_to_utf8(c2_utf8, c2);
3496         }
3497
3498         /* Invariants are stored in both the utf8 and byte outputs; Use
3499          * negative numbers otherwise for the byte ones.  Make sure that the
3500          * byte ones are the same iff the utf8 ones are the same */
3501         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3502         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3503                 ? *c2_utf8
3504                 : (c1 == c2)
3505                   ? CHRTEST_NOT_A_CP_1
3506                   : CHRTEST_NOT_A_CP_2;
3507     }
3508     else if (c1 > 255) {
3509        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3510                            can represent */
3511            return FALSE;
3512        }
3513
3514        *c1p = *c2p = c2;    /* c2 is the only representable value */
3515     }
3516     else {  /* c1 is representable; see about c2 */
3517        *c1p = c1;
3518        *c2p = (c2 < 256) ? c2 : c1;
3519     }
3520
3521     return TRUE;
3522 }
3523
3524 /* returns -1 on failure, $+[0] on success */
3525 STATIC I32
3526 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3527 {
3528 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3529     dMY_CXT;
3530 #endif
3531     dVAR;
3532     const bool utf8_target = PL_reg_match_utf8;
3533     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3534     REGEXP *rex_sv = reginfo->prog;
3535     regexp *rex = ReANY(rex_sv);
3536     RXi_GET_DECL(rex,rexi);
3537     I32 oldsave;
3538     /* the current state. This is a cached copy of PL_regmatch_state */
3539     regmatch_state *st;
3540     /* cache heavy used fields of st in registers */
3541     regnode *scan;
3542     regnode *next;
3543     U32 n = 0;  /* general value; init to avoid compiler warning */
3544     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3545     char *locinput = startpos;
3546     char *pushinput; /* where to continue after a PUSH */
3547     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3548
3549     bool result = 0;        /* return value of S_regmatch */
3550     int depth = 0;          /* depth of backtrack stack */
3551     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3552     const U32 max_nochange_depth =
3553         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3554         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3555     regmatch_state *yes_state = NULL; /* state to pop to on success of
3556                                                             subpattern */
3557     /* mark_state piggy backs on the yes_state logic so that when we unwind
3558        the stack on success we can update the mark_state as we go */
3559     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3560     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3561     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3562     U32 state_num;
3563     bool no_final = 0;      /* prevent failure from backtracking? */
3564     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3565     char *startpoint = locinput;
3566     SV *popmark = NULL;     /* are we looking for a mark? */
3567     SV *sv_commit = NULL;   /* last mark name seen in failure */
3568     SV *sv_yes_mark = NULL; /* last mark name we have seen
3569                                during a successful match */
3570     U32 lastopen = 0;       /* last open we saw */
3571     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3572     SV* const oreplsv = GvSV(PL_replgv);
3573     /* these three flags are set by various ops to signal information to
3574      * the very next op. They have a useful lifetime of exactly one loop
3575      * iteration, and are not preserved or restored by state pushes/pops
3576      */
3577     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3578     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3579     int logical = 0;        /* the following EVAL is:
3580                                 0: (?{...})
3581                                 1: (?(?{...})X|Y)
3582                                 2: (??{...})
3583                                or the following IFMATCH/UNLESSM is:
3584                                 false: plain (?=foo)
3585                                 true:  used as a condition: (?(?=foo))
3586                             */
3587     PAD* last_pad = NULL;
3588     dMULTICALL;
3589     I32 gimme = G_SCALAR;
3590     CV *caller_cv = NULL;       /* who called us */
3591     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3592     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3593
3594 #ifdef DEBUGGING
3595     GET_RE_DEBUG_FLAGS_DECL;
3596 #endif
3597
3598     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3599     multicall_oldcatch = 0;
3600     multicall_cv = NULL;
3601     cx = NULL;
3602     PERL_UNUSED_VAR(multicall_cop);
3603     PERL_UNUSED_VAR(newsp);
3604
3605
3606     PERL_ARGS_ASSERT_REGMATCH;
3607
3608     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3609             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3610     }));
3611     /* on first ever call to regmatch, allocate first slab */
3612     if (!PL_regmatch_slab) {
3613         Newx(PL_regmatch_slab, 1, regmatch_slab);
3614         PL_regmatch_slab->prev = NULL;
3615         PL_regmatch_slab->next = NULL;
3616         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3617     }
3618
3619     oldsave = PL_savestack_ix;
3620     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3621     SAVEVPTR(PL_regmatch_slab);
3622     SAVEVPTR(PL_regmatch_state);
3623
3624     /* grab next free state slot */
3625     st = ++PL_regmatch_state;
3626     if (st >  SLAB_LAST(PL_regmatch_slab))
3627         st = PL_regmatch_state = S_push_slab(aTHX);
3628
3629     /* Note that nextchr is a byte even in UTF */
3630     SET_nextchr;
3631     scan = prog;
3632     while (scan != NULL) {
3633
3634         DEBUG_EXECUTE_r( {
3635             SV * const prop = sv_newmortal();
3636             regnode *rnext=regnext(scan);
3637             DUMP_EXEC_POS( locinput, scan, utf8_target );
3638             regprop(rex, prop, scan);
3639
3640             PerlIO_printf(Perl_debug_log,
3641                     "%3"IVdf":%*s%s(%"IVdf")\n",
3642                     (IV)(scan - rexi->program), depth*2, "",
3643                     SvPVX_const(prop),
3644                     (PL_regkind[OP(scan)] == END || !rnext) ?
3645                         0 : (IV)(rnext - rexi->program));
3646         });
3647
3648         next = scan + NEXT_OFF(scan);
3649         if (next == scan)
3650             next = NULL;
3651         state_num = OP(scan);
3652
3653       reenter_switch:
3654
3655         SET_nextchr;
3656         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3657
3658         switch (state_num) {
3659         case BOL: /*  /^../  */
3660             if (locinput == PL_bostr)
3661             {
3662                 /* reginfo->till = reginfo->bol; */
3663                 break;
3664             }
3665             sayNO;
3666
3667         case MBOL: /*  /^../m  */
3668             if (locinput == PL_bostr ||
3669                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3670             {
3671                 break;
3672             }
3673             sayNO;
3674
3675         case SBOL: /*  /^../s  */
3676             if (locinput == PL_bostr)
3677                 break;
3678             sayNO;
3679
3680         case GPOS: /*  \G  */
3681             if (locinput == reginfo->ganch)
3682                 break;
3683             sayNO;
3684
3685         case KEEPS: /*   \K  */
3686             /* update the startpoint */
3687             st->u.keeper.val = rex->offs[0].start;
3688             rex->offs[0].start = locinput - PL_bostr;
3689             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3690             assert(0); /*NOTREACHED*/
3691         case KEEPS_next_fail:
3692             /* rollback the start point change */
3693             rex->offs[0].start = st->u.keeper.val;
3694             sayNO_SILENT;
3695             assert(0); /*NOTREACHED*/
3696
3697         case EOL: /* /..$/  */
3698                 goto seol;
3699
3700         case MEOL: /* /..$/m  */
3701             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3702                 sayNO;
3703             break;
3704
3705         case SEOL: /* /..$/s  */
3706           seol:
3707             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3708                 sayNO;
3709             if (PL_regeol - locinput > 1)
3710                 sayNO;
3711             break;
3712
3713         case EOS: /*  \z  */
3714             if (!NEXTCHR_IS_EOS)
3715                 sayNO;
3716             break;
3717
3718         case SANY: /*  /./s  */
3719             if (NEXTCHR_IS_EOS)
3720                 sayNO;
3721             goto increment_locinput;
3722
3723         case CANY: /*  \C  */
3724             if (NEXTCHR_IS_EOS)
3725                 sayNO;
3726             locinput++;
3727             break;
3728
3729         case REG_ANY: /*  /./  */
3730             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3731                 sayNO;
3732             goto increment_locinput;
3733
3734
3735 #undef  ST
3736 #define ST st->u.trie
3737         case TRIEC: /* (ab|cd) with known charclass */
3738             /* In this case the charclass data is available inline so
3739                we can fail fast without a lot of extra overhead.
3740              */
3741             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3742                 DEBUG_EXECUTE_r(
3743                     PerlIO_printf(Perl_debug_log,
3744                               "%*s  %sfailed to match trie start class...%s\n",
3745                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3746                 );
3747                 sayNO_SILENT;
3748                 assert(0); /* NOTREACHED */
3749             }
3750             /* FALL THROUGH */
3751         case TRIE:  /* (ab|cd)  */
3752             /* the basic plan of execution of the trie is:
3753              * At the beginning, run though all the states, and
3754              * find the longest-matching word. Also remember the position
3755              * of the shortest matching word. For example, this pattern:
3756              *    1  2 3 4    5
3757              *    ab|a|x|abcd|abc
3758              * when matched against the string "abcde", will generate
3759              * accept states for all words except 3, with the longest
3760              * matching word being 4, and the shortest being 2 (with
3761              * the position being after char 1 of the string).
3762              *
3763              * Then for each matching word, in word order (i.e. 1,2,4,5),
3764              * we run the remainder of the pattern; on each try setting
3765              * the current position to the character following the word,
3766              * returning to try the next word on failure.
3767              *
3768              * We avoid having to build a list of words at runtime by
3769              * using a compile-time structure, wordinfo[].prev, which
3770              * gives, for each word, the previous accepting word (if any).
3771              * In the case above it would contain the mappings 1->2, 2->0,
3772              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3773              * the longest word (4 above), a list of all words, by
3774              * following the list of prev pointers; this gives us the
3775              * unordered list 4,5,1,2. Then given the current word we have
3776              * just tried, we can go through the list and find the
3777              * next-biggest word to try (so if we just failed on word 2,
3778              * the next in the list is 4).
3779              *
3780              * Since at runtime we don't record the matching position in
3781              * the string for each word, we have to work that out for
3782              * each word we're about to process. The wordinfo table holds
3783              * the character length of each word; given that we recorded
3784              * at the start: the position of the shortest word and its
3785              * length in chars, we just need to move the pointer the
3786              * difference between the two char lengths. Depending on
3787              * Unicode status and folding, that's cheap or expensive.
3788              *
3789              * This algorithm is optimised for the case where are only a
3790              * small number of accept states, i.e. 0,1, or maybe 2.
3791              * With lots of accepts states, and having to try all of them,
3792              * it becomes quadratic on number of accept states to find all
3793              * the next words.
3794              */
3795
3796             {
3797                 /* what type of TRIE am I? (utf8 makes this contextual) */
3798                 DECL_TRIE_TYPE(scan);
3799
3800                 /* what trie are we using right now */
3801                 reg_trie_data * const trie
3802                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3803                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3804                 U32 state = trie->startstate;
3805
3806                 if (   trie->bitmap
3807                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3808                 {
3809                     if (trie->states[ state ].wordnum) {
3810                          DEBUG_EXECUTE_r(
3811                             PerlIO_printf(Perl_debug_log,
3812                                           "%*s  %smatched empty string...%s\n",
3813                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3814                         );
3815                         if (!trie->jump)
3816                             break;
3817                     } else {
3818                         DEBUG_EXECUTE_r(
3819                             PerlIO_printf(Perl_debug_log,
3820                                           "%*s  %sfailed to match trie start class...%s\n",
3821                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3822                         );
3823                         sayNO_SILENT;
3824                    }
3825                 }
3826
3827             {
3828                 U8 *uc = ( U8* )locinput;
3829
3830                 STRLEN len = 0;
3831                 STRLEN foldlen = 0;
3832                 U8 *uscan = (U8*)NULL;
3833                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3834                 U32 charcount = 0; /* how many input chars we have matched */
3835                 U32 accepted = 0; /* have we seen any accepting states? */
3836
3837                 ST.jump = trie->jump;
3838                 ST.me = scan;
3839                 ST.firstpos = NULL;
3840                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3841                 ST.nextword = 0;
3842
3843                 /* fully traverse the TRIE; note the position of the
3844                    shortest accept state and the wordnum of the longest
3845                    accept state */
3846
3847                 while ( state && uc <= (U8*)PL_regeol ) {
3848                     U32 base = trie->states[ state ].trans.base;
3849                     UV uvc = 0;
3850                     U16 charid = 0;
3851                     U16 wordnum;
3852                     wordnum = trie->states[ state ].wordnum;
3853
3854                     if (wordnum) { /* it's an accept state */
3855                         if (!accepted) {
3856                             accepted = 1;
3857                             /* record first match position */
3858                             if (ST.longfold) {
3859                                 ST.firstpos = (U8*)locinput;
3860                                 ST.firstchars = 0;
3861                             }
3862                             else {
3863                                 ST.firstpos = uc;
3864                                 ST.firstchars = charcount;
3865                             }
3866                         }
3867                         if (!ST.nextword || wordnum < ST.nextword)
3868                             ST.nextword = wordnum;
3869                         ST.topword = wordnum;
3870                     }
3871
3872                     DEBUG_TRIE_EXECUTE_r({
3873                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3874                                 PerlIO_printf( Perl_debug_log,
3875                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3876                                     2+depth * 2, "", PL_colors[4],
3877                                     (UV)state, (accepted ? 'Y' : 'N'));
3878                     });
3879
3880                     /* read a char and goto next state */
3881                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3882                         I32 offset;
3883                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3884                                              uscan, len, uvc, charid, foldlen,
3885                                              foldbuf, uniflags);
3886                         charcount++;
3887                         if (foldlen>0)
3888                             ST.longfold = TRUE;
3889                         if (charid &&
3890                              ( ((offset =
3891                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3892
3893                              && ((U32)offset < trie->lasttrans)
3894                              && trie->trans[offset].check == state)
3895                         {
3896                             state = trie->trans[offset].next;
3897                         }
3898                         else {
3899                             state = 0;
3900                         }
3901                         uc += len;
3902
3903                     }
3904                     else {
3905                         state = 0;
3906                     }
3907                     DEBUG_TRIE_EXECUTE_r(
3908                         PerlIO_printf( Perl_debug_log,
3909                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3910                             charid, uvc, (UV)state, PL_colors[5] );
3911                     );
3912                 }
3913                 if (!accepted)
3914                    sayNO;
3915
3916                 /* calculate total number of accept states */
3917                 {
3918                     U16 w = ST.topword;
3919                     accepted = 0;
3920                     while (w) {
3921                         w = trie->wordinfo[w].prev;
3922                         accepted++;
3923                     }
3924                     ST.accepted = accepted;
3925                 }
3926
3927                 DEBUG_EXECUTE_r(
3928                     PerlIO_printf( Perl_debug_log,
3929                         "%*s  %sgot %"IVdf" possible matches%s\n",
3930                         REPORT_CODE_OFF + depth * 2, "",
3931                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3932                 );
3933                 goto trie_first_try; /* jump into the fail handler */
3934             }}
3935             assert(0); /* NOTREACHED */
3936
3937         case TRIE_next_fail: /* we failed - try next alternative */
3938         {
3939             U8 *uc;
3940             if ( ST.jump) {
3941                 REGCP_UNWIND(ST.cp);
3942                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3943             }
3944             if (!--ST.accepted) {
3945                 DEBUG_EXECUTE_r({
3946                     PerlIO_printf( Perl_debug_log,
3947                         "%*s  %sTRIE failed...%s\n",
3948                         REPORT_CODE_OFF+depth*2, "",
3949                         PL_colors[4],
3950                         PL_colors[5] );
3951                 });
3952                 sayNO_SILENT;
3953             }
3954             {
3955                 /* Find next-highest word to process.  Note that this code
3956                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3957                 U16 min = 0;
3958                 U16 word;
3959                 U16 const nextword = ST.nextword;
3960                 reg_trie_wordinfo * const wordinfo
3961                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3962                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3963                     if (word > nextword && (!min || word < min))
3964                         min = word;
3965                 }
3966                 ST.nextword = min;
3967             }
3968
3969           trie_first_try:
3970             if (do_cutgroup) {
3971                 do_cutgroup = 0;
3972                 no_final = 0;
3973             }
3974
3975             if ( ST.jump) {
3976                 ST.lastparen = rex->lastparen;
3977                 ST.lastcloseparen = rex->lastcloseparen;
3978                 REGCP_SET(ST.cp);
3979             }
3980
3981             /* find start char of end of current word */
3982             {
3983                 U32 chars; /* how many chars to skip */
3984                 reg_trie_data * const trie
3985                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3986
3987                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3988                             >=  ST.firstchars);
3989                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3990                             - ST.firstchars;
3991                 uc = ST.firstpos;
3992
3993                 if (ST.longfold) {
3994                     /* the hard option - fold each char in turn and find
3995                      * its folded length (which may be different */
3996                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3997                     STRLEN foldlen;
3998                     STRLEN len;
3999                     UV uvc;
4000                     U8 *uscan;
4001
4002                     while (chars) {
4003                         if (utf8_target) {
4004                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4005                                                     uniflags);
4006                             uc += len;
4007                         }
4008                         else {
4009                             uvc = *uc;
4010                             uc++;
4011                         }
4012                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4013                         uscan = foldbuf;
4014                         while (foldlen) {
4015                             if (!--chars)
4016                                 break;
4017                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4018                                             uniflags);
4019                             uscan += len;
4020                             foldlen -= len;
4021                         }
4022                     }
4023                 }
4024                 else {
4025                     if (utf8_target)
4026                         while (chars--)
4027                             uc += UTF8SKIP(uc);
4028                     else
4029                         uc += chars;
4030                 }
4031             }
4032
4033             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4034                             ? ST.jump[ST.nextword]
4035                             : NEXT_OFF(ST.me));
4036
4037             DEBUG_EXECUTE_r({
4038                 PerlIO_printf( Perl_debug_log,
4039                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4040                     REPORT_CODE_OFF+depth*2, "",
4041                     PL_colors[4],
4042                     ST.nextword,
4043                     PL_colors[5]
4044                     );
4045             });
4046
4047             if (ST.accepted > 1 || has_cutgroup) {
4048                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4049                 assert(0); /* NOTREACHED */
4050             }
4051             /* only one choice left - just continue */
4052             DEBUG_EXECUTE_r({
4053                 AV *const trie_words
4054                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4055                 SV ** const tmp = av_fetch( trie_words,
4056                     ST.nextword-1, 0 );
4057                 SV *sv= tmp ? sv_newmortal() : NULL;
4058
4059                 PerlIO_printf( Perl_debug_log,
4060                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4061                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4062                     ST.nextword,
4063                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4064                             PL_colors[0], PL_colors[1],
4065                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4066                         )
4067                     : "not compiled under -Dr",
4068                     PL_colors[5] );
4069             });
4070
4071             locinput = (char*)uc;
4072             continue; /* execute rest of RE */
4073             assert(0); /* NOTREACHED */
4074         }
4075 #undef  ST
4076
4077         case EXACT: {            /*  /abc/        */
4078             char *s = STRING(scan);
4079             ln = STR_LEN(scan);
4080             if (utf8_target != UTF_PATTERN) {
4081                 /* The target and the pattern have differing utf8ness. */
4082                 char *l = locinput;
4083                 const char * const e = s + ln;
4084
4085                 if (utf8_target) {
4086                     /* The target is utf8, the pattern is not utf8.
4087                      * Above-Latin1 code points can't match the pattern;
4088                      * invariants match exactly, and the other Latin1 ones need
4089                      * to be downgraded to a single byte in order to do the
4090                      * comparison.  (If we could be confident that the target
4091                      * is not malformed, this could be refactored to have fewer
4092                      * tests by just assuming that if the first bytes match, it
4093                      * is an invariant, but there are tests in the test suite
4094                      * dealing with (??{...}) which violate this) */
4095                     while (s < e) {
4096                         if (l >= PL_regeol)
4097                              sayNO;
4098                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4099                             sayNO;
4100                         }
4101                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4102                             if (*l != *s) {
4103                                 sayNO;
4104                             }
4105                             l++;
4106                         }
4107                         else {
4108                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4109                                 sayNO;
4110                             }
4111                             l += 2;
4112                         }
4113                         s++;
4114                     }
4115                 }
4116                 else {
4117                     /* The target is not utf8, the pattern is utf8. */
4118                     while (s < e) {
4119                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4120                         {
4121                             sayNO;
4122                         }
4123                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4124                             if (*s != *l) {
4125                                 sayNO;
4126                             }
4127                             s++;
4128                         }
4129                         else {
4130                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4131                                 sayNO;
4132                             }
4133                             s += 2;
4134                         }
4135                         l++;
4136                     }
4137                 }
4138                 locinput = l;
4139                 break;
4140             }
4141             /* The target and the pattern have the same utf8ness. */
4142             /* Inline the first character, for speed. */
4143             if (UCHARAT(s) != nextchr)
4144                 sayNO;
4145             if (PL_regeol - locinput < ln)
4146                 sayNO;
4147             if (ln > 1 && memNE(s, locinput, ln))
4148                 sayNO;
4149             locinput += ln;
4150             break;
4151             }
4152
4153         case EXACTFL: {          /*  /abc/il      */
4154             re_fold_t folder;
4155             const U8 * fold_array;
4156             const char * s;
4157             U32 fold_utf8_flags;
4158
4159             PL_reg_flags |= RF_tainted;
4160             folder = foldEQ_locale;
4161             fold_array = PL_fold_locale;
4162             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4163             goto do_exactf;
4164
4165         case EXACTFU_SS:         /*  /\x{df}/iu   */
4166         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4167         case EXACTFU:            /*  /abc/iu      */
4168             folder = foldEQ_latin1;
4169             fold_array = PL_fold_latin1;
4170             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4171             goto do_exactf;
4172
4173         case EXACTFA:            /*  /abc/iaa     */
4174             folder = foldEQ_latin1;
4175             fold_array = PL_fold_latin1;
4176             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4177             goto do_exactf;
4178
4179         case EXACTF:             /*  /abc/i       */
4180             folder = foldEQ;
4181             fold_array = PL_fold;
4182             fold_utf8_flags = 0;
4183
4184           do_exactf:
4185             s = STRING(scan);
4186             ln = STR_LEN(scan);
4187
4188             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4189               /* Either target or the pattern are utf8, or has the issue where
4190                * the fold lengths may differ. */
4191                 const char * const l = locinput;
4192                 char *e = PL_regeol;
4193
4194                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4195                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4196                 {
4197                     sayNO;
4198                 }
4199                 locinput = e;
4200                 break;
4201             }
4202
4203             /* Neither the target nor the pattern are utf8 */
4204             if (UCHARAT(s) != nextchr
4205                 && !NEXTCHR_IS_EOS
4206                 && UCHARAT(s) != fold_array[nextchr])
4207             {
4208                 sayNO;
4209             }
4210             if (PL_regeol - locinput < ln)
4211                 sayNO;
4212             if (ln > 1 && ! folder(s, locinput, ln))
4213                 sayNO;
4214             locinput += ln;
4215             break;
4216         }
4217
4218         /* XXX Could improve efficiency by separating these all out using a
4219          * macro or in-line function.  At that point regcomp.c would no longer
4220          * have to set the FLAGS fields of these */
4221         case BOUNDL:  /*  /\b/l  */
4222         case NBOUNDL: /*  /\B/l  */
4223             PL_reg_flags |= RF_tainted;
4224             /* FALL THROUGH */
4225         case BOUND:   /*  /\b/   */
4226         case BOUNDU:  /*  /\b/u  */
4227         case BOUNDA:  /*  /\b/a  */
4228         case NBOUND:  /*  /\B/   */
4229         case NBOUNDU: /*  /\B/u  */
4230         case NBOUNDA: /*  /\B/a  */
4231             /* was last char in word? */
4232             if (utf8_target
4233                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4234                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4235             {
4236                 if (locinput == PL_bostr)
4237                     ln = '\n';
4238                 else {
4239                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4240
4241                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4242                 }
4243                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4244                     ln = isALNUM_uni(ln);
4245                     if (NEXTCHR_IS_EOS)
4246                         n = 0;
4247                     else {
4248                         LOAD_UTF8_CHARCLASS_ALNUM();
4249                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4250                                                                 utf8_target);
4251                     }
4252                 }
4253                 else {
4254                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4255                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4256                 }
4257             }
4258             else {
4259
4260                 /* Here the string isn't utf8, or is utf8 and only ascii
4261                  * characters are to match \w.  In the latter case looking at
4262                  * the byte just prior to the current one may be just the final
4263                  * byte of a multi-byte character.  This is ok.  There are two
4264                  * cases:
4265                  * 1) it is a single byte character, and then the test is doing
4266                  *      just what it's supposed to.
4267                  * 2) it is a multi-byte character, in which case the final
4268                  *      byte is never mistakable for ASCII, and so the test
4269                  *      will say it is not a word character, which is the
4270                  *      correct answer. */
4271                 ln = (locinput != PL_bostr) ?
4272                     UCHARAT(locinput - 1) : '\n';
4273                 switch (FLAGS(scan)) {
4274                     case REGEX_UNICODE_CHARSET:
4275                         ln = isWORDCHAR_L1(ln);
4276                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4277                         break;
4278                     case REGEX_LOCALE_CHARSET:
4279                         ln = isALNUM_LC(ln);
4280                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4281                         break;
4282                     case REGEX_DEPENDS_CHARSET:
4283                         ln = isALNUM(ln);
4284                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4285                         break;
4286                     case REGEX_ASCII_RESTRICTED_CHARSET:
4287                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4288                         ln = isWORDCHAR_A(ln);
4289                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4290                         break;
4291                     default:
4292                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4293                         break;
4294                 }
4295             }
4296             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4297              * regcomp.sym */
4298             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4299                     sayNO;
4300             break;
4301
4302         case ANYOF:  /*  /[abc]/       */
4303             if (NEXTCHR_IS_EOS)
4304                 sayNO;
4305             if (utf8_target) {
4306                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4307                     sayNO;
4308                 locinput += UTF8SKIP(locinput);
4309                 break;
4310             }
4311             else {
4312                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4313                     sayNO;
4314                 locinput++;
4315                 break;
4316             }
4317             break;
4318
4319         /* Special char classes: \d, \w etc.
4320          * The defines start on line 166 or so */
4321         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4322                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4323                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4324                   ALNUMA, NALNUMA, isWORDCHAR_A,
4325                   alnum, "a");
4326
4327         case SPACEL:
4328             PL_reg_flags |= RF_tainted;
4329             if (NEXTCHR_IS_EOS) {
4330                 sayNO;
4331             }
4332             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4333                 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4334                     sayNO;
4335                 }
4336             }
4337             else if (! isSPACE_LC((U8) nextchr)) {
4338                     sayNO;
4339             }
4340             goto increment_locinput;
4341
4342         case NSPACEL:
4343             PL_reg_flags |= RF_tainted;
4344             if (NEXTCHR_IS_EOS) {
4345                 sayNO;
4346             }
4347             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4348                 if (isSPACE_LC_utf8((U8 *) locinput)) {
4349                     sayNO;
4350                 }
4351             }
4352             else if (isSPACE_LC(nextchr)) {
4353                     sayNO;
4354             }
4355             goto increment_locinput;
4356
4357         case SPACE:
4358             if (utf8_target) {
4359                 goto utf8_space;
4360             }
4361             /* FALL THROUGH */
4362         case SPACEA:
4363             if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4364                 sayNO;
4365             }
4366             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4367             locinput++;
4368             break;
4369
4370         case NSPACE:
4371             if (utf8_target) {
4372                 goto utf8_nspace;
4373             }
4374             /* FALL THROUGH */
4375         case NSPACEA:
4376             if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4377                 sayNO;
4378             }
4379             goto increment_locinput;
4380
4381         case SPACEU:
4382           utf8_space:
4383             if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4384                 sayNO;
4385             }
4386             goto increment_locinput;
4387
4388         case NSPACEU:
4389           utf8_nspace:
4390             if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4391                 sayNO;
4392             }
4393             goto increment_locinput;
4394
4395         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4396                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4397                 DIGITA, NDIGITA, isDIGIT_A,
4398                 digit, "0");
4399
4400         case POSIXA: /* /[[:ascii:]]/ etc */
4401             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4402                 sayNO;
4403             }
4404             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4405             locinput++;
4406             break;
4407
4408         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4409             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4410                 sayNO;
4411             }
4412             goto increment_locinput;
4413
4414         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4415                        a Unicode extended Grapheme Cluster */
4416             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4417               extended Grapheme Cluster is:
4418
4419                CR LF
4420                | Prepend* Begin Extend*
4421                | .
4422
4423                Begin is:           ( Special_Begin | ! Control )
4424                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4425                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4426                Control is:         [ GCB_Control  CR  LF ]
4427                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4428
4429                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4430                we can rewrite
4431
4432                    Begin is ( Regular_Begin + Special Begin )
4433
4434                It turns out that 98.4% of all Unicode code points match
4435                Regular_Begin.  Doing it this way eliminates a table match in
4436                the previous implementation for almost all Unicode code points.
4437
4438                There is a subtlety with Prepend* which showed up in testing.
4439                Note that the Begin, and only the Begin is required in:
4440                 | Prepend* Begin Extend*
4441                Also, Begin contains '! Control'.  A Prepend must be a
4442                '!  Control', which means it must also be a Begin.  What it
4443                comes down to is that if we match Prepend* and then find no
4444                suitable Begin afterwards, that if we backtrack the last
4445                Prepend, that one will be a suitable Begin.
4446             */
4447
4448             if (NEXTCHR_IS_EOS)
4449                 sayNO;
4450             if  (! utf8_target) {
4451
4452                 /* Match either CR LF  or '.', as all the other possibilities
4453                  * require utf8 */
4454                 locinput++;         /* Match the . or CR */
4455                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4456                                        match the LF */
4457                     && locinput < PL_regeol
4458                     && UCHARAT(locinput) == '\n')
4459                 {
4460                     locinput++;
4461                 }
4462             }
4463             else {
4464
4465                 /* Utf8: See if is ( CR LF ); already know that locinput <
4466                  * PL_regeol, so locinput+1 is in bounds */
4467                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4468                         && UCHARAT(locinput + 1) == '\n')
4469                 {
4470                     locinput += 2;
4471                 }
4472                 else {
4473                     STRLEN len;
4474
4475                     /* In case have to backtrack to beginning, then match '.' */
4476                     char *starting = locinput;
4477
4478                     /* In case have to backtrack the last prepend */
4479                     char *previous_prepend = NULL;
4480
4481                     LOAD_UTF8_CHARCLASS_GCB();
4482
4483                     /* Match (prepend)*   */
4484                     while (locinput < PL_regeol
4485                            && (len = is_GCB_Prepend_utf8(locinput)))
4486                     {
4487                         previous_prepend = locinput;
4488                         locinput += len;
4489                     }
4490
4491                     /* As noted above, if we matched a prepend character, but
4492                      * the next thing won't match, back off the last prepend we
4493                      * matched, as it is guaranteed to match the begin */
4494                     if (previous_prepend
4495                         && (locinput >=  PL_regeol
4496                             || (! swash_fetch(PL_utf8_X_regular_begin,
4497                                              (U8*)locinput, utf8_target)
4498                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4499                         )
4500                     {
4501                         locinput = previous_prepend;
4502                     }
4503
4504                     /* Note that here we know PL_regeol > locinput, as we
4505                      * tested that upon input to this switch case, and if we
4506                      * moved locinput forward, we tested the result just above
4507                      * and it either passed, or we backed off so that it will
4508                      * now pass */
4509                     if (swash_fetch(PL_utf8_X_regular_begin,
4510                                     (U8*)locinput, utf8_target)) {
4511                         locinput += UTF8SKIP(locinput);
4512                     }
4513                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4514
4515                         /* Here did not match the required 'Begin' in the
4516                          * second term.  So just match the very first
4517                          * character, the '.' of the final term of the regex */
4518                         locinput = starting + UTF8SKIP(starting);
4519                         goto exit_utf8;
4520                     } else {
4521
4522                         /* Here is a special begin.  It can be composed of
4523                          * several individual characters.  One possibility is
4524                          * RI+ */
4525                         if ((len = is_GCB_RI_utf8(locinput))) {
4526                             locinput += len;
4527                             while (locinput < PL_regeol
4528                                    && (len = is_GCB_RI_utf8(locinput)))
4529                             {
4530                                 locinput += len;
4531                             }
4532                         } else if ((len = is_GCB_T_utf8(locinput))) {
4533                             /* Another possibility is T+ */
4534                             locinput += len;
4535                             while (locinput < PL_regeol
4536                                 && (len = is_GCB_T_utf8(locinput)))
4537                             {
4538                                 locinput += len;
4539                             }
4540                         } else {
4541
4542                             /* Here, neither RI+ nor T+; must be some other
4543                              * Hangul.  That means it is one of the others: L,
4544                              * LV, LVT or V, and matches:
4545                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4546
4547                             /* Match L*           */
4548                             while (locinput < PL_regeol
4549                                    && (len = is_GCB_L_utf8(locinput)))
4550                             {
4551                                 locinput += len;
4552                             }
4553
4554                             /* Here, have exhausted L*.  If the next character
4555                              * is not an LV, LVT nor V, it means we had to have
4556                              * at least one L, so matches L+ in the original
4557                              * equation, we have a complete hangul syllable.
4558                              * Are done. */
4559
4560                             if (locinput < PL_regeol
4561                                 && is_GCB_LV_LVT_V_utf8(locinput))
4562                             {
4563
4564                                 /* Otherwise keep going.  Must be LV, LVT or V.
4565                                  * See if LVT */
4566                                 if (is_utf8_X_LVT((U8*)locinput)) {
4567                                     locinput += UTF8SKIP(locinput);
4568                                 } else {
4569
4570                                     /* Must be  V or LV.  Take it, then match
4571                                      * V*     */
4572                                     locinput += UTF8SKIP(locinput);
4573                                     while (locinput < PL_regeol
4574                                            && (len = is_GCB_V_utf8(locinput)))
4575                                     {
4576                                         locinput += len;
4577                                     }
4578                                 }
4579
4580                                 /* And any of LV, LVT, or V can be followed
4581                                  * by T*            */
4582                                 while (locinput < PL_regeol
4583                                        && (len = is_GCB_T_utf8(locinput)))
4584                                 {
4585                                     locinput += len;
4586                                 }
4587                             }
4588                         }
4589                     }
4590
4591                     /* Match any extender */
4592                     while (locinput < PL_regeol
4593                             && swash_fetch(PL_utf8_X_extend,
4594                                             (U8*)locinput, utf8_target))
4595                     {
4596                         locinput += UTF8SKIP(locinput);
4597                     }
4598                 }
4599             exit_utf8:
4600                 if (locinput > PL_regeol) sayNO;
4601             }
4602             break;
4603
4604         case NREFFL:  /*  /\g{name}/il  */
4605         {   /* The capture buffer cases.  The ones beginning with N for the
4606                named buffers just convert to the equivalent numbered and
4607                pretend they were called as the corresponding numbered buffer
4608                op.  */
4609             /* don't initialize these in the declaration, it makes C++
4610                unhappy */
4611             char *s;
4612             char type;
4613             re_fold_t folder;
4614             const U8 *fold_array;
4615             UV utf8_fold_flags;
4616
4617             PL_reg_flags |= RF_tainted;
4618             folder = foldEQ_locale;
4619             fold_array = PL_fold_locale;
4620             type = REFFL;
4621             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4622             goto do_nref;
4623
4624         case NREFFA:  /*  /\g{name}/iaa  */
4625             folder = foldEQ_latin1;
4626             fold_array = PL_fold_latin1;
4627             type = REFFA;
4628             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4629             goto do_nref;
4630
4631         case NREFFU:  /*  /\g{name}/iu  */
4632             folder = foldEQ_latin1;
4633             fold_array = PL_fold_latin1;
4634             type = REFFU;
4635             utf8_fold_flags = 0;
4636             goto do_nref;
4637
4638         case NREFF:  /*  /\g{name}/i  */
4639             folder = foldEQ;
4640             fold_array = PL_fold;
4641             type = REFF;
4642             utf8_fold_flags = 0;
4643             goto do_nref;
4644
4645         case NREF:  /*  /\g{name}/   */
4646             type = REF;
4647             folder = NULL;
4648             fold_array = NULL;
4649             utf8_fold_flags = 0;
4650           do_nref:
4651
4652             /* For the named back references, find the corresponding buffer
4653              * number */
4654             n = reg_check_named_buff_matched(rex,scan);
4655
4656             if ( ! n ) {
4657                 sayNO;
4658             }
4659             goto do_nref_ref_common;
4660
4661         case REFFL:  /*  /\1/il  */
4662             PL_reg_flags |= RF_tainted;
4663             folder = foldEQ_locale;
4664             fold_array = PL_fold_locale;
4665             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4666             goto do_ref;
4667
4668         case REFFA:  /*  /\1/iaa  */
4669             folder = foldEQ_latin1;
4670             fold_array = PL_fold_latin1;
4671             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4672             goto do_ref;
4673
4674         case REFFU:  /*  /\1/iu  */
4675             folder = foldEQ_latin1;
4676             fold_array = PL_fold_latin1;
4677             utf8_fold_flags = 0;
4678             goto do_ref;
4679
4680         case REFF:  /*  /\1/i  */
4681             folder = foldEQ;
4682             fold_array = PL_fold;
4683             utf8_fold_flags = 0;
4684             goto do_ref;
4685
4686         case REF:  /*  /\1/    */
4687             folder = NULL;
4688             fold_array = NULL;
4689             utf8_fold_flags = 0;
4690
4691           do_ref:
4692             type = OP(scan);
4693             n = ARG(scan);  /* which paren pair */
4694
4695           do_nref_ref_common:
4696             ln = rex->offs[n].start;
4697             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4698             if (rex->lastparen < n || ln == -1)
4699                 sayNO;                  /* Do not match unless seen CLOSEn. */
4700             if (ln == rex->offs[n].end)
4701                 break;
4702
4703             s = PL_bostr + ln;
4704             if (type != REF     /* REF can do byte comparison */
4705                 && (utf8_target || type == REFFU))
4706             { /* XXX handle REFFL better */
4707                 char * limit = PL_regeol;
4708
4709                 /* This call case insensitively compares the entire buffer
4710                     * at s, with the current input starting at locinput, but
4711                     * not going off the end given by PL_regeol, and returns in
4712                     * <limit> upon success, how much of the current input was
4713                     * matched */
4714                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4715                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4716                 {
4717                     sayNO;
4718                 }
4719                 locinput = limit;
4720                 break;
4721             }
4722
4723             /* Not utf8:  Inline the first character, for speed. */
4724             if (!NEXTCHR_IS_EOS &&
4725                 UCHARAT(s) != nextchr &&
4726                 (type == REF ||
4727                  UCHARAT(s) != fold_array[nextchr]))
4728                 sayNO;
4729             ln = rex->offs[n].end - ln;
4730             if (locinput + ln > PL_regeol)
4731                 sayNO;
4732             if (ln > 1 && (type == REF
4733                            ? memNE(s, locinput, ln)
4734                            : ! folder(s, locinput, ln)))
4735                 sayNO;
4736             locinput += ln;
4737             break;
4738         }
4739
4740         case NOTHING: /* null op; e.g. the 'nothing' following
4741                        * the '*' in m{(a+|b)*}' */
4742             break;
4743         case TAIL: /* placeholder while compiling (A|B|C) */
4744             break;
4745
4746         case BACK: /* ??? doesn't appear to be used ??? */
4747             break;
4748
4749 #undef  ST
4750 #define ST st->u.eval
4751         {
4752             SV *ret;
4753             REGEXP *re_sv;
4754             regexp *re;
4755             regexp_internal *rei;
4756             regnode *startpoint;
4757
4758         case GOSTART: /*  (?R)  */
4759         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4760             if (cur_eval && cur_eval->locinput==locinput) {
4761                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4762                     Perl_croak(aTHX_ "Infinite recursion in regex");
4763                 if ( ++nochange_depth > max_nochange_depth )
4764                     Perl_croak(aTHX_
4765                         "Pattern subroutine nesting without pos change"
4766                         " exceeded limit in regex");
4767             } else {
4768                 nochange_depth = 0;
4769             }
4770             re_sv = rex_sv;
4771             re = rex;
4772             rei = rexi;
4773             if (OP(scan)==GOSUB) {
4774                 startpoint = scan + ARG2L(scan);
4775                 ST.close_paren = ARG(scan);
4776             } else {
4777                 startpoint = rei->program+1;
4778                 ST.close_paren = 0;
4779             }
4780             goto eval_recurse_doit;
4781             assert(0); /* NOTREACHED */
4782
4783         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4784             if (cur_eval && cur_eval->locinput==locinput) {
4785                 if ( ++nochange_depth > max_nochange_depth )
4786                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4787             } else {
4788                 nochange_depth = 0;
4789             }
4790             {
4791                 /* execute the code in the {...} */
4792
4793                 dSP;
4794                 PADOFFSET before;
4795                 OP * const oop = PL_op;
4796                 COP * const ocurcop = PL_curcop;
4797                 OP *nop;
4798                 char *saved_regeol = PL_regeol;
4799                 struct re_save_state saved_state;
4800                 CV *newcv;
4801
4802                 /* save *all* paren positions */
4803                 regcppush(rex, 0);
4804                 REGCP_SET(runops_cp);
4805
4806                 /* To not corrupt the existing regex state while executing the
4807                  * eval we would normally put it on the save stack, like with
4808                  * save_re_context. However, re-evals have a weird scoping so we
4809                  * can't just add ENTER/LEAVE here. With that, things like
4810                  *
4811                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4812                  *
4813                  * would break, as they expect the localisation to be unwound
4814                  * only when the re-engine backtracks through the bit that
4815                  * localised it.
4816                  *
4817                  * What we do instead is just saving the state in a local c
4818                  * variable.
4819                  */
4820                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4821
4822                 PL_reg_state.re_reparsing = FALSE;
4823
4824                 if (!caller_cv)
4825                     caller_cv = find_runcv(NULL);
4826
4827                 n = ARG(scan);
4828
4829                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4830                     newcv = (ReANY(
4831                                                 (REGEXP*)(rexi->data->data[n])
4832                                             ))->qr_anoncv
4833                                         ;
4834                     nop = (OP*)rexi->data->data[n+1];
4835                 }
4836                 else if (rexi->data->what[n] == 'l') { /* literal code */
4837                     newcv = caller_cv;
4838                     nop = (OP*)rexi->data->data[n];
4839                     assert(CvDEPTH(newcv));
4840                 }
4841                 else {
4842                     /* literal with own CV */
4843                     assert(rexi->data->what[n] == 'L');
4844                     newcv = rex->qr_anoncv;
4845                     nop = (OP*)rexi->data->data[n];
4846                 }
4847
4848                 /* normally if we're about to execute code from the same
4849                  * CV that we used previously, we just use the existing
4850                  * CX stack entry. However, its possible that in the
4851                  * meantime we may have backtracked, popped from the save
4852                  * stack, and undone the SAVECOMPPAD(s) associated with
4853                  * PUSH_MULTICALL; in which case PL_comppad no longer
4854                  * points to newcv's pad. */
4855                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4856                 {
4857                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4858                     if (last_pushed_cv) {
4859                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4860                     }
4861                     else {
4862                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4863                     }
4864                     last_pushed_cv = newcv;
4865                 }
4866                 else {
4867                     /* these assignments are just to silence compiler
4868                      * warnings */
4869                     multicall_cop = NULL;
4870                     newsp = NULL;
4871                 }
4872                 last_pad = PL_comppad;
4873
4874                 /* the initial nextstate you would normally execute
4875                  * at the start of an eval (which would cause error
4876                  * messages to come from the eval), may be optimised
4877                  * away from the execution path in the regex code blocks;
4878                  * so manually set PL_curcop to it initially */
4879                 {
4880                     OP *o = cUNOPx(nop)->op_first;
4881                     assert(o->op_type == OP_NULL);
4882                     if (o->op_targ == OP_SCOPE) {
4883                         o = cUNOPo->op_first;
4884                     }
4885                     else {
4886                         assert(o->op_targ == OP_LEAVE);
4887                         o = cUNOPo->op_first;
4888                         assert(o->op_type == OP_ENTER);
4889                         o = o->op_sibling;
4890                     }
4891
4892                     if (o->op_type != OP_STUB) {
4893                         assert(    o->op_type == OP_NEXTSTATE
4894                                 || o->op_type == OP_DBSTATE
4895                                 || (o->op_type == OP_NULL
4896                                     &&  (  o->op_targ == OP_NEXTSTATE
4897                                         || o->op_targ == OP_DBSTATE
4898                                         )
4899                                     )
4900                         );
4901                         PL_curcop = (COP*)o;
4902                     }
4903                 }
4904                 nop = nop->op_next;
4905
4906                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4907                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4908
4909                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4910
4911                 if (sv_yes_mark) {
4912                     SV *sv_mrk = get_sv("REGMARK", 1);
4913                     sv_setsv(sv_mrk, sv_yes_mark);
4914                 }
4915
4916                 /* we don't use MULTICALL here as we want to call the
4917                  * first op of the block of interest, rather than the
4918                  * first op of the sub */
4919                 before = SP-PL_stack_base;
4920                 PL_op = nop;
4921                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4922                 SPAGAIN;
4923                 if (SP-PL_stack_base == before)
4924                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4925                 else {
4926                     ret = POPs;
4927                     PUTBACK;
4928                 }
4929
4930                 /* before restoring everything, evaluate the returned
4931                  * value, so that 'uninit' warnings don't use the wrong
4932                  * PL_op or pad. Also need to process any magic vars
4933                  * (e.g. $1) *before* parentheses are restored */
4934
4935                 PL_op = NULL;
4936
4937                 re_sv = NULL;
4938                 if (logical == 0)        /*   (?{})/   */
4939                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4940                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4941                     sw = cBOOL(SvTRUE(ret));
4942                     logical = 0;
4943                 }
4944                 else {                   /*  /(??{})  */
4945                     /*  if its overloaded, let the regex compiler handle
4946                      *  it; otherwise extract regex, or stringify  */
4947                     if (!SvAMAGIC(ret)) {
4948                         SV *sv = ret;
4949                         if (SvROK(sv))
4950                             sv = SvRV(sv);
4951                         if (SvTYPE(sv) == SVt_REGEXP)
4952                             re_sv = (REGEXP*) sv;
4953                         else if (SvSMAGICAL(sv)) {
4954                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4955                             if (mg)
4956                                 re_sv = (REGEXP *) mg->mg_obj;
4957                         }
4958
4959                         /* force any magic, undef warnings here */
4960                         if (!re_sv) {
4961                             ret = sv_mortalcopy(ret);
4962                             (void) SvPV_force_nolen(ret);
4963                         }
4964                     }
4965
4966                 }
4967
4968                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4969
4970                 /* *** Note that at this point we don't restore
4971                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4972                  * be used again soon. This is safe as long as nothing
4973                  * in the regexp code uses the pad ! */
4974                 PL_op = oop;
4975                 PL_curcop = ocurcop;
4976                 PL_regeol = saved_regeol;
4977                 S_regcp_restore(aTHX_ rex, runops_cp);
4978
4979                 if (logical != 2)
4980                     break;
4981             }
4982
4983                 /* only /(??{})/  from now on */
4984                 logical = 0;
4985                 {
4986                     /* extract RE object from returned value; compiling if
4987                      * necessary */
4988
4989                     if (re_sv) {
4990                         re_sv = reg_temp_copy(NULL, re_sv);
4991                     }
4992                     else {
4993                         U32 pm_flags = 0;
4994                         const I32 osize = PL_regsize;
4995
4996                         if (SvUTF8(ret) && IN_BYTES) {
4997                             /* In use 'bytes': make a copy of the octet
4998                              * sequence, but without the flag on */
4999                             STRLEN len;
5000                             const char *const p = SvPV(ret, len);
5001                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5002                         }
5003                         if (rex->intflags & PREGf_USE_RE_EVAL)
5004                             pm_flags |= PMf_USE_RE_EVAL;
5005
5006                         /* if we got here, it should be an engine which
5007                          * supports compiling code blocks and stuff */
5008                         assert(rex->engine && rex->engine->op_comp);
5009                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5010                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5011                                     rex->engine, NULL, NULL,
5012                                     /* copy /msix etc to inner pattern */
5013                                     scan->flags,
5014                                     pm_flags);
5015
5016                         if (!(SvFLAGS(ret)
5017                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5018                                  | SVs_GMG))) {
5019                             /* This isn't a first class regexp. Instead, it's
5020                                caching a regexp onto an existing, Perl visible
5021                                scalar.  */
5022                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5023                         }
5024                         PL_regsize = osize;
5025                         /* safe to do now that any $1 etc has been
5026                          * interpolated into the new pattern string and
5027                          * compiled */
5028                         S_regcp_restore(aTHX_ rex, runops_cp);
5029                     }
5030                     SAVEFREESV(re_sv);
5031                     re = ReANY(re_sv);
5032                 }
5033                 RXp_MATCH_COPIED_off(re);
5034                 re->subbeg = rex->subbeg;
5035                 re->sublen = rex->sublen;
5036                 re->suboffset = rex->suboffset;
5037                 re->subcoffset = rex->subcoffset;
5038                 rei = RXi_GET(re);
5039                 DEBUG_EXECUTE_r(
5040                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5041                         "Matching embedded");
5042                 );
5043                 startpoint = rei->program + 1;
5044                 ST.close_paren = 0; /* only used for GOSUB */
5045
5046         eval_recurse_doit: /* Share code with GOSUB below this line */
5047                 /* run the pattern returned from (??{...}) */
5048                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
5049                 REGCP_SET(ST.lastcp);
5050
5051                 re->lastparen = 0;
5052                 re->lastcloseparen = 0;
5053
5054                 PL_regsize = 0;
5055
5056                 /* XXXX This is too dramatic a measure... */
5057                 PL_reg_maxiter = 0;
5058
5059                 ST.toggle_reg_flags = PL_reg_flags;
5060                 if (RX_UTF8(re_sv))
5061                     PL_reg_flags |= RF_utf8;
5062                 else
5063                     PL_reg_flags &= ~RF_utf8;
5064                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5065
5066                 ST.prev_rex = rex_sv;
5067                 ST.prev_curlyx = cur_curlyx;
5068                 rex_sv = re_sv;
5069                 SET_reg_curpm(rex_sv);
5070                 rex = re;
5071                 rexi = rei;
5072                 cur_curlyx = NULL;
5073                 ST.B = next;
5074                 ST.prev_eval = cur_eval;
5075                 cur_eval = st;
5076                 /* now continue from first node in postoned RE */
5077                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5078                 assert(0); /* NOTREACHED */
5079         }
5080
5081         case EVAL_AB: /* cleanup after a successful (??{A})B */
5082             /* note: this is called twice; first after popping B, then A */
5083             PL_reg_flags ^= ST.toggle_reg_flags;
5084             rex_sv = ST.prev_rex;
5085             SET_reg_curpm(rex_sv);
5086             rex = ReANY(rex_sv);
5087             rexi = RXi_GET(rex);
5088             regcpblow(ST.cp);
5089             cur_eval = ST.prev_eval;
5090             cur_curlyx = ST.prev_curlyx;
5091
5092             /* XXXX This is too dramatic a measure... */
5093             PL_reg_maxiter = 0;
5094             if ( nochange_depth )
5095                 nochange_depth--;
5096             sayYES;
5097
5098
5099         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5100             /* note: this is called twice; first after popping B, then A */
5101             PL_reg_flags ^= ST.toggle_reg_flags;
5102             rex_sv = ST.prev_rex;
5103             SET_reg_curpm(rex_sv);
5104             rex = ReANY(rex_sv);
5105             rexi = RXi_GET(rex);
5106
5107             REGCP_UNWIND(ST.lastcp);
5108             regcppop(rex);
5109             cur_eval = ST.prev_eval;
5110             cur_curlyx = ST.prev_curlyx;
5111             /* XXXX This is too dramatic a measure... */
5112             PL_reg_maxiter = 0;
5113             if ( nochange_depth )
5114                 nochange_depth--;
5115             sayNO_SILENT;
5116 #undef ST
5117
5118         case OPEN: /*  (  */
5119             n = ARG(scan);  /* which paren pair */
5120             rex->offs[n].start_tmp = locinput - PL_bostr;
5121             if (n > PL_regsize)
5122                 PL_regsize = n;
5123             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5124                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5125                 PTR2UV(rex),
5126                 PTR2UV(rex->offs),
5127                 (UV)n,
5128                 (IV)rex->offs[n].start_tmp,
5129                 (UV)PL_regsize
5130             ));
5131             lastopen = n;
5132             break;
5133
5134 /* XXX really need to log other places start/end are set too */
5135 #define CLOSE_CAPTURE \
5136     rex->offs[n].start = rex->offs[n].start_tmp; \
5137     rex->offs[n].end = locinput - PL_bostr; \
5138     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5139         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5140         PTR2UV(rex), \
5141         PTR2UV(rex->offs), \
5142         (UV)n, \
5143         (IV)rex->offs[n].start, \
5144         (IV)rex->offs[n].end \
5145     ))
5146
5147         case CLOSE:  /*  )  */
5148             n = ARG(scan);  /* which paren pair */
5149             CLOSE_CAPTURE;
5150             /*if (n > PL_regsize)
5151                 PL_regsize = n;*/
5152             if (n > rex->lastparen)
5153                 rex->lastparen = n;
5154             rex->lastcloseparen = n;
5155             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5156                 goto fake_end;
5157             }
5158             break;
5159
5160         case ACCEPT:  /*  (*ACCEPT)  */
5161             if (ARG(scan)){
5162                 regnode *cursor;
5163                 for (cursor=scan;
5164                      cursor && OP(cursor)!=END;
5165                      cursor=regnext(cursor))
5166                 {
5167                     if ( OP(cursor)==CLOSE ){
5168                         n = ARG(cursor);
5169                         if ( n <= lastopen ) {
5170                             CLOSE_CAPTURE;
5171                             /*if (n > PL_regsize)
5172                             PL_regsize = n;*/
5173                             if (n > rex->lastparen)
5174                                 rex->lastparen = n;
5175                             rex->lastcloseparen = n;
5176                             if ( n == ARG(scan) || (cur_eval &&
5177                                 cur_eval->u.eval.close_paren == n))
5178                                 break;
5179                         }
5180                     }
5181                 }
5182             }
5183             goto fake_end;
5184             /*NOTREACHED*/
5185
5186         case GROUPP:  /*  (?(1))  */
5187             n = ARG(scan);  /* which paren pair */
5188             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5189             break;
5190
5191         case NGROUPP:  /*  (?(<name>))  */
5192             /* reg_check_named_buff_matched returns 0 for no match */
5193             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5194             break;
5195
5196         case INSUBP:   /*  (?(R))  */
5197             n = ARG(scan);
5198             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5199             break;
5200
5201         case DEFINEP:  /*  (?(DEFINE))  */
5202             sw = 0;
5203             break;
5204
5205         case IFTHEN:   /*  (?(cond)A|B)  */
5206             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5207             if (sw)
5208                 next = NEXTOPER(NEXTOPER(scan));
5209             else {
5210                 next = scan + ARG(scan);
5211                 if (OP(next) == IFTHEN) /* Fake one. */
5212                     next = NEXTOPER(NEXTOPER(next));
5213             }
5214             break;
5215
5216         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5217             logical = scan->flags;
5218             break;
5219
5220 /*******************************************************************
5221
5222 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5223 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5224 STAR/PLUS/CURLY/CURLYN are used instead.)
5225
5226 A*B is compiled as <CURLYX><A><WHILEM><B>
5227
5228 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5229 state, which contains the current count, initialised to -1. It also sets
5230 cur_curlyx to point to this state, with any previous value saved in the
5231 state block.
5232
5233 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5234 since the pattern may possibly match zero times (i.e. it's a while {} loop
5235 rather than a do {} while loop).
5236
5237 Each entry to WHILEM represents a successful match of A. The count in the
5238 CURLYX block is incremented, another WHILEM state is pushed, and execution
5239 passes to A or B depending on greediness and the current count.
5240
5241 For example, if matching against the string a1a2a3b (where the aN are
5242 substrings that match /A/), then the match progresses as follows: (the
5243 pushed states are interspersed with the bits of strings matched so far):
5244
5245     <CURLYX cnt=-1>
5246     <CURLYX cnt=0><WHILEM>
5247     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5248     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5249     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5250     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5251
5252 (Contrast this with something like CURLYM, which maintains only a single
5253 backtrack state:
5254
5255     <CURLYM cnt=0> a1
5256     a1 <CURLYM cnt=1> a2
5257     a1 a2 <CURLYM cnt=2> a3
5258     a1 a2 a3 <CURLYM cnt=3> b
5259 )
5260
5261 Each WHILEM state block marks a point to backtrack to upon partial failure
5262 of A or B, and also contains some minor state data related to that
5263 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5264 overall state, such as the count, and pointers to the A and B ops.
5265
5266 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5267 must always point to the *current* CURLYX block, the rules are:
5268
5269 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5270 and set cur_curlyx to point the new block.
5271
5272 When popping the CURLYX block after a successful or unsuccessful match,
5273 restore the previous cur_curlyx.
5274
5275 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5276 to the outer one saved in the CURLYX block.
5277
5278 When popping the WHILEM block after a successful or unsuccessful B match,
5279 restore the previous cur_curlyx.
5280
5281 Here's an example for the pattern (AI* BI)*BO
5282 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5283
5284 cur_
5285 curlyx backtrack stack
5286 ------ ---------------
5287 NULL
5288 CO     <CO prev=NULL> <WO>
5289 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5290 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5291 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5292
5293 At this point the pattern succeeds, and we work back down the stack to
5294 clean up, restoring as we go:
5295
5296 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5297 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5298 CO     <CO prev=NULL> <WO>
5299 NULL
5300
5301 *******************************************************************/
5302
5303 #define ST st->u.curlyx
5304
5305         case CURLYX:    /* start of /A*B/  (for complex A) */
5306         {
5307             /* No need to save/restore up to this paren */
5308             I32 parenfloor = scan->flags;
5309
5310             assert(next); /* keep Coverity happy */
5311             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5312                 next += ARG(next);
5313
5314             /* XXXX Probably it is better to teach regpush to support
5315                parenfloor > PL_regsize... */
5316             if (parenfloor > (I32)rex->lastparen)
5317                 parenfloor = rex->lastparen; /* Pessimization... */
5318
5319             ST.prev_curlyx= cur_curlyx;
5320             cur_curlyx = st;
5321             ST.cp = PL_savestack_ix;
5322
5323             /* these fields contain the state of the current curly.
5324              * they are accessed by subsequent WHILEMs */
5325             ST.parenfloor = parenfloor;
5326             ST.me = scan;
5327             ST.B = next;
5328             ST.minmod = minmod;
5329             minmod = 0;
5330             ST.count = -1;      /* this will be updated by WHILEM */
5331             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5332
5333             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5334             assert(0); /* NOTREACHED */
5335         }
5336
5337         case CURLYX_end: /* just finished matching all of A*B */
5338             cur_curlyx = ST.prev_curlyx;
5339             sayYES;
5340             assert(0); /* NOTREACHED */
5341
5342         case CURLYX_end_fail: /* just failed to match all of A*B */
5343             regcpblow(ST.cp);
5344             cur_curlyx = ST.prev_curlyx;
5345             sayNO;
5346             assert(0); /* NOTREACHED */
5347
5348
5349 #undef ST
5350 #define ST st->u.whilem
5351
5352         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5353         {
5354             /* see the discussion above about CURLYX/WHILEM */
5355             I32 n;
5356             int min = ARG1(cur_curlyx->u.curlyx.me);
5357             int max = ARG2(cur_curlyx->u.curlyx.me);
5358             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5359
5360             assert(cur_curlyx); /* keep Coverity happy */
5361             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5362             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5363             ST.cache_offset = 0;
5364             ST.cache_mask = 0;
5365
5366
5367             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5368                   "%*s  whilem: matched %ld out of %d..%d\n",
5369                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5370             );
5371
5372             /* First just match a string of min A's. */
5373
5374             if (n < min) {
5375                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5376                 cur_curlyx->u.curlyx.lastloc = locinput;
5377                 REGCP_SET(ST.lastcp);
5378
5379                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5380                 assert(0); /* NOTREACHED */
5381             }
5382
5383             /* If degenerate A matches "", assume A done. */
5384
5385             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5386                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5387                    "%*s  whilem: empty match detected, trying continuation...\n",
5388                    REPORT_CODE_OFF+depth*2, "")
5389                 );
5390                 goto do_whilem_B_max;
5391             }
5392
5393             /* super-linear cache processing */
5394
5395             if (scan->flags) {
5396
5397                 if (!PL_reg_maxiter) {
5398                     /* start the countdown: Postpone detection until we
5399                      * know the match is not *that* much linear. */
5400                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5401                     /* possible overflow for long strings and many CURLYX's */
5402                     if (PL_reg_maxiter < 0)
5403                         PL_reg_maxiter = I32_MAX;
5404                     PL_reg_leftiter = PL_reg_maxiter;
5405                 }
5406
5407                 if (PL_reg_leftiter-- == 0) {
5408                     /* initialise cache */
5409                     const I32 size = (PL_reg_maxiter + 7)/8;
5410                     if (PL_reg_poscache) {
5411                         if ((I32)PL_reg_poscache_size < size) {
5412                             Renew(PL_reg_poscache, size, char);
5413                             PL_reg_poscache_size = size;
5414                         }
5415                         Zero(PL_reg_poscache, size, char);
5416                     }
5417                     else {
5418                         PL_reg_poscache_size = size;
5419                         Newxz(PL_reg_poscache, size, char);
5420                     }
5421                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5422       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5423                               PL_colors[4], PL_colors[5])
5424                     );
5425                 }
5426
5427                 if (PL_reg_leftiter < 0) {
5428                     /* have we already failed at this position? */
5429                     I32 offset, mask;
5430                     offset  = (scan->flags & 0xf) - 1
5431                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5432                     mask    = 1 << (offset % 8);
5433                     offset /= 8;
5434                     if (PL_reg_poscache[offset] & mask) {
5435                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5436                             "%*s  whilem: (cache) already tried at this position...\n",
5437                             REPORT_CODE_OFF+depth*2, "")
5438                         );
5439                         sayNO; /* cache records failure */
5440                     }
5441                     ST.cache_offset = offset;
5442                     ST.cache_mask   = mask;
5443                 }
5444             }
5445
5446             /* Prefer B over A for minimal matching. */
5447
5448             if (cur_curlyx->u.curlyx.minmod) {
5449                 ST.save_curlyx = cur_curlyx;
5450                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5451                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5452                 REGCP_SET(ST.lastcp);
5453                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5454                                     locinput);
5455                 assert(0); /* NOTREACHED */
5456             }
5457
5458             /* Prefer A over B for maximal matching. */
5459
5460             if (n < max) { /* More greed allowed? */
5461                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5462                 cur_curlyx->u.curlyx.lastloc = locinput;
5463                 REGCP_SET(ST.lastcp);
5464                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5465                 assert(0); /* NOTREACHED */
5466             }
5467             goto do_whilem_B_max;
5468         }
5469         assert(0); /* NOTREACHED */
5470
5471         case WHILEM_B_min: /* just matched B in a minimal match */
5472         case WHILEM_B_max: /* just matched B in a maximal match */
5473             cur_curlyx = ST.save_curlyx;
5474             sayYES;
5475             assert(0); /* NOTREACHED */
5476
5477         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5478             cur_curlyx = ST.save_curlyx;
5479             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5480             cur_curlyx->u.curlyx.count--;
5481             CACHEsayNO;
5482             assert(0); /* NOTREACHED */
5483
5484         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5485             /* FALL THROUGH */
5486         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5487             REGCP_UNWIND(ST.lastcp);
5488             regcppop(rex);
5489             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5490             cur_curlyx->u.curlyx.count--;
5491             CACHEsayNO;
5492             assert(0); /* NOTREACHED */
5493
5494         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5495             REGCP_UNWIND(ST.lastcp);
5496             regcppop(rex);      /* Restore some previous $<digit>s? */
5497             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5498                 "%*s  whilem: failed, trying continuation...\n",
5499                 REPORT_CODE_OFF+depth*2, "")
5500             );
5501           do_whilem_B_max:
5502             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5503                 && ckWARN(WARN_REGEXP)
5504                 && !(PL_reg_flags & RF_warned))
5505             {
5506                 PL_reg_flags |= RF_warned;
5507                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5508                      "Complex regular subexpression recursion limit (%d) "
5509                      "exceeded",
5510                      REG_INFTY - 1);
5511             }
5512
5513             /* now try B */
5514             ST.save_curlyx = cur_curlyx;
5515             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5516             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5517                                 locinput);
5518             assert(0); /* NOTREACHED */
5519
5520         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5521             cur_curlyx = ST.save_curlyx;
5522             REGCP_UNWIND(ST.lastcp);
5523             regcppop(rex);
5524
5525             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5526                 /* Maximum greed exceeded */
5527                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5528                     && ckWARN(WARN_REGEXP)
5529                     && !(PL_reg_flags & RF_warned))
5530                 {
5531                     PL_reg_flags |= RF_warned;
5532                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5533                         "Complex regular subexpression recursion "
5534                         "limit (%d) exceeded",
5535                         REG_INFTY - 1);
5536                 }
5537                 cur_curlyx->u.curlyx.count--;
5538                 CACHEsayNO;
5539             }
5540
5541             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5542                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5543             );
5544             /* Try grabbing another A and see if it helps. */
5545             cur_curlyx->u.curlyx.lastloc = locinput;
5546             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5547             REGCP_SET(ST.lastcp);
5548             PUSH_STATE_GOTO(WHILEM_A_min,
5549                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5550                 locinput);
5551             assert(0); /* NOTREACHED */
5552
5553 #undef  ST
5554 #define ST st->u.branch
5555
5556         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5557             next = scan + ARG(scan);
5558             if (next == scan)
5559                 next = NULL;
5560             scan = NEXTOPER(scan);
5561             /* FALL THROUGH */
5562
5563         case BRANCH:        /*  /(...|A|...)/ */
5564             scan = NEXTOPER(scan); /* scan now points to inner node */
5565             ST.lastparen = rex->lastparen;
5566             ST.lastcloseparen = rex->lastcloseparen;
5567             ST.next_branch = next;
5568             REGCP_SET(ST.cp);
5569
5570             /* Now go into the branch */
5571             if (has_cutgroup) {
5572                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5573             } else {
5574                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5575             }
5576             assert(0); /* NOTREACHED */
5577
5578         case CUTGROUP:  /*  /(*THEN)/  */
5579             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5580                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5581             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5582             assert(0); /* NOTREACHED */
5583
5584         case CUTGROUP_next_fail:
5585             do_cutgroup = 1;
5586             no_final = 1;
5587             if (st->u.mark.mark_name)
5588                 sv_commit = st->u.mark.mark_name;
5589             sayNO;
5590             assert(0); /* NOTREACHED */
5591
5592         case BRANCH_next:
5593             sayYES;
5594             assert(0); /* NOTREACHED */
5595
5596         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5597             if (do_cutgroup) {
5598                 do_cutgroup = 0;
5599                 no_final = 0;
5600             }
5601             REGCP_UNWIND(ST.cp);
5602             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5603             scan = ST.next_branch;
5604             /* no more branches? */
5605             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5606                 DEBUG_EXECUTE_r({
5607                     PerlIO_printf( Perl_debug_log,
5608                         "%*s  %sBRANCH failed...%s\n",
5609                         REPORT_CODE_OFF+depth*2, "",
5610                         PL_colors[4],
5611                         PL_colors[5] );
5612                 });
5613                 sayNO_SILENT;
5614             }
5615             continue; /* execute next BRANCH[J] op */
5616             assert(0); /* NOTREACHED */
5617
5618         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5619             minmod = 1;
5620             break;
5621
5622 #undef  ST
5623 #define ST st->u.curlym
5624
5625         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5626
5627             /* This is an optimisation of CURLYX that enables us to push
5628              * only a single backtracking state, no matter how many matches
5629              * there are in {m,n}. It relies on the pattern being constant
5630              * length, with no parens to influence future backrefs
5631              */
5632
5633             ST.me = scan;
5634             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5635
5636             ST.lastparen      = rex->lastparen;
5637             ST.lastcloseparen = rex->lastcloseparen;
5638
5639             /* if paren positive, emulate an OPEN/CLOSE around A */
5640             if (ST.me->flags) {
5641                 U32 paren = ST.me->flags;
5642                 if (paren > PL_regsize)
5643                     PL_regsize = paren;
5644                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5645             }
5646             ST.A = scan;
5647             ST.B = next;
5648             ST.alen = 0;
5649             ST.count = 0;
5650             ST.minmod = minmod;
5651             minmod = 0;
5652             ST.c1 = CHRTEST_UNINIT;
5653             REGCP_SET(ST.cp);
5654
5655             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5656                 goto curlym_do_B;
5657
5658           curlym_do_A: /* execute the A in /A{m,n}B/  */
5659             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5660             assert(0); /* NOTREACHED */
5661
5662         case CURLYM_A: /* we've just matched an A */
5663             ST.count++;
5664             /* after first match, determine A's length: u.curlym.alen */
5665             if (ST.count == 1) {
5666                 if (PL_reg_match_utf8) {
5667                     char *s = st->locinput;
5668                     while (s < locinput) {
5669                         ST.alen++;
5670                         s += UTF8SKIP(s);
5671                     }
5672                 }
5673                 else {
5674                     ST.alen = locinput - st->locinput;
5675                 }
5676                 if (ST.alen == 0)
5677                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5678             }
5679             DEBUG_EXECUTE_r(
5680                 PerlIO_printf(Perl_debug_log,
5681                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5682                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5683                           (IV) ST.count, (IV)ST.alen)
5684             );
5685
5686             if (cur_eval && cur_eval->u.eval.close_paren &&
5687                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5688                 goto fake_end;
5689
5690             {
5691                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5692                 if ( max == REG_INFTY || ST.count < max )
5693                     goto curlym_do_A; /* try to match another A */
5694             }
5695             goto curlym_do_B; /* try to match B */
5696
5697         case CURLYM_A_fail: /* just failed to match an A */
5698             REGCP_UNWIND(ST.cp);
5699
5700             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5701                 || (cur_eval && cur_eval->u.eval.close_paren &&
5702                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5703                 sayNO;
5704
5705           curlym_do_B: /* execute the B in /A{m,n}B/  */
5706             if (ST.c1 == CHRTEST_UNINIT) {
5707                 /* calculate c1 and c2 for possible match of 1st char
5708                  * following curly */
5709                 ST.c1 = ST.c2 = CHRTEST_VOID;
5710                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5711                     regnode *text_node = ST.B;
5712                     if (! HAS_TEXT(text_node))
5713                         FIND_NEXT_IMPT(text_node);
5714                     /* this used to be
5715
5716                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5717
5718                         But the former is redundant in light of the latter.
5719
5720                         if this changes back then the macro for
5721                         IS_TEXT and friends need to change.
5722                      */
5723                     if (PL_regkind[OP(text_node)] == EXACT) {
5724                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5725                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5726                         {
5727                             sayNO;
5728                         }
5729                     }
5730                 }
5731             }
5732
5733             DEBUG_EXECUTE_r(
5734                 PerlIO_printf(Perl_debug_log,
5735                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5736                     (int)(REPORT_CODE_OFF+(depth*2)),
5737                     "", (IV)ST.count)
5738                 );
5739             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5740                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5741                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5742                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5743                     {
5744                         /* simulate B failing */
5745                         DEBUG_OPTIMISE_r(
5746                             PerlIO_printf(Perl_debug_log,
5747                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5748                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5749                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5750                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5751                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5752                         );
5753                         state_num = CURLYM_B_fail;
5754                         goto reenter_switch;
5755                     }
5756                 }
5757                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5758                     /* simulate B failing */
5759                     DEBUG_OPTIMISE_r(
5760                         PerlIO_printf(Perl_debug_log,
5761                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5762                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5763                             (int) nextchr, ST.c1, ST.c2)
5764                     );
5765                     state_num = CURLYM_B_fail;
5766                     goto reenter_switch;
5767                 }
5768             }
5769
5770             if (ST.me->flags) {
5771                 /* emulate CLOSE: mark current A as captured */
5772                 I32 paren = ST.me->flags;
5773                 if (ST.count) {
5774                     rex->offs[paren].start
5775                         = HOPc(locinput, -ST.alen) - PL_bostr;
5776                     rex->offs[paren].end = locinput - PL_bostr;
5777                     if ((U32)paren > rex->lastparen)
5778                         rex->lastparen = paren;
5779                     rex->lastcloseparen = paren;
5780                 }
5781                 else
5782                     rex->offs[paren].end = -1;
5783                 if (cur_eval && cur_eval->u.eval.close_paren &&
5784                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5785                 {
5786                     if (ST.count)
5787                         goto fake_end;
5788                     else
5789                         sayNO;
5790                 }
5791             }
5792
5793             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5794             assert(0); /* NOTREACHED */
5795
5796         case CURLYM_B_fail: /* just failed to match a B */
5797             REGCP_UNWIND(ST.cp);
5798             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5799             if (ST.minmod) {
5800                 I32 max = ARG2(ST.me);
5801                 if (max != REG_INFTY && ST.count == max)
5802                     sayNO;
5803                 goto curlym_do_A; /* try to match a further A */
5804             }
5805             /* backtrack one A */
5806             if (ST.count == ARG1(ST.me) /* min */)
5807                 sayNO;
5808             ST.count--;
5809             SET_locinput(HOPc(locinput, -ST.alen));
5810             goto curlym_do_B; /* try to match B */
5811
5812 #undef ST
5813 #define ST st->u.curly
5814
5815 #define CURLY_SETPAREN(paren, success) \
5816     if (paren) { \
5817         if (success) { \
5818             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5819             rex->offs[paren].end = locinput - PL_bostr; \
5820             if (paren > rex->lastparen) \
5821                 rex->lastparen = paren; \
5822             rex->lastcloseparen = paren; \
5823         } \
5824         else { \
5825             rex->offs[paren].end = -1; \
5826             rex->lastparen      = ST.lastparen; \
5827             rex->lastcloseparen = ST.lastcloseparen; \
5828         } \
5829     }
5830
5831         case STAR:              /*  /A*B/ where A is width 1 char */
5832             ST.paren = 0;
5833             ST.min = 0;
5834             ST.max = REG_INFTY;
5835             scan = NEXTOPER(scan);
5836             goto repeat;
5837
5838         case PLUS:              /*  /A+B/ where A is width 1 char */
5839             ST.paren = 0;
5840             ST.min = 1;
5841             ST.max = REG_INFTY;
5842             scan = NEXTOPER(scan);
5843             goto repeat;
5844
5845         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5846             ST.paren = scan->flags;     /* Which paren to set */
5847             ST.lastparen      = rex->lastparen;
5848             ST.lastcloseparen = rex->lastcloseparen;
5849             if (ST.paren > PL_regsize)
5850                 PL_regsize = ST.paren;
5851             ST.min = ARG1(scan);  /* min to match */
5852             ST.max = ARG2(scan);  /* max to match */
5853             if (cur_eval && cur_eval->u.eval.close_paren &&
5854                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5855                 ST.min=1;
5856                 ST.max=1;
5857             }
5858             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5859             goto repeat;
5860
5861         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5862             ST.paren = 0;
5863             ST.min = ARG1(scan);  /* min to match */
5864             ST.max = ARG2(scan);  /* max to match */
5865             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5866           repeat:
5867             /*
5868             * Lookahead to avoid useless match attempts
5869             * when we know what character comes next.
5870             *
5871             * Used to only do .*x and .*?x, but now it allows
5872             * for )'s, ('s and (?{ ... })'s to be in the way
5873             * of the quantifier and the EXACT-like node.  -- japhy
5874             */
5875
5876             assert(ST.min <= ST.max);
5877             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5878                 ST.c1 = ST.c2 = CHRTEST_VOID;
5879             }
5880             else {
5881                 regnode *text_node = next;
5882
5883                 if (! HAS_TEXT(text_node))
5884                     FIND_NEXT_IMPT(text_node);
5885
5886                 if (! HAS_TEXT(text_node))
5887                     ST.c1 = ST.c2 = CHRTEST_VOID;
5888                 else {
5889                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5890                         ST.c1 = ST.c2 = CHRTEST_VOID;
5891                     }
5892                     else {
5893
5894                     /*  Currently we only get here when
5895
5896                         PL_rekind[OP(text_node)] == EXACT
5897
5898                         if this changes back then the macro for IS_TEXT and
5899                         friends need to change. */
5900                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5901                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5902                         {
5903                             sayNO;
5904                         }
5905                     }
5906                 }
5907             }
5908
5909             ST.A = scan;
5910             ST.B = next;
5911             if (minmod) {
5912                 char *li = locinput;
5913                 minmod = 0;
5914                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5915                     sayNO;
5916                 SET_locinput(li);
5917                 ST.count = ST.min;
5918                 REGCP_SET(ST.cp);
5919                 if (ST.c1 == CHRTEST_VOID)
5920                     goto curly_try_B_min;
5921
5922                 ST.oldloc = locinput;
5923
5924                 /* set ST.maxpos to the furthest point along the
5925                  * string that could possibly match */
5926                 if  (ST.max == REG_INFTY) {
5927                     ST.maxpos = PL_regeol - 1;
5928                     if (utf8_target)
5929                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5930                             ST.maxpos--;
5931                 }
5932                 else if (utf8_target) {
5933                     int m = ST.max - ST.min;
5934                     for (ST.maxpos = locinput;
5935                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5936                         ST.maxpos += UTF8SKIP(ST.maxpos);
5937                 }
5938                 else {
5939                     ST.maxpos = locinput + ST.max - ST.min;
5940                     if (ST.maxpos >= PL_regeol)
5941                         ST.maxpos = PL_regeol - 1;
5942                 }
5943                 goto curly_try_B_min_known;
5944
5945             }
5946             else {
5947                 /* avoid taking address of locinput, so it can remain
5948                  * a register var */
5949                 char *li = locinput;
5950                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5951                 if (ST.count < ST.min)
5952                     sayNO;
5953                 SET_locinput(li);
5954                 if ((ST.count > ST.min)
5955                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5956                 {
5957                     /* A{m,n} must come at the end of the string, there's
5958                      * no point in backing off ... */
5959                     ST.min = ST.count;
5960                     /* ...except that $ and \Z can match before *and* after
5961                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5962                        We may back off by one in this case. */
5963                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5964                         ST.min--;
5965                 }
5966                 REGCP_SET(ST.cp);
5967                 goto curly_try_B_max;
5968             }
5969             assert(0); /* NOTREACHED */
5970
5971
5972         case CURLY_B_min_known_fail:
5973             /* failed to find B in a non-greedy match where c1,c2 valid */
5974
5975             REGCP_UNWIND(ST.cp);
5976             if (ST.paren) {
5977                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5978             }
5979             /* Couldn't or didn't -- move forward. */
5980             ST.oldloc = locinput;
5981             if (utf8_target)
5982                 locinput += UTF8SKIP(locinput);
5983             else
5984                 locinput++;
5985             ST.count++;
5986           curly_try_B_min_known:
5987              /* find the next place where 'B' could work, then call B */
5988             {
5989                 int n;
5990                 if (utf8_target) {
5991                     n = (ST.oldloc == locinput) ? 0 : 1;
5992                     if (ST.c1 == ST.c2) {
5993                         /* set n to utf8_distance(oldloc, locinput) */
5994                         while (locinput <= ST.maxpos
5995                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5996                         {
5997                             locinput += UTF8SKIP(locinput);
5998                             n++;
5999                         }
6000                     }
6001                     else {
6002                         /* set n to utf8_distance(oldloc, locinput) */
6003                         while (locinput <= ST.maxpos
6004                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6005                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6006                         {
6007                             locinput += UTF8SKIP(locinput);
6008                             n++;
6009                         }
6010                     }
6011                 }
6012                 else {  /* Not utf8_target */
6013                     if (ST.c1 == ST.c2) {
6014                         while (locinput <= ST.maxpos &&
6015                                UCHARAT(locinput) != ST.c1)
6016                             locinput++;
6017                     }
6018                     else {
6019                         while (locinput <= ST.maxpos
6020                                && UCHARAT(locinput) != ST.c1
6021                                && UCHARAT(locinput) != ST.c2)
6022                             locinput++;
6023                     }
6024                     n = locinput - ST.oldloc;
6025                 }
6026                 if (locinput > ST.maxpos)
6027                     sayNO;
6028                 if (n) {
6029                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6030                      * at b; check that everything between oldloc and
6031                      * locinput matches */
6032                     char *li = ST.oldloc;
6033                     ST.count += n;
6034                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
6035                         sayNO;
6036                     assert(n == REG_INFTY || locinput == li);
6037                 }
6038                 CURLY_SETPAREN(ST.paren, ST.count);
6039                 if (cur_eval && cur_eval->u.eval.close_paren &&
6040                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6041                     goto fake_end;
6042                 }
6043                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6044             }
6045             assert(0); /* NOTREACHED */
6046
6047
6048         case CURLY_B_min_fail:
6049             /* failed to find B in a non-greedy match where c1,c2 invalid */
6050
6051             REGCP_UNWIND(ST.cp);
6052             if (ST.paren) {
6053                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6054             }
6055             /* failed -- move forward one */
6056             {
6057                 char *li = locinput;
6058                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6059                     sayNO;
6060                 }
6061                 locinput = li;
6062             }
6063             {
6064                 ST.count++;
6065                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6066                         ST.count > 0)) /* count overflow ? */
6067                 {
6068                   curly_try_B_min:
6069                     CURLY_SETPAREN(ST.paren, ST.count);
6070                     if (cur_eval && cur_eval->u.eval.close_paren &&
6071                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6072                         goto fake_end;
6073                     }
6074                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6075                 }
6076             }
6077             sayNO;
6078             assert(0); /* NOTREACHED */
6079
6080
6081         curly_try_B_max:
6082             /* a successful greedy match: now try to match B */
6083             if (cur_eval && cur_eval->u.eval.close_paren &&
6084                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6085                 goto fake_end;
6086             }
6087             {
6088                 bool could_match = locinput < PL_regeol;
6089
6090                 /* If it could work, try it. */
6091                 if (ST.c1 != CHRTEST_VOID && could_match) {
6092                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6093                     {
6094                         could_match = memEQ(locinput,
6095                                             ST.c1_utf8,
6096                                             UTF8SKIP(locinput))
6097                                     || memEQ(locinput,
6098                                              ST.c2_utf8,
6099                                              UTF8SKIP(locinput));
6100                     }
6101                     else {
6102                         could_match = UCHARAT(locinput) == ST.c1
6103                                       || UCHARAT(locinput) == ST.c2;
6104                     }
6105                 }
6106                 if (ST.c1 == CHRTEST_VOID || could_match) {
6107                     CURLY_SETPAREN(ST.paren, ST.count);
6108                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6109                     assert(0); /* NOTREACHED */
6110                 }
6111             }
6112             /* FALL THROUGH */
6113
6114         case CURLY_B_max_fail:
6115             /* failed to find B in a greedy match */
6116
6117             REGCP_UNWIND(ST.cp);
6118             if (ST.paren) {
6119                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6120             }
6121             /*  back up. */
6122             if (--ST.count < ST.min)
6123                 sayNO;
6124             locinput = HOPc(locinput, -1);
6125             goto curly_try_B_max;
6126
6127 #undef ST
6128
6129         case END: /*  last op of main pattern  */
6130             fake_end:
6131             if (cur_eval) {
6132                 /* we've just finished A in /(??{A})B/; now continue with B */
6133                 st->u.eval.toggle_reg_flags
6134                             = cur_eval->u.eval.toggle_reg_flags;
6135                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6136
6137                 st->u.eval.prev_rex = rex_sv;           /* inner */
6138                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6139                 rex_sv = cur_eval->u.eval.prev_rex;
6140                 SET_reg_curpm(rex_sv);
6141                 rex = ReANY(rex_sv);
6142                 rexi = RXi_GET(rex);
6143                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6144
6145                 REGCP_SET(st->u.eval.lastcp);
6146
6147                 /* Restore parens of the outer rex without popping the
6148                  * savestack */
6149                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6150
6151                 st->u.eval.prev_eval = cur_eval;
6152                 cur_eval = cur_eval->u.eval.prev_eval;
6153                 DEBUG_EXECUTE_r(
6154                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6155                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6156                 if ( nochange_depth )
6157                     nochange_depth--;
6158
6159                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6160                                     locinput); /* match B */
6161             }
6162
6163             if (locinput < reginfo->till) {
6164                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6165                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6166                                       PL_colors[4],
6167                                       (long)(locinput - PL_reg_starttry),
6168                                       (long)(reginfo->till - PL_reg_starttry),
6169                                       PL_colors[5]));
6170
6171                 sayNO_SILENT;           /* Cannot match: too short. */
6172             }
6173             sayYES;                     /* Success! */
6174
6175         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6176             DEBUG_EXECUTE_r(
6177             PerlIO_printf(Perl_debug_log,
6178                 "%*s  %ssubpattern success...%s\n",
6179                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6180             sayYES;                     /* Success! */
6181
6182 #undef  ST
6183 #define ST st->u.ifmatch
6184
6185         {
6186             char *newstart;
6187
6188         case SUSPEND:   /* (?>A) */
6189             ST.wanted = 1;
6190             newstart = locinput;
6191             goto do_ifmatch;
6192
6193         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6194             ST.wanted = 0;
6195             goto ifmatch_trivial_fail_test;
6196
6197         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6198             ST.wanted = 1;
6199           ifmatch_trivial_fail_test:
6200             if (scan->flags) {
6201                 char * const s = HOPBACKc(locinput, scan->flags);
6202                 if (!s) {
6203                     /* trivial fail */
6204                     if (logical) {
6205                         logical = 0;
6206                         sw = 1 - cBOOL(ST.wanted);
6207                     }
6208                     else if (ST.wanted)
6209                         sayNO;
6210                     next = scan + ARG(scan);
6211                     if (next == scan)
6212                         next = NULL;
6213                     break;
6214                 }
6215                 newstart = s;
6216             }
6217             else
6218                 newstart = locinput;
6219
6220           do_ifmatch:
6221             ST.me = scan;
6222             ST.logical = logical;
6223             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6224
6225             /* execute body of (?...A) */
6226             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6227             assert(0); /* NOTREACHED */
6228         }
6229
6230         case IFMATCH_A_fail: /* body of (?...A) failed */
6231             ST.wanted = !ST.wanted;
6232             /* FALL THROUGH */
6233
6234         case IFMATCH_A: /* body of (?...A) succeeded */
6235             if (ST.logical) {
6236                 sw = cBOOL(ST.wanted);
6237             }
6238             else if (!ST.wanted)
6239                 sayNO;
6240
6241             if (OP(ST.me) != SUSPEND) {
6242                 /* restore old position except for (?>...) */
6243                 locinput = st->locinput;
6244             }
6245             scan = ST.me + ARG(ST.me);
6246             if (scan == ST.me)
6247                 scan = NULL;
6248             continue; /* execute B */
6249
6250 #undef ST
6251
6252         case LONGJMP: /*  alternative with many branches compiles to
6253                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6254             next = scan + ARG(scan);
6255             if (next == scan)
6256                 next = NULL;
6257             break;
6258
6259         case COMMIT:  /*  (*COMMIT)  */
6260             reginfo->cutpoint = PL_regeol;
6261             /* FALLTHROUGH */
6262
6263         case PRUNE:   /*  (*PRUNE)   */
6264             if (!scan->flags)
6265                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6266             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6267             assert(0); /* NOTREACHED */
6268
6269         case COMMIT_next_fail:
6270             no_final = 1;
6271             /* FALLTHROUGH */
6272
6273         case OPFAIL:   /* (*FAIL)  */
6274             sayNO;
6275             assert(0); /* NOTREACHED */
6276
6277 #define ST st->u.mark
6278         case MARKPOINT: /*  (*MARK:foo)  */
6279             ST.prev_mark = mark_state;
6280             ST.mark_name = sv_commit = sv_yes_mark
6281                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6282             mark_state = st;
6283             ST.mark_loc = locinput;
6284             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6285             assert(0); /* NOTREACHED */
6286
6287         case MARKPOINT_next:
6288             mark_state = ST.prev_mark;
6289             sayYES;
6290             assert(0); /* NOTREACHED */
6291
6292         case MARKPOINT_next_fail:
6293             if (popmark && sv_eq(ST.mark_name,popmark))
6294             {
6295                 if (ST.mark_loc > startpoint)
6296                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6297                 popmark = NULL; /* we found our mark */
6298                 sv_commit = ST.mark_name;
6299
6300                 DEBUG_EXECUTE_r({
6301                         PerlIO_printf(Perl_debug_log,
6302                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6303                             REPORT_CODE_OFF+depth*2, "",
6304                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6305                 });
6306             }
6307             mark_state = ST.prev_mark;
6308             sv_yes_mark = mark_state ?
6309                 mark_state->u.mark.mark_name : NULL;
6310             sayNO;
6311             assert(0); /* NOTREACHED */
6312
6313         case SKIP:  /*  (*SKIP)  */
6314             if (scan->flags) {
6315                 /* (*SKIP) : if we fail we cut here*/
6316                 ST.mark_name = NULL;
6317                 ST.mark_loc = locinput;
6318                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6319             } else {
6320                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6321                    otherwise do nothing.  Meaning we need to scan
6322                  */
6323                 regmatch_state *cur = mark_state;
6324                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6325
6326                 while (cur) {
6327                     if ( sv_eq( cur->u.mark.mark_name,
6328                                 find ) )
6329                     {
6330                         ST.mark_name = find;
6331                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6332                     }
6333                     cur = cur->u.mark.prev_mark;
6334                 }
6335             }
6336             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6337             break;
6338
6339         case SKIP_next_fail:
6340             if (ST.mark_name) {
6341                 /* (*CUT:NAME) - Set up to search for the name as we
6342                    collapse the stack*/
6343                 popmark = ST.mark_name;
6344             } else {
6345                 /* (*CUT) - No name, we cut here.*/
6346                 if (ST.mark_loc > startpoint)
6347                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6348                 /* but we set sv_commit to latest mark_name if there
6349                    is one so they can test to see how things lead to this
6350                    cut */
6351                 if (mark_state)
6352                     sv_commit=mark_state->u.mark.mark_name;
6353             }
6354             no_final = 1;
6355             sayNO;
6356             assert(0); /* NOTREACHED */
6357 #undef ST
6358
6359         case LNBREAK: /* \R */
6360             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6361                 locinput += n;
6362             } else
6363                 sayNO;
6364             break;
6365
6366 #define CASE_CLASS(nAmE)                              \
6367         case nAmE:                                    \
6368             if (NEXTCHR_IS_EOS)                       \
6369                 sayNO;                                \
6370             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6371                 locinput += n;                        \
6372             } else                                    \
6373                 sayNO;                                \
6374             break;                                    \
6375         case N##nAmE:                                 \
6376             if (NEXTCHR_IS_EOS)                       \
6377                 sayNO;                                \
6378             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6379                 sayNO;                                \
6380             } else {                                  \
6381                 locinput += UTF8SKIP(locinput);       \
6382             }                                         \
6383             break
6384
6385         CASE_CLASS(VERTWS);  /*  \v \V  */
6386         CASE_CLASS(HORIZWS); /*  \h \H  */
6387 #undef CASE_CLASS
6388
6389         default:
6390             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6391                           PTR2UV(scan), OP(scan));
6392             Perl_croak(aTHX_ "regexp memory corruption");
6393
6394         /* this is a point to jump to in order to increment
6395          * locinput by one character */
6396         increment_locinput:
6397             assert(!NEXTCHR_IS_EOS);
6398             if (utf8_target) {
6399                 locinput += PL_utf8skip[nextchr];
6400                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6401                 if (locinput > PL_regeol)
6402                     sayNO;
6403             }
6404             else
6405                 locinput++;
6406             break;
6407
6408         } /* end switch */
6409
6410         /* switch break jumps here */
6411         scan = next; /* prepare to execute the next op and ... */
6412         continue;    /* ... jump back to the top, reusing st */
6413         assert(0); /* NOTREACHED */
6414
6415       push_yes_state:
6416         /* push a state that backtracks on success */
6417         st->u.yes.prev_yes_state = yes_state;
6418         yes_state = st;
6419         /* FALL THROUGH */
6420       push_state:
6421         /* push a new regex state, then continue at scan  */
6422         {
6423             regmatch_state *newst;
6424
6425             DEBUG_STACK_r({
6426                 regmatch_state *cur = st;
6427                 regmatch_state *curyes = yes_state;
6428                 int curd = depth;
6429                 regmatch_slab *slab = PL_regmatch_slab;
6430                 for (;curd > -1;cur--,curd--) {
6431                     if (cur < SLAB_FIRST(slab)) {
6432                         slab = slab->prev;
6433                         cur = SLAB_LAST(slab);
6434                     }
6435                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6436                         REPORT_CODE_OFF + 2 + depth * 2,"",
6437                         curd, PL_reg_name[cur->resume_state],
6438                         (curyes == cur) ? "yes" : ""
6439                     );
6440                     if (curyes == cur)
6441                         curyes = cur->u.yes.prev_yes_state;
6442                 }
6443             } else
6444                 DEBUG_STATE_pp("push")
6445             );
6446             depth++;
6447             st->locinput = locinput;
6448             newst = st+1;
6449             if (newst >  SLAB_LAST(PL_regmatch_slab))
6450                 newst = S_push_slab(aTHX);
6451             PL_regmatch_state = newst;
6452
6453             locinput = pushinput;
6454             st = newst;
6455             continue;
6456             assert(0); /* NOTREACHED */
6457         }
6458     }
6459
6460     /*
6461     * We get here only if there's trouble -- normally "case END" is
6462     * the terminating point.
6463     */
6464     Perl_croak(aTHX_ "corrupted regexp pointers");
6465     /*NOTREACHED*/
6466     sayNO;
6467
6468 yes:
6469     if (yes_state) {
6470         /* we have successfully completed a subexpression, but we must now
6471          * pop to the state marked by yes_state and continue from there */
6472         assert(st != yes_state);
6473 #ifdef DEBUGGING
6474         while (st != yes_state) {
6475             st--;
6476             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6477                 PL_regmatch_slab = PL_regmatch_slab->prev;
6478                 st = SLAB_LAST(PL_regmatch_slab);
6479             }
6480             DEBUG_STATE_r({
6481                 if (no_final) {
6482                     DEBUG_STATE_pp("pop (no final)");
6483                 } else {
6484                     DEBUG_STATE_pp("pop (yes)");
6485                 }
6486             });
6487             depth--;
6488         }
6489 #else
6490         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6491             || yes_state > SLAB_LAST(PL_regmatch_slab))
6492         {
6493             /* not in this slab, pop slab */
6494             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6495             PL_regmatch_slab = PL_regmatch_slab->prev;
6496             st = SLAB_LAST(PL_regmatch_slab);
6497         }
6498         depth -= (st - yes_state);
6499 #endif
6500         st = yes_state;
6501         yes_state = st->u.yes.prev_yes_state;
6502         PL_regmatch_state = st;
6503
6504         if (no_final)
6505             locinput= st->locinput;
6506         state_num = st->resume_state + no_final;
6507         goto reenter_switch;
6508     }
6509
6510     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6511                           PL_colors[4], PL_colors[5]));
6512
6513     if (PL_reg_state.re_state_eval_setup_done) {
6514         /* each successfully executed (?{...}) block does the equivalent of
6515          *   local $^R = do {...}
6516          * When popping the save stack, all these locals would be undone;
6517          * bypass this by setting the outermost saved $^R to the latest
6518          * value */
6519         if (oreplsv != GvSV(PL_replgv))
6520             sv_setsv(oreplsv, GvSV(PL_replgv));
6521     }
6522     result = 1;
6523     goto final_exit;
6524
6525 no:
6526     DEBUG_EXECUTE_r(
6527         PerlIO_printf(Perl_debug_log,
6528             "%*s  %sfailed...%s\n",
6529             REPORT_CODE_OFF+depth*2, "",
6530             PL_colors[4], PL_colors[5])
6531         );
6532
6533 no_silent:
6534     if (no_final) {
6535         if (yes_state) {
6536             goto yes;
6537         } else {
6538             goto final_exit;
6539         }
6540     }
6541     if (depth) {
6542         /* there's a previous state to backtrack to */
6543         st--;
6544         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6545             PL_regmatch_slab = PL_regmatch_slab->prev;
6546             st = SLAB_LAST(PL_regmatch_slab);
6547         }
6548         PL_regmatch_state = st;
6549         locinput= st->locinput;
6550
6551         DEBUG_STATE_pp("pop");
6552         depth--;
6553         if (yes_state == st)
6554             yes_state = st->u.yes.prev_yes_state;
6555
6556         state_num = st->resume_state + 1; /* failure = success + 1 */
6557         goto reenter_switch;
6558     }
6559     result = 0;
6560
6561   final_exit:
6562     if (rex->intflags & PREGf_VERBARG_SEEN) {
6563         SV *sv_err = get_sv("REGERROR", 1);
6564         SV *sv_mrk = get_sv("REGMARK", 1);
6565         if (result) {
6566             sv_commit = &PL_sv_no;
6567             if (!sv_yes_mark)
6568                 sv_yes_mark = &PL_sv_yes;
6569         } else {
6570             if (!sv_commit)
6571                 sv_commit = &PL_sv_yes;
6572             sv_yes_mark = &PL_sv_no;
6573         }
6574         sv_setsv(sv_err, sv_commit);
6575         sv_setsv(sv_mrk, sv_yes_mark);
6576     }
6577
6578
6579     if (last_pushed_cv) {
6580         dSP;
6581         POP_MULTICALL;
6582         PERL_UNUSED_VAR(SP);
6583     }
6584
6585     /* clean up; in particular, free all slabs above current one */
6586     LEAVE_SCOPE(oldsave);
6587
6588     assert(!result ||  locinput - PL_bostr >= 0);
6589     return result ?  locinput - PL_bostr : -1;
6590 }
6591
6592 /*
6593  - regrepeat - repeatedly match something simple, report how many
6594  *
6595  * What 'simple' means is a node which can be the operand of a quantifier like
6596  * '+', or {1,3}
6597  *
6598  * startposp - pointer a pointer to the start position.  This is updated
6599  *             to point to the byte following the highest successful
6600  *             match.
6601  * p         - the regnode to be repeatedly matched against.
6602  * max       - maximum number of things to match.
6603  * depth     - (for debugging) backtracking depth.
6604  */
6605 STATIC I32
6606 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6607 {
6608     dVAR;
6609     char *scan;     /* Pointer to current position in target string */
6610     I32 c;
6611     char *loceol = PL_regeol;   /* local version */
6612     I32 hardcount = 0;  /* How many matches so far */
6613     bool utf8_target = PL_reg_match_utf8;
6614     UV utf8_flags;
6615 #ifndef DEBUGGING
6616     PERL_UNUSED_ARG(depth);
6617 #endif
6618
6619     PERL_ARGS_ASSERT_REGREPEAT;
6620
6621     scan = *startposp;
6622     if (max == REG_INFTY)
6623         max = I32_MAX;
6624     else if (! utf8_target && scan + max < loceol)
6625         loceol = scan + max;
6626
6627     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6628      * to the maximum of how far we should go in it (leaving it set to the real
6629      * end, if the maximum permissible would take us beyond that).  This allows
6630      * us to make the loop exit condition that we haven't gone past <loceol> to
6631      * also mean that we haven't exceeded the max permissible count, saving a
6632      * test each time through the loop.  But it assumes that the OP matches a
6633      * single byte, which is true for most of the OPs below when applied to a
6634      * non-UTF-8 target.  Those relatively few OPs that don't have this
6635      * characteristic will have to compensate.
6636      *
6637      * There is no adjustment for UTF-8 targets, as the number of bytes per
6638      * character varies.  OPs will have to test both that the count is less
6639      * than the max permissible (using <hardcount> to keep track), and that we
6640      * are still within the bounds of the string (using <loceol>.  A few OPs
6641      * match a single byte no matter what the encoding.  They can omit the max
6642      * test if, for the UTF-8 case, they do the adjustment that was skipped
6643      * above.
6644      *
6645      * Thus, the code above sets things up for the common case; and exceptional
6646      * cases need extra work; the common case is to make sure <scan> doesn't
6647      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6648      * count doesn't exceed the maximum permissible */
6649
6650     switch (OP(p)) {
6651     case REG_ANY:
6652         if (utf8_target) {
6653             while (scan < loceol && hardcount < max && *scan != '\n') {
6654                 scan += UTF8SKIP(scan);
6655                 hardcount++;
6656             }
6657         } else {
6658             while (scan < loceol && *scan != '\n')
6659                 scan++;
6660         }
6661         break;
6662     case SANY:
6663         if (utf8_target) {
6664             while (scan < loceol && hardcount < max) {
6665                 scan += UTF8SKIP(scan);
6666                 hardcount++;
6667             }
6668         }
6669         else
6670             scan = loceol;
6671         break;
6672     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6673         if (utf8_target && scan + max < loceol) {
6674
6675             /* <loceol> hadn't been adjusted in the UTF-8 case */
6676             scan +=  max;
6677         }
6678         else {
6679             scan = loceol;
6680         }
6681         break;
6682     case EXACT:
6683         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6684
6685         c = (U8)*STRING(p);
6686
6687         /* Can use a simple loop if the pattern char to match on is invariant
6688          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6689          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6690          * true iff it doesn't matter if the argument is in UTF-8 or not */
6691         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6692             if (utf8_target && scan + max < loceol) {
6693                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6694                  * since here, to match at all, 1 char == 1 byte */
6695                 loceol = scan + max;
6696             }
6697             while (scan < loceol && UCHARAT(scan) == c) {
6698                 scan++;
6699             }
6700         }
6701         else if (UTF_PATTERN) {
6702             if (utf8_target) {
6703                 STRLEN scan_char_len;
6704
6705                 /* When both target and pattern are UTF-8, we have to do
6706                  * string EQ */
6707                 while (hardcount < max
6708                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6709                        && scan_char_len <= STR_LEN(p)
6710                        && memEQ(scan, STRING(p), scan_char_len))
6711                 {
6712                     scan += scan_char_len;
6713                     hardcount++;
6714                 }
6715             }
6716             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6717
6718                 /* Target isn't utf8; convert the character in the UTF-8
6719                  * pattern to non-UTF8, and do a simple loop */
6720                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6721                 while (scan < loceol && UCHARAT(scan) == c) {
6722                     scan++;
6723                 }
6724             } /* else pattern char is above Latin1, can't possibly match the
6725                  non-UTF-8 target */
6726         }
6727         else {
6728
6729             /* Here, the string must be utf8; pattern isn't, and <c> is
6730              * different in utf8 than not, so can't compare them directly.
6731              * Outside the loop, find the two utf8 bytes that represent c, and
6732              * then look for those in sequence in the utf8 string */
6733             U8 high = UTF8_TWO_BYTE_HI(c);
6734             U8 low = UTF8_TWO_BYTE_LO(c);
6735
6736             while (hardcount < max
6737                     && scan + 1 < loceol
6738                     && UCHARAT(scan) == high
6739                     && UCHARAT(scan + 1) == low)
6740             {
6741                 scan += 2;
6742                 hardcount++;
6743             }
6744         }
6745         break;
6746
6747     case EXACTFA:
6748         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6749         goto do_exactf;
6750
6751     case EXACTFL:
6752         PL_reg_flags |= RF_tainted;
6753         utf8_flags = FOLDEQ_UTF8_LOCALE;
6754         goto do_exactf;
6755
6756     case EXACTF:
6757             utf8_flags = 0;
6758             goto do_exactf;
6759
6760     case EXACTFU_SS:
6761     case EXACTFU_TRICKYFOLD:
6762     case EXACTFU:
6763         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6764
6765     do_exactf: {
6766         int c1, c2;
6767         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6768
6769         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6770
6771         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6772             if (c1 == CHRTEST_VOID) {
6773                 /* Use full Unicode fold matching */
6774                 char *tmpeol = PL_regeol;
6775                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6776                 while (hardcount < max
6777                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6778                                              STRING(p), NULL, pat_len,
6779                                              cBOOL(UTF_PATTERN), utf8_flags))
6780                 {
6781                     scan = tmpeol;
6782                     tmpeol = PL_regeol;
6783                     hardcount++;
6784                 }
6785             }
6786             else if (utf8_target) {
6787                 if (c1 == c2) {
6788                     while (scan < loceol
6789                            && hardcount < max
6790                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6791                     {
6792                         scan += UTF8SKIP(scan);
6793                         hardcount++;
6794                     }
6795                 }
6796                 else {
6797                     while (scan < loceol
6798                            && hardcount < max
6799                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6800                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6801                     {
6802                         scan += UTF8SKIP(scan);
6803                         hardcount++;
6804                     }
6805                 }
6806             }
6807             else if (c1 == c2) {
6808                 while (scan < loceol && UCHARAT(scan) == c1) {
6809                     scan++;
6810                 }
6811             }
6812             else {
6813                 while (scan < loceol &&
6814                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6815                 {
6816                     scan++;
6817                 }
6818             }
6819         }
6820         break;
6821     }
6822     case ANYOF:
6823         if (utf8_target) {
6824             STRLEN inclasslen;
6825             while (hardcount < max
6826                    && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6827                    && reginclass(prog, p, (U8*)scan, utf8_target))
6828             {
6829                 scan += inclasslen;
6830                 hardcount++;
6831             }
6832         } else {
6833             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6834                 scan++;
6835         }
6836         break;
6837     case ALNUMU:
6838         if (utf8_target) {
6839     utf8_wordchar:
6840             LOAD_UTF8_CHARCLASS_ALNUM();
6841             while (hardcount < max && scan < loceol &&
6842                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6843             {
6844                 scan += UTF8SKIP(scan);
6845                 hardcount++;
6846             }
6847         } else {
6848             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6849                 scan++;
6850             }
6851         }
6852         break;
6853     case ALNUM:
6854         if (utf8_target)
6855             goto utf8_wordchar;
6856         while (scan < loceol && isALNUM((U8) *scan)) {
6857             scan++;
6858         }
6859         break;
6860     case ALNUMA:
6861         if (utf8_target && scan + max < loceol) {
6862
6863             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6864              * since here, to match, 1 char == 1 byte */
6865             loceol = scan + max;
6866         }
6867         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6868             scan++;
6869         }
6870         break;
6871     case ALNUML:
6872         PL_reg_flags |= RF_tainted;
6873         if (utf8_target) {
6874             while (hardcount < max && scan < loceol &&
6875                    isALNUM_LC_utf8((U8*)scan)) {
6876                 scan += UTF8SKIP(scan);
6877                 hardcount++;
6878             }
6879         } else {
6880             while (scan < loceol && isALNUM_LC(*scan))
6881                 scan++;
6882         }
6883         break;
6884     case NALNUMU:
6885         if (utf8_target) {
6886
6887     utf8_Nwordchar:
6888
6889             LOAD_UTF8_CHARCLASS_ALNUM();
6890             while (hardcount < max && scan < loceol &&
6891                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6892             {
6893                 scan += UTF8SKIP(scan);
6894                 hardcount++;
6895             }
6896         } else {
6897             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6898                 scan++;
6899             }
6900         }
6901         break;
6902     case NALNUM:
6903         if (utf8_target)
6904             goto utf8_Nwordchar;
6905         while (scan < loceol && ! isALNUM((U8) *scan)) {
6906             scan++;
6907         }
6908         break;
6909
6910     case POSIXA:
6911         if (utf8_target && scan + max < loceol) {
6912
6913             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6914              * since here, to match, 1 char == 1 byte */
6915             loceol = scan + max;
6916         }
6917         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6918             scan++;
6919         }
6920         break;
6921     case NPOSIXA:
6922         if (utf8_target) {
6923             while (scan < loceol && hardcount < max
6924                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6925             {
6926                 scan += UTF8SKIP(scan);
6927                 hardcount++;
6928             }
6929         }
6930         else {
6931             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6932                 scan++;
6933             }
6934         }
6935         break;
6936     case NALNUMA:
6937         if (utf8_target) {
6938             while (scan < loceol && hardcount < max
6939                    && ! isWORDCHAR_A((U8) *scan))
6940             {
6941                 scan += UTF8SKIP(scan);
6942                 hardcount++;
6943             }
6944         }
6945         else {
6946             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6947                 scan++;
6948             }
6949         }
6950         break;
6951     case NALNUML:
6952         PL_reg_flags |= RF_tainted;
6953         if (utf8_target) {
6954             while (hardcount < max && scan < loceol &&
6955                    !isALNUM_LC_utf8((U8*)scan)) {
6956                 scan += UTF8SKIP(scan);
6957                 hardcount++;
6958             }
6959         } else {
6960             while (scan < loceol && !isALNUM_LC(*scan))
6961                 scan++;
6962         }
6963         break;
6964     case SPACEU:
6965         if (utf8_target) {
6966
6967     utf8_space:
6968
6969             while (hardcount < max && scan < loceol
6970                    && is_XPERLSPACE_utf8((U8*)scan))
6971             {
6972                 scan += UTF8SKIP(scan);
6973                 hardcount++;
6974             }
6975             break;
6976         }
6977         else {
6978             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6979                 scan++;
6980             }
6981             break;
6982         }
6983     case SPACE:
6984         if (utf8_target)
6985             goto utf8_space;
6986
6987         while (scan < loceol && isSPACE((U8) *scan)) {
6988             scan++;
6989         }
6990         break;
6991     case SPACEA:
6992         if (utf8_target && scan + max < loceol) {
6993
6994             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6995              * since here, to match, 1 char == 1 byte */
6996             loceol = scan + max;
6997         }
6998         while (scan < loceol && isSPACE_A((U8) *scan)) {
6999             scan++;
7000         }
7001         break;
7002     case SPACEL:
7003         PL_reg_flags |= RF_tainted;
7004         if (utf8_target) {
7005             while (hardcount < max && scan < loceol &&
7006                    isSPACE_LC_utf8((U8*)scan)) {
7007                 scan += UTF8SKIP(scan);
7008                 hardcount++;
7009             }
7010         } else {
7011             while (scan < loceol && isSPACE_LC(*scan))
7012                 scan++;
7013         }
7014         break;
7015     case NSPACEU:
7016         if (utf8_target) {
7017
7018     utf8_Nspace:
7019
7020             while (hardcount < max && scan < loceol
7021                    && ! is_XPERLSPACE_utf8((U8*)scan))
7022             {
7023                 scan += UTF8SKIP(scan);
7024                 hardcount++;
7025             }
7026             break;
7027         }
7028         else {
7029             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7030                 scan++;
7031             }
7032         }
7033         break;
7034     case NSPACE:
7035         if (utf8_target)
7036             goto utf8_Nspace;
7037
7038         while (scan < loceol && ! isSPACE((U8) *scan)) {
7039             scan++;
7040         }
7041         break;
7042     case NSPACEA:
7043         if (utf8_target) {
7044             while (hardcount < max && scan < loceol
7045                    && ! isSPACE_A((U8) *scan))
7046             {
7047                 scan += UTF8SKIP(scan);
7048                 hardcount++;
7049             }
7050         }
7051         else {
7052             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7053                 scan++;
7054             }
7055         }
7056         break;
7057     case NSPACEL:
7058         PL_reg_flags |= RF_tainted;
7059         if (utf8_target) {
7060             while (hardcount < max && scan < loceol &&
7061                    !isSPACE_LC_utf8((U8*)scan)) {
7062                 scan += UTF8SKIP(scan);
7063                 hardcount++;
7064             }
7065         } else {
7066             while (scan < loceol && !isSPACE_LC(*scan))
7067                 scan++;
7068         }
7069         break;
7070     case DIGIT:
7071         if (utf8_target) {
7072             LOAD_UTF8_CHARCLASS_DIGIT();
7073             while (hardcount < max && scan < loceol &&
7074                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7075                 scan += UTF8SKIP(scan);
7076                 hardcount++;
7077             }
7078         } else {
7079             while (scan < loceol && isDIGIT(*scan))
7080                 scan++;
7081         }
7082         break;
7083     case DIGITA:
7084         if (utf8_target && scan + max < loceol) {
7085
7086             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7087              * since here, to match, 1 char == 1 byte */
7088             loceol = scan + max;
7089         }
7090         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7091             scan++;
7092         }
7093         break;
7094     case DIGITL:
7095         PL_reg_flags |= RF_tainted;
7096         if (utf8_target) {
7097             while (hardcount < max && scan < loceol &&
7098                    isDIGIT_LC_utf8((U8*)scan)) {
7099                 scan += UTF8SKIP(scan);
7100                 hardcount++;
7101             }
7102         } else {
7103             while (scan < loceol && isDIGIT_LC(*scan))
7104                 scan++;
7105         }
7106         break;
7107     case NDIGIT:
7108         if (utf8_target) {
7109             LOAD_UTF8_CHARCLASS_DIGIT();
7110             while (hardcount < max && scan < loceol &&
7111                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7112                 scan += UTF8SKIP(scan);
7113                 hardcount++;
7114             }
7115         } else {
7116             while (scan < loceol && !isDIGIT(*scan))
7117                 scan++;
7118         }
7119         break;
7120     case NDIGITA:
7121         if (utf8_target) {
7122             while (hardcount < max && scan < loceol
7123                    && ! isDIGIT_A((U8) *scan)) {
7124                 scan += UTF8SKIP(scan);
7125                 hardcount++;
7126             }
7127         }
7128         else {
7129             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7130                 scan++;
7131             }
7132         }
7133         break;
7134     case NDIGITL:
7135         PL_reg_flags |= RF_tainted;
7136         if (utf8_target) {
7137             while (hardcount < max && scan < loceol &&
7138                    !isDIGIT_LC_utf8((U8*)scan)) {
7139                 scan += UTF8SKIP(scan);
7140                 hardcount++;
7141             }
7142         } else {
7143             while (scan < loceol && !isDIGIT_LC(*scan))
7144                 scan++;
7145         }
7146         break;
7147     case LNBREAK:
7148         if (utf8_target) {
7149             while (hardcount < max && scan < loceol &&
7150                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7151                 scan += c;
7152                 hardcount++;
7153             }
7154         } else {
7155             /* LNBREAK can match one or two latin chars, which is ok, but we
7156              * have to use hardcount in this situation, and throw away the
7157              * adjustment to <loceol> done before the switch statement */
7158             loceol = PL_regeol;
7159             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7160                 scan+=c;
7161                 hardcount++;
7162             }
7163         }
7164         break;
7165     case HORIZWS:
7166         if (utf8_target) {
7167             while (hardcount < max && scan < loceol &&
7168                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7169             {
7170                 scan += c;
7171                 hardcount++;
7172             }
7173         } else {
7174             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7175                 scan++;
7176         }
7177         break;
7178     case NHORIZWS:
7179         if (utf8_target) {
7180             while (hardcount < max && scan < loceol &&
7181                         !is_HORIZWS_utf8_safe(scan, loceol))
7182             {
7183                 scan += UTF8SKIP(scan);
7184                 hardcount++;
7185             }
7186         } else {
7187             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7188                 scan++;
7189
7190         }
7191         break;
7192     case VERTWS:
7193         if (utf8_target) {
7194             while (hardcount < max && scan < loceol &&
7195                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7196             {
7197                 scan += c;
7198                 hardcount++;
7199             }
7200         } else {
7201             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7202                 scan++;
7203
7204         }
7205         break;
7206     case NVERTWS:
7207         if (utf8_target) {
7208             while (hardcount < max && scan < loceol &&
7209                                 !is_VERTWS_utf8_safe(scan, loceol))
7210             {
7211                 scan += UTF8SKIP(scan);
7212                 hardcount++;
7213             }
7214         } else {
7215             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7216                 scan++;
7217
7218         }
7219         break;
7220
7221     case BOUND:
7222     case BOUNDA:
7223     case BOUNDL:
7224     case BOUNDU:
7225     case EOS:
7226     case GPOS:
7227     case KEEPS:
7228     case NBOUND:
7229     case NBOUNDA:
7230     case NBOUNDL:
7231     case NBOUNDU:
7232     case OPFAIL:
7233     case SBOL:
7234     case SEOL:
7235         /* These are all 0 width, so match right here or not at all. */
7236         break;
7237
7238     default:
7239         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7240         assert(0); /* NOTREACHED */
7241
7242     }
7243
7244     if (hardcount)
7245         c = hardcount;
7246     else
7247         c = scan - *startposp;
7248     *startposp = scan;
7249
7250     DEBUG_r({
7251         GET_RE_DEBUG_FLAGS_DECL;
7252         DEBUG_EXECUTE_r({
7253             SV * const prop = sv_newmortal();
7254             regprop(prog, prop, p);
7255             PerlIO_printf(Perl_debug_log,
7256                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7257                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7258         });
7259     });
7260
7261     return(c);
7262 }
7263
7264
7265 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7266 /*
7267 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7268 create a copy so that changes the caller makes won't change the shared one.
7269 If <altsvp> is non-null, will return NULL in it, for back-compat.
7270  */
7271 SV *
7272 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7273 {
7274     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7275
7276     if (altsvp) {
7277         *altsvp = NULL;
7278     }
7279
7280     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7281 }
7282 #endif
7283
7284 STATIC SV *
7285 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7286 {
7287     /* Returns the swash for the input 'node' in the regex 'prog'.
7288      * If <doinit> is true, will attempt to create the swash if not already
7289      *    done.
7290      * If <listsvp> is non-null, will return the swash initialization string in
7291      *    it.
7292      * Tied intimately to how regcomp.c sets up the data structure */
7293
7294     dVAR;
7295     SV *sw  = NULL;
7296     SV *si  = NULL;
7297     SV*  invlist = NULL;
7298
7299     RXi_GET_DECL(prog,progi);
7300     const struct reg_data * const data = prog ? progi->data : NULL;
7301
7302     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7303
7304     assert(ANYOF_NONBITMAP(node));
7305
7306     if (data && data->count) {
7307         const U32 n = ARG(node);
7308
7309         if (data->what[n] == 's') {
7310             SV * const rv = MUTABLE_SV(data->data[n]);
7311             AV * const av = MUTABLE_AV(SvRV(rv));
7312             SV **const ary = AvARRAY(av);
7313             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7314
7315             si = *ary;  /* ary[0] = the string to initialize the swash with */
7316
7317             /* Elements 2 and 3 are either both present or both absent. [2] is
7318              * any inversion list generated at compile time; [3] indicates if
7319              * that inversion list has any user-defined properties in it. */
7320             if (av_len(av) >= 2) {
7321                 invlist = ary[2];
7322                 if (SvUV(ary[3])) {
7323                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7324                 }
7325             }
7326             else {
7327                 invlist = NULL;
7328             }
7329
7330             /* Element [1] is reserved for the set-up swash.  If already there,
7331              * return it; if not, create it and store it there */
7332             if (SvROK(ary[1])) {
7333                 sw = ary[1];
7334             }
7335             else if (si && doinit) {
7336
7337                 sw = _core_swash_init("utf8", /* the utf8 package */
7338                                       "", /* nameless */
7339                                       si,
7340                                       1, /* binary */
7341                                       0, /* not from tr/// */
7342                                       invlist,
7343                                       &swash_init_flags);
7344                 (void)av_store(av, 1, sw);
7345             }
7346         }
7347     }
7348
7349     if (listsvp) {
7350         SV* matches_string = newSVpvn("", 0);
7351
7352         /* Use the swash, if any, which has to have incorporated into it all
7353          * possibilities */
7354         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7355             && (si && si != &PL_sv_undef))
7356         {
7357
7358             /* If no swash, use the input initialization string, if available */
7359             sv_catsv(matches_string, si);
7360         }
7361
7362         /* Add the inversion list to whatever we have.  This may have come from
7363          * the swash, or from an input parameter */
7364         if (invlist) {
7365             sv_catsv(matches_string, _invlist_contents(invlist));
7366         }
7367         *listsvp = matches_string;
7368     }
7369
7370     return sw;
7371 }
7372
7373 /*
7374  - reginclass - determine if a character falls into a character class
7375
7376   n is the ANYOF regnode
7377   p is the target string
7378   utf8_target tells whether p is in UTF-8.
7379
7380   Returns true if matched; false otherwise.
7381
7382   Note that this can be a synthetic start class, a combination of various
7383   nodes, so things you think might be mutually exclusive, such as locale,
7384   aren't.  It can match both locale and non-locale
7385
7386  */
7387
7388 STATIC bool
7389 S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7390 {
7391     dVAR;
7392     const char flags = ANYOF_FLAGS(n);
7393     bool match = FALSE;
7394     UV c = *p;
7395
7396     PERL_ARGS_ASSERT_REGINCLASS;
7397
7398     /* If c is not already the code point, get it.  Note that
7399      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7400     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7401         STRLEN c_len = 0;
7402         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7403                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7404                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7405                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7406                  * UTF8_ALLOW_FFFF */
7407         if (c_len == (STRLEN)-1)
7408             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7409     }
7410
7411     /* If this character is potentially in the bitmap, check it */
7412     if (c < 256) {
7413         if (ANYOF_BITMAP_TEST(n, c))
7414             match = TRUE;
7415         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7416                 && ! utf8_target
7417                 && ! isASCII(c))
7418         {
7419             match = TRUE;
7420         }
7421         else if (flags & ANYOF_LOCALE) {
7422             PL_reg_flags |= RF_tainted;
7423
7424             if ((flags & ANYOF_LOC_FOLD)
7425                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7426             {
7427                 match = TRUE;
7428             }
7429             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7430                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7431                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7432                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7433                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7434                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7435                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7436                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7437                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7438                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7439                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7440                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7441                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7442                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7443                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7444                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7445                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7446                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7447                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7448                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7449                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7450                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7451                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7452                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7453                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7454                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7455                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7456                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7457                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7458                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7459                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7460                      ) /* How's that for a conditional? */
7461             ) {
7462                 match = TRUE;
7463             }
7464         }
7465     }
7466
7467     /* If the bitmap didn't (or couldn't) match, and something outside the
7468      * bitmap could match, try that.  Locale nodes specify completely the
7469      * behavior of code points in the bit map (otherwise, a utf8 target would
7470      * cause them to be treated as Unicode and not locale), except in
7471      * the very unlikely event when this node is a synthetic start class, which
7472      * could be a combination of locale and non-locale nodes.  So allow locale
7473      * to match for the synthetic start class, which will give a false
7474      * positive that will be resolved when the match is done again as not part
7475      * of the synthetic start class */
7476     if (!match) {
7477         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7478             match = TRUE;       /* Everything above 255 matches */
7479         }
7480         else if (ANYOF_NONBITMAP(n)
7481                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7482                      || (utf8_target
7483                          && (c >=256
7484                              || (! (flags & ANYOF_LOCALE))
7485                              || (flags & ANYOF_IS_SYNTHETIC)))))
7486         {
7487             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7488             if (sw) {
7489                 U8 * utf8_p;
7490                 if (utf8_target) {
7491                     utf8_p = (U8 *) p;
7492                 } else { /* Convert to utf8 */
7493                     STRLEN len = 1;
7494                     utf8_p = bytes_to_utf8(p, &len);
7495                 }
7496
7497                 if (swash_fetch(sw, utf8_p, TRUE)) {
7498                     match = TRUE;
7499                 }
7500
7501                 /* If we allocated a string above, free it */
7502                 if (! utf8_target) Safefree(utf8_p);
7503             }
7504         }
7505
7506         if (UNICODE_IS_SUPER(c)
7507             && (flags & ANYOF_WARN_SUPER)
7508             && ckWARN_d(WARN_NON_UNICODE))
7509         {
7510             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7511                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7512         }
7513     }
7514
7515     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7516     return cBOOL(flags & ANYOF_INVERT) ^ match;
7517 }
7518
7519 STATIC U8 *
7520 S_reghop3(U8 *s, I32 off, const U8* lim)
7521 {
7522     /* return the position 'off' UTF-8 characters away from 's', forward if
7523      * 'off' >= 0, backwards if negative.  But don't go outside of position
7524      * 'lim', which better be < s  if off < 0 */
7525
7526     dVAR;
7527
7528     PERL_ARGS_ASSERT_REGHOP3;
7529
7530     if (off >= 0) {
7531         while (off-- && s < lim) {
7532             /* XXX could check well-formedness here */
7533             s += UTF8SKIP(s);
7534         }
7535     }
7536     else {
7537         while (off++ && s > lim) {
7538             s--;
7539             if (UTF8_IS_CONTINUED(*s)) {
7540                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7541                     s--;
7542             }
7543             /* XXX could check well-formedness here */
7544         }
7545     }
7546     return s;
7547 }
7548
7549 #ifdef XXX_dmq
7550 /* there are a bunch of places where we use two reghop3's that should
7551    be replaced with this routine. but since thats not done yet
7552    we ifdef it out - dmq
7553 */
7554 STATIC U8 *
7555 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7556 {
7557     dVAR;
7558
7559     PERL_ARGS_ASSERT_REGHOP4;
7560
7561     if (off >= 0) {
7562         while (off-- && s < rlim) {
7563             /* XXX could check well-formedness here */
7564             s += UTF8SKIP(s);
7565         }
7566     }
7567     else {
7568         while (off++ && s > llim) {
7569             s--;
7570             if (UTF8_IS_CONTINUED(*s)) {
7571                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7572                     s--;
7573             }
7574             /* XXX could check well-formedness here */
7575         }
7576     }
7577     return s;
7578 }
7579 #endif
7580
7581 STATIC U8 *
7582 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7583 {
7584     dVAR;
7585
7586     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7587
7588     if (off >= 0) {
7589         while (off-- && s < lim) {
7590             /* XXX could check well-formedness here */
7591             s += UTF8SKIP(s);
7592         }
7593         if (off >= 0)
7594             return NULL;
7595     }
7596     else {
7597         while (off++ && s > lim) {
7598             s--;
7599             if (UTF8_IS_CONTINUED(*s)) {
7600                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7601                     s--;
7602             }
7603             /* XXX could check well-formedness here */
7604         }
7605         if (off <= 0)
7606             return NULL;
7607     }
7608     return s;
7609 }
7610
7611 static void
7612 restore_pos(pTHX_ void *arg)
7613 {
7614     dVAR;
7615     regexp * const rex = (regexp *)arg;
7616     if (PL_reg_state.re_state_eval_setup_done) {
7617         if (PL_reg_oldsaved) {
7618             rex->subbeg = PL_reg_oldsaved;
7619             rex->sublen = PL_reg_oldsavedlen;
7620             rex->suboffset = PL_reg_oldsavedoffset;
7621             rex->subcoffset = PL_reg_oldsavedcoffset;
7622 #ifdef PERL_ANY_COW
7623             rex->saved_copy = PL_nrs;
7624 #endif
7625             RXp_MATCH_COPIED_on(rex);
7626         }
7627         PL_reg_magic->mg_len = PL_reg_oldpos;
7628         PL_reg_state.re_state_eval_setup_done = FALSE;
7629         PL_curpm = PL_reg_oldcurpm;
7630     }
7631 }
7632
7633 STATIC void
7634 S_to_utf8_substr(pTHX_ regexp *prog)
7635 {
7636     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7637      * on the converted value */
7638
7639     int i = 1;
7640
7641     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7642
7643     do {
7644         if (prog->substrs->data[i].substr
7645             && !prog->substrs->data[i].utf8_substr) {
7646             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7647             prog->substrs->data[i].utf8_substr = sv;
7648             sv_utf8_upgrade(sv);
7649             if (SvVALID(prog->substrs->data[i].substr)) {
7650                 if (SvTAIL(prog->substrs->data[i].substr)) {
7651                     /* Trim the trailing \n that fbm_compile added last
7652                        time.  */
7653                     SvCUR_set(sv, SvCUR(sv) - 1);
7654                     /* Whilst this makes the SV technically "invalid" (as its
7655                        buffer is no longer followed by "\0") when fbm_compile()
7656                        adds the "\n" back, a "\0" is restored.  */
7657                     fbm_compile(sv, FBMcf_TAIL);
7658                 } else
7659                     fbm_compile(sv, 0);
7660             }
7661             if (prog->substrs->data[i].substr == prog->check_substr)
7662                 prog->check_utf8 = sv;
7663         }
7664     } while (i--);
7665 }
7666
7667 STATIC bool
7668 S_to_byte_substr(pTHX_ regexp *prog)
7669 {
7670     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7671      * on the converted value; returns FALSE if can't be converted. */
7672
7673     dVAR;
7674     int i = 1;
7675
7676     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7677
7678     do {
7679         if (prog->substrs->data[i].utf8_substr
7680             && !prog->substrs->data[i].substr) {
7681             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7682             if (! sv_utf8_downgrade(sv, TRUE)) {
7683                 return FALSE;
7684             }
7685             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7686                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7687                     /* Trim the trailing \n that fbm_compile added last
7688                         time.  */
7689                     SvCUR_set(sv, SvCUR(sv) - 1);
7690                     fbm_compile(sv, FBMcf_TAIL);
7691                 } else
7692                     fbm_compile(sv, 0);
7693             }
7694             prog->substrs->data[i].substr = sv;
7695             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7696                 prog->check_substr = sv;
7697         }
7698     } while (i--);
7699
7700     return TRUE;
7701 }
7702
7703 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7704  * pre-composed Hangul syllables, which are all in a contiguous block and
7705  * arranged there in such a way so as to facilitate alorithmic determination of
7706  * their characteristics.  As such, they don't need a swash, but can be
7707  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7708  * is a GCB=LV */
7709 #define SBASE 0xAC00    /* Start of block */
7710 #define SCount 11172    /* Length of block */
7711 #define TCount 28
7712
7713 #if 0   /* This routine is not currently used */
7714 PERL_STATIC_INLINE bool
7715 S_is_utf8_X_LV(pTHX_ const U8 *p)
7716 {
7717     /* Unlike most other similarly named routines here, this does not create a
7718      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7719
7720     dVAR;
7721
7722     UV cp = valid_utf8_to_uvchr(p, NULL);
7723
7724     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7725
7726     /* The earliest Unicode releases did not have these precomposed Hangul
7727      * syllables.  Set to point to undef in that case, so will return false on
7728      * every call */
7729     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7730         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7731         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7732             SvREFCNT_dec(PL_utf8_X_LV);
7733             PL_utf8_X_LV = &PL_sv_undef;
7734         }
7735     }
7736
7737     return (PL_utf8_X_LV != &PL_sv_undef
7738             && cp >= SBASE && cp < SBASE + SCount
7739             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7740 }
7741 #endif
7742
7743 PERL_STATIC_INLINE bool
7744 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7745 {
7746     /* Unlike most other similarly named routines here, this does not create a
7747      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7748
7749     dVAR;
7750
7751     UV cp = valid_utf8_to_uvchr(p, NULL);
7752
7753     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7754
7755     /* The earliest Unicode releases did not have these precomposed Hangul
7756      * syllables.  Set to point to undef in that case, so will return false on
7757      * every call */
7758     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7759         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7760         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7761             SvREFCNT_dec(PL_utf8_X_LVT);
7762             PL_utf8_X_LVT = &PL_sv_undef;
7763         }
7764     }
7765
7766     return (PL_utf8_X_LVT != &PL_sv_undef
7767             && cp >= SBASE && cp < SBASE + SCount
7768             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7769 }
7770
7771 /*
7772  * Local variables:
7773  * c-indentation-style: bsd
7774  * c-basic-offset: 4
7775  * indent-tabs-mode: nil
7776  * End:
7777  *
7778  * ex: set ts=8 sts=4 sw=4 et:
7779  */