regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define RF_tainted      1       /* tainted information used? e.g. locale */
  97 #define RF_warned       2               /* warned about big count? */
  98
  99 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 100
 101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 102
 103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 104
 105 #ifndef STATIC
 106 #define STATIC  static
 107 #endif
 108
 109 /* Valid for non-utf8 strings: avoids the reginclass
 110  * call if there are no complications: i.e., if everything matchable is
 111  * straight forward in the bitmap */
 112 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 113                                               : ANYOF_BITMAP_TEST(p,*(c)))
 114
 115 /*
 116  * Forwards.
 117  */
 118
 119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 121
 122 #define HOPc(pos,off) \
 123         (char *)(PL_reg_match_utf8 \
 124             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 125             : (U8*)(pos + off))
 126 #define HOPBACKc(pos, off) \
 127         (char*)(PL_reg_match_utf8\
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 129             : (pos - off >= PL_bostr)           \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136
 137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 138 #define NEXTCHR_IS_EOS (nextchr < 0)
 139
 140 #define SET_nextchr \
 141     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 142
 143 #define SET_locinput(p) \
 144     locinput = (p);  \
 145     SET_nextchr
 146
 147
 148 /* these are unrolled below in the CCC_TRY_XXX defined */
 149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 150     if (!CAT2(PL_utf8_,class)) { \
 151         bool ok; \
 152         ENTER; save_re_context(); \
 153         ok=CAT2(is_utf8_,class)((const U8*)str); \
 154         PERL_UNUSED_VAR(ok); \
 155         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 156 /* Doesn't do an assert to verify that is correct */
 157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 158     if (!CAT2(PL_utf8_,class)) { \
 159         bool throw_away PERL_UNUSED_DECL; \
 160         ENTER; save_re_context(); \
 161         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 162         LEAVE; } } STMT_END
 163
 164 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 165 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 166 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 167
 168 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 169         /* No asserts are done for some of these, in case called on a   */  \
 170         /* Unicode version in which they map to nothing */                  \
 171         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 172         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 173
 174 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 175
 176 /* The actual code for CCC_TRY, which uses several variables from the routine
 177  * it's callable from.  It is designed to be the bulk of a case statement.
 178  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 179  *      nextchr matches the class.
 180  * UTF8_TEST is the whole test string to use for utf8 targets
 181  * LOAD is what to use to test, and if not present to load in the swash for the
 182  *      class
 183  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 184  *      UTF8_TEST test.
 185  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 186  * utf8 and a variant, load the swash if necessary and test using the utf8
 187  * test.  Advance to the next character if test is ok, otherwise fail; If not
 188  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 189  * fails, or advance to the next character */
 190
 191 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 192     if (NEXTCHR_IS_EOS) {                                              \
 193         sayNO;                                                                \
 194     }                                                                         \
 195     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 196         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 197         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 198             sayNO;                                                            \
 199         }                                                                     \
 200     }                                                                         \
 201     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 202             sayNO;                                                            \
 203     }                                                                         \
 204     goto increment_locinput;
 205
 206 /* Handle the non-locale cases for a character class and its complement.  It
 207  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 208  * This is because that code fails when the test succeeds, so we want to have
 209  * the test fail so that the code succeeds.  The swash is stored in a
 210  * predictable PL_ place */
 211 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 212                            CLASS, STR)                                        \
 213     case NAME:                                                                \
 214         _CCC_TRY_CODE( !, FUNC,                                               \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218     case NNAME:                                                               \
 219         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 220                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 221                                             (U8*)locinput, TRUE)),            \
 222                           CLASS, STR)                                         \
 223
 224 /* Generate the case statements for both locale and non-locale character
 225  * classes in regmatch for classes that don't have special unicode semantics.
 226  * Locales don't use an immediate swash, but an intermediary special locale
 227  * function that is called on the pointer to the current place in the input
 228  * string.  That function will resolve to needing the same swash.  One might
 229  * think that because we don't know what the locale will match, we shouldn't
 230  * check with the swash loading function that it loaded properly; ie, that we
 231  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 232  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 233  * irrelevant here */
 234 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 235                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 236                 NAMEA, NNAMEA, FUNCA,                                         \
 237                 CLASS, STR)                                                   \
 238     case NAMEL:                                                               \
 239         PL_reg_flags |= RF_tainted;                                           \
 240         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 241     case NNAMEL:                                                              \
 242         PL_reg_flags |= RF_tainted;                                           \
 243         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 244                        CLASS, STR)                                            \
 245     case NAMEA:                                                               \
 246         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 247             sayNO;                                                            \
 248         }                                                                     \
 249         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 250         locinput++;                                        \
 251         break;                                                                \
 252     case NNAMEA:                                                              \
 253         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 254             sayNO;                                                            \
 255         }                                                                     \
 256         goto increment_locinput;                                              \
 257     /* Generate the non-locale cases */                                       \
 258     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 259
 260 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 261  * statements to handle separate Unicode semantics nodes */
 262 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 263                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 264                   NAMEU, NNAMEU, FUNCU,                                        \
 265                   NAMEA, NNAMEA, FUNCA,                                        \
 266                   CLASS, STR)                                                  \
 267     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 268             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 269             NAMEA, NNAMEA, FUNCA,                                              \
 270             CLASS, STR)                                                        \
 271     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 272
 273 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 274
 275 /* for use after a quantifier and before an EXACT-like node -- japhy */
 276 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 277  *
 278  * NOTE that *nothing* that affects backtracking should be in here, specifically
 279  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 280  * node that is in between two EXACT like nodes when ascertaining what the required
 281  * "follow" character is. This should probably be moved to regex compile time
 282  * although it may be done at run time beause of the REF possibility - more
 283  * investigation required. -- demerphq
 284 */
 285 #define JUMPABLE(rn) (      \
 286     OP(rn) == OPEN ||       \
 287     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 288     OP(rn) == EVAL ||   \
 289     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 290     OP(rn) == PLUS || OP(rn) == MINMOD || \
 291     OP(rn) == KEEPS || \
 292     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 293 )
 294 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 295
 296 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 297
 298 #if 0
 299 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 300    we don't need this definition. */
 301 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 302 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 304
 305 #else
 306 /* ... so we use this as its faster. */
 307 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 308 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 309 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 310 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 311
 312 #endif
 313
 314 /*
 315   Search for mandatory following text node; for lookahead, the text must
 316   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 317 */
 318 #define FIND_NEXT_IMPT(rn) STMT_START { \
 319     while (JUMPABLE(rn)) { \
 320         const OPCODE type = OP(rn); \
 321         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 322             rn = NEXTOPER(NEXTOPER(rn)); \
 323         else if (type == PLUS) \
 324             rn = NEXTOPER(rn); \
 325         else if (type == IFMATCH) \
 326             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 327         else rn += NEXT_OFF(rn); \
 328     } \
 329 } STMT_END
 330
 331
 332 static void restore_pos(pTHX_ void *arg);
 333
 334 #define REGCP_PAREN_ELEMS 3
 335 #define REGCP_OTHER_ELEMS 3
 336 #define REGCP_FRAME_ELEMS 1
 337 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 338  * are needed for the regexp context stack bookkeeping. */
 339
 340 STATIC CHECKPOINT
 341 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 342 {
 343     dVAR;
 344     const int retval = PL_savestack_ix;
 345     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 346     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 347     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 348     I32 p;
 349     GET_RE_DEBUG_FLAGS_DECL;
 350
 351     PERL_ARGS_ASSERT_REGCPPUSH;
 352
 353     if (paren_elems_to_push < 0)
 354         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 355                    paren_elems_to_push);
 356
 357     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 358         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 359                    " out of range (%lu-%ld)",
 360                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 361
 362     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 363
 364     DEBUG_BUFFERS_r(
 365         if ((int)PL_regsize > (int)parenfloor)
 366             PerlIO_printf(Perl_debug_log,
 367                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 368                 PTR2UV(rex),
 369                 PTR2UV(rex->offs)
 370             );
 371     );
 372     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 373 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 374         SSPUSHINT(rex->offs[p].end);
 375         SSPUSHINT(rex->offs[p].start);
 376         SSPUSHINT(rex->offs[p].start_tmp);
 377         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 378             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 379             (UV)p,
 380             (IV)rex->offs[p].start,
 381             (IV)rex->offs[p].start_tmp,
 382             (IV)rex->offs[p].end
 383         ));
 384     }
 385 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 386     SSPUSHINT(PL_regsize);
 387     SSPUSHINT(rex->lastparen);
 388     SSPUSHINT(rex->lastcloseparen);
 389     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 390
 391     return retval;
 392 }
 393
 394 /* These are needed since we do not localize EVAL nodes: */
 395 #define REGCP_SET(cp)                                           \
 396     DEBUG_STATE_r(                                              \
 397             PerlIO_printf(Perl_debug_log,                       \
 398                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 399                 (IV)PL_savestack_ix));                          \
 400     cp = PL_savestack_ix
 401
 402 #define REGCP_UNWIND(cp)                                        \
 403     DEBUG_STATE_r(                                              \
 404         if (cp != PL_savestack_ix)                              \
 405             PerlIO_printf(Perl_debug_log,                       \
 406                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 407                 (IV)(cp), (IV)PL_savestack_ix));                \
 408     regcpblow(cp)
 409
 410 #define UNWIND_PAREN(lp, lcp)               \
 411     for (n = rex->lastparen; n > lp; n--)   \
 412         rex->offs[n].end = -1;              \
 413     rex->lastparen = n;                     \
 414     rex->lastcloseparen = lcp;
 415
 416
 417 STATIC void
 418 S_regcppop(pTHX_ regexp *rex)
 419 {
 420     dVAR;
 421     UV i;
 422     U32 paren;
 423     GET_RE_DEBUG_FLAGS_DECL;
 424
 425     PERL_ARGS_ASSERT_REGCPPOP;
 426
 427     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 428     i = SSPOPUV;
 429     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 430     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 431     rex->lastcloseparen = SSPOPINT;
 432     rex->lastparen = SSPOPINT;
 433     PL_regsize = SSPOPINT;
 434
 435     i -= REGCP_OTHER_ELEMS;
 436     /* Now restore the parentheses context. */
 437     DEBUG_BUFFERS_r(
 438         if (i || rex->lastparen + 1 <= rex->nparens)
 439             PerlIO_printf(Perl_debug_log,
 440                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 441                 PTR2UV(rex),
 442                 PTR2UV(rex->offs)
 443             );
 444     );
 445     paren = PL_regsize;
 446     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 447         I32 tmps;
 448         rex->offs[paren].start_tmp = SSPOPINT;
 449         rex->offs[paren].start = SSPOPINT;
 450         tmps = SSPOPINT;
 451         if (paren <= rex->lastparen)
 452             rex->offs[paren].end = tmps;
 453         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 454             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 455             (UV)paren,
 456             (IV)rex->offs[paren].start,
 457             (IV)rex->offs[paren].start_tmp,
 458             (IV)rex->offs[paren].end,
 459             (paren > rex->lastparen ? "(skipped)" : ""));
 460         );
 461         paren--;
 462     }
 463 #if 1
 464     /* It would seem that the similar code in regtry()
 465      * already takes care of this, and in fact it is in
 466      * a better location to since this code can #if 0-ed out
 467      * but the code in regtry() is needed or otherwise tests
 468      * requiring null fields (pat.t#187 and split.t#{13,14}
 469      * (as of patchlevel 7877)  will fail.  Then again,
 470      * this code seems to be necessary or otherwise
 471      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 472      * --jhi updated by dapm */
 473     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 474         if (i > PL_regsize)
 475             rex->offs[i].start = -1;
 476         rex->offs[i].end = -1;
 477         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 478             "    \\%"UVuf": %s   ..-1 undeffing\n",
 479             (UV)i,
 480             (i > PL_regsize) ? "-1" : "  "
 481         ));
 482     }
 483 #endif
 484 }
 485
 486 /* restore the parens and associated vars at savestack position ix,
 487  * but without popping the stack */
 488
 489 STATIC void
 490 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 491 {
 492     I32 tmpix = PL_savestack_ix;
 493     PL_savestack_ix = ix;
 494     regcppop(rex);
 495     PL_savestack_ix = tmpix;
 496 }
 497
 498 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 499
 500 /*
 501  * pregexec and friends
 502  */
 503
 504 #ifndef PERL_IN_XSUB_RE
 505 /*
 506  - pregexec - match a regexp against a string
 507  */
 508 I32
 509 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 510          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 511 /* stringarg: the point in the string at which to begin matching */
 512 /* strend:    pointer to null at end of string */
 513 /* strbeg:    real beginning of string */
 514 /* minend:    end of match must be >= minend bytes after stringarg. */
 515 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 516  *            itself is accessed via the pointers above */
 517 /* nosave:    For optimizations. */
 518 {
 519     PERL_ARGS_ASSERT_PREGEXEC;
 520
 521     return
 522         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 523                       nosave ? 0 : REXEC_COPY_STR);
 524 }
 525 #endif
 526
 527 /*
 528  * Need to implement the following flags for reg_anch:
 529  *
 530  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 531  * USE_INTUIT_ML
 532  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 533  * INTUIT_AUTORITATIVE_ML
 534  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 535  * INTUIT_ONCE_ML
 536  *
 537  * Another flag for this function: SECOND_TIME (so that float substrs
 538  * with giant delta may be not rechecked).
 539  */
 540
 541 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 542
 543 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 544    Otherwise, only SvCUR(sv) is used to get strbeg. */
 545
 546 /* XXXX We assume that strpos is strbeg unless sv. */
 547
 548 /* XXXX Some places assume that there is a fixed substring.
 549         An update may be needed if optimizer marks as "INTUITable"
 550         RExen without fixed substrings.  Similarly, it is assumed that
 551         lengths of all the strings are no more than minlen, thus they
 552         cannot come from lookahead.
 553         (Or minlen should take into account lookahead.)
 554   NOTE: Some of this comment is not correct. minlen does now take account
 555   of lookahead/behind. Further research is required. -- demerphq
 556
 557 */
 558
 559 /* A failure to find a constant substring means that there is no need to make
 560    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 561    finding a substring too deep into the string means that less calls to
 562    regtry() should be needed.
 563
 564    REx compiler's optimizer found 4 possible hints:
 565         a) Anchored substring;
 566         b) Fixed substring;
 567         c) Whether we are anchored (beginning-of-line or \G);
 568         d) First node (of those at offset 0) which may distinguish positions;
 569    We use a)b)d) and multiline-part of c), and try to find a position in the
 570    string which does not contradict any of them.
 571  */
 572
 573 /* Most of decisions we do here should have been done at compile time.
 574    The nodes of the REx which we used for the search should have been
 575    deleted from the finite automaton. */
 576
 577 char *
 578 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 579                      char *strend, const U32 flags, re_scream_pos_data *data)
 580 {
 581     dVAR;
 582     struct regexp *const prog = ReANY(rx);
 583     I32 start_shift = 0;
 584     /* Should be nonnegative! */
 585     I32 end_shift   = 0;
 586     char *s;
 587     SV *check;
 588     char *strbeg;
 589     char *t;
 590     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 591     I32 ml_anch;
 592     char *other_last = NULL;    /* other substr checked before this */
 593     char *check_at = NULL;              /* check substr found at this pos */
 594     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 595     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 596     RXi_GET_DECL(prog,progi);
 597 #ifdef DEBUGGING
 598     const char * const i_strpos = strpos;
 599 #endif
 600     GET_RE_DEBUG_FLAGS_DECL;
 601
 602     PERL_ARGS_ASSERT_RE_INTUIT_START;
 603     PERL_UNUSED_ARG(flags);
 604     PERL_UNUSED_ARG(data);
 605
 606     RX_MATCH_UTF8_set(rx,utf8_target);
 607
 608     if (RX_UTF8(rx)) {
 609         PL_reg_flags |= RF_utf8;
 610     }
 611     DEBUG_EXECUTE_r(
 612         debug_start_match(rx, utf8_target, strpos, strend,
 613             sv ? "Guessing start of match in sv for"
 614                : "Guessing start of match in string for");
 615               );
 616
 617     /* CHR_DIST() would be more correct here but it makes things slow. */
 618     if (prog->minlen > strend - strpos) {
 619         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 620                               "String too short... [re_intuit_start]\n"));
 621         goto fail;
 622     }
 623
 624     /* XXX we need to pass strbeg as a separate arg: the following is
 625      * guesswork and can be wrong... */
 626     if (sv && SvPOK(sv)) {
 627         char * p   = SvPVX(sv);
 628         STRLEN cur = SvCUR(sv);
 629         if (p <= strpos && strpos < p + cur) {
 630             strbeg = p;
 631             assert(p <= strend && strend <= p + cur);
 632         }
 633         else
 634             strbeg = strend - cur;
 635     }
 636     else
 637         strbeg = strpos;
 638
 639     PL_regeol = strend;
 640     if (utf8_target) {
 641         if (!prog->check_utf8 && prog->check_substr)
 642             to_utf8_substr(prog);
 643         check = prog->check_utf8;
 644     } else {
 645         if (!prog->check_substr && prog->check_utf8) {
 646             if (! to_byte_substr(prog)) {
 647                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 648             }
 649         }
 650         check = prog->check_substr;
 651     }
 652     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 653         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 654                      || ( (prog->extflags & RXf_ANCH_BOL)
 655                           && !multiline ) );    /* Check after \n? */
 656
 657         if (!ml_anch) {
 658           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 659                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 660                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 661                && sv && !SvROK(sv)
 662                && (strpos != strbeg)) {
 663               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 664               goto fail;
 665           }
 666           if (prog->check_offset_min == prog->check_offset_max
 667               && !(prog->extflags & RXf_CANY_SEEN)
 668               && ! multiline)   /* /m can cause \n's to match that aren't
 669                                    accounted for in the string max length.
 670                                    See [perl #115242] */
 671           {
 672             /* Substring at constant offset from beg-of-str... */
 673             I32 slen;
 674
 675             s = HOP3c(strpos, prog->check_offset_min, strend);
 676
 677             if (SvTAIL(check)) {
 678                 slen = SvCUR(check);    /* >= 1 */
 679
 680                 if ( strend - s > slen || strend - s < slen - 1
 681                      || (strend - s == slen && strend[-1] != '\n')) {
 682                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 683                     goto fail_finish;
 684                 }
 685                 /* Now should match s[0..slen-2] */
 686                 slen--;
 687                 if (slen && (*SvPVX_const(check) != *s
 688                              || (slen > 1
 689                                  && memNE(SvPVX_const(check), s, slen)))) {
 690                   report_neq:
 691                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 692                     goto fail_finish;
 693                 }
 694             }
 695             else if (*SvPVX_const(check) != *s
 696                      || ((slen = SvCUR(check)) > 1
 697                          && memNE(SvPVX_const(check), s, slen)))
 698                 goto report_neq;
 699             check_at = s;
 700             goto success_at_start;
 701           }
 702         }
 703         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 704         s = strpos;
 705         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 706         end_shift = prog->check_end_shift;
 707
 708         if (!ml_anch) {
 709             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 710                                          - (SvTAIL(check) != 0);
 711             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 712
 713             if (end_shift < eshift)
 714                 end_shift = eshift;
 715         }
 716     }
 717     else {                              /* Can match at random position */
 718         ml_anch = 0;
 719         s = strpos;
 720         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 721         end_shift = prog->check_end_shift;
 722
 723         /* end shift should be non negative here */
 724     }
 725
 726 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 727     if (end_shift < 0)
 728         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 729                    (IV)end_shift, RX_PRECOMP(prog));
 730 #endif
 731
 732   restart:
 733     /* Find a possible match in the region s..strend by looking for
 734        the "check" substring in the region corrected by start/end_shift. */
 735
 736     {
 737         I32 srch_start_shift = start_shift;
 738         I32 srch_end_shift = end_shift;
 739         U8* start_point;
 740         U8* end_point;
 741         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 742             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 743             srch_start_shift = strbeg - s;
 744         }
 745     DEBUG_OPTIMISE_MORE_r({
 746         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 747             (IV)prog->check_offset_min,
 748             (IV)srch_start_shift,
 749             (IV)srch_end_shift,
 750             (IV)prog->check_end_shift);
 751     });
 752
 753         if (prog->extflags & RXf_CANY_SEEN) {
 754             start_point= (U8*)(s + srch_start_shift);
 755             end_point= (U8*)(strend - srch_end_shift);
 756         } else {
 757             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 758             end_point= HOP3(strend, -srch_end_shift, strbeg);
 759         }
 760         DEBUG_OPTIMISE_MORE_r({
 761             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 762                 (int)(end_point - start_point),
 763                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 764                 start_point);
 765         });
 766
 767         s = fbm_instr( start_point, end_point,
 768                       check, multiline ? FBMrf_MULTILINE : 0);
 769     }
 770     /* Update the count-of-usability, remove useless subpatterns,
 771         unshift s.  */
 772
 773     DEBUG_EXECUTE_r({
 774         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 775             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 776         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 777                           (s ? "Found" : "Did not find"),
 778             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 779                 ? "anchored" : "floating"),
 780             quoted,
 781             RE_SV_TAIL(check),
 782             (s ? " at offset " : "...\n") );
 783     });
 784
 785     if (!s)
 786         goto fail_finish;
 787     /* Finish the diagnostic message */
 788     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 789
 790     /* XXX dmq: first branch is for positive lookbehind...
 791        Our check string is offset from the beginning of the pattern.
 792        So we need to do any stclass tests offset forward from that
 793        point. I think. :-(
 794      */
 795
 796
 797
 798     check_at=s;
 799
 800
 801     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 802        Start with the other substr.
 803        XXXX no SCREAM optimization yet - and a very coarse implementation
 804        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 805                 *always* match.  Probably should be marked during compile...
 806        Probably it is right to do no SCREAM here...
 807      */
 808
 809     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 810                 : (prog->float_substr && prog->anchored_substr))
 811     {
 812         /* Take into account the "other" substring. */
 813         /* XXXX May be hopelessly wrong for UTF... */
 814         if (!other_last)
 815             other_last = strpos;
 816         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 817           do_other_anchored:
 818             {
 819                 char * const last = HOP3c(s, -start_shift, strbeg);
 820                 char *last1, *last2;
 821                 char * const saved_s = s;
 822                 SV* must;
 823
 824                 t = s - prog->check_offset_max;
 825                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 826                     && (!utf8_target
 827                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 828                             && t > strpos)))
 829                     NOOP;
 830                 else
 831                     t = strpos;
 832                 t = HOP3c(t, prog->anchored_offset, strend);
 833                 if (t < other_last)     /* These positions already checked */
 834                     t = other_last;
 835                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 836                 if (last < last1)
 837                     last1 = last;
 838                 /* XXXX It is not documented what units *_offsets are in.
 839                    We assume bytes, but this is clearly wrong.
 840                    Meaning this code needs to be carefully reviewed for errors.
 841                    dmq.
 842                   */
 843
 844                 /* On end-of-str: see comment below. */
 845                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 846                 if (must == &PL_sv_undef) {
 847                     s = (char*)NULL;
 848                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 849                 }
 850                 else
 851                     s = fbm_instr(
 852                         (unsigned char*)t,
 853                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 854                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 855                         must,
 856                         multiline ? FBMrf_MULTILINE : 0
 857                     );
 858                 DEBUG_EXECUTE_r({
 859                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 860                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 861                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 862                         (s ? "Found" : "Contradicts"),
 863                         quoted, RE_SV_TAIL(must));
 864                 });
 865
 866
 867                 if (!s) {
 868                     if (last1 >= last2) {
 869                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 870                                                 ", giving up...\n"));
 871                         goto fail_finish;
 872                     }
 873                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 874                         ", trying floating at offset %ld...\n",
 875                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 876                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 877                     s = HOP3c(last, 1, strend);
 878                     goto restart;
 879                 }
 880                 else {
 881                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 882                           (long)(s - i_strpos)));
 883                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 884                     other_last = HOP3c(s, 1, strend);
 885                     s = saved_s;
 886                     if (t == strpos)
 887                         goto try_at_start;
 888                     goto try_at_offset;
 889                 }
 890             }
 891         }
 892         else {          /* Take into account the floating substring. */
 893             char *last, *last1;
 894             char * const saved_s = s;
 895             SV* must;
 896
 897             t = HOP3c(s, -start_shift, strbeg);
 898             last1 = last =
 899                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 900             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 901                 last = HOP3c(t, prog->float_max_offset, strend);
 902             s = HOP3c(t, prog->float_min_offset, strend);
 903             if (s < other_last)
 904                 s = other_last;
 905  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 906             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 907             /* fbm_instr() takes into account exact value of end-of-str
 908                if the check is SvTAIL(ed).  Since false positives are OK,
 909                and end-of-str is not later than strend we are OK. */
 910             if (must == &PL_sv_undef) {
 911                 s = (char*)NULL;
 912                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 913             }
 914             else
 915                 s = fbm_instr((unsigned char*)s,
 916                               (unsigned char*)last + SvCUR(must)
 917                                   - (SvTAIL(must)!=0),
 918                               must, multiline ? FBMrf_MULTILINE : 0);
 919             DEBUG_EXECUTE_r({
 920                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 921                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 922                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 923                     (s ? "Found" : "Contradicts"),
 924                     quoted, RE_SV_TAIL(must));
 925             });
 926             if (!s) {
 927                 if (last1 == last) {
 928                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 929                                             ", giving up...\n"));
 930                     goto fail_finish;
 931                 }
 932                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 933                     ", trying anchored starting at offset %ld...\n",
 934                     (long)(saved_s + 1 - i_strpos)));
 935                 other_last = last;
 936                 s = HOP3c(t, 1, strend);
 937                 goto restart;
 938             }
 939             else {
 940                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 941                       (long)(s - i_strpos)));
 942                 other_last = s; /* Fix this later. --Hugo */
 943                 s = saved_s;
 944                 if (t == strpos)
 945                     goto try_at_start;
 946                 goto try_at_offset;
 947             }
 948         }
 949     }
 950
 951
 952     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 953
 954     DEBUG_OPTIMISE_MORE_r(
 955         PerlIO_printf(Perl_debug_log,
 956             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 957             (IV)prog->check_offset_min,
 958             (IV)prog->check_offset_max,
 959             (IV)(s-strpos),
 960             (IV)(t-strpos),
 961             (IV)(t-s),
 962             (IV)(strend-strpos)
 963         )
 964     );
 965
 966     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 967         && (!utf8_target
 968             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 969                  && t > strpos)))
 970     {
 971         /* Fixed substring is found far enough so that the match
 972            cannot start at strpos. */
 973       try_at_offset:
 974         if (ml_anch && t[-1] != '\n') {
 975             /* Eventually fbm_*() should handle this, but often
 976                anchored_offset is not 0, so this check will not be wasted. */
 977             /* XXXX In the code below we prefer to look for "^" even in
 978                presence of anchored substrings.  And we search even
 979                beyond the found float position.  These pessimizations
 980                are historical artefacts only.  */
 981           find_anchor:
 982             while (t < strend - prog->minlen) {
 983                 if (*t == '\n') {
 984                     if (t < check_at - prog->check_offset_min) {
 985                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 986                             /* Since we moved from the found position,
 987                                we definitely contradict the found anchored
 988                                substr.  Due to the above check we do not
 989                                contradict "check" substr.
 990                                Thus we can arrive here only if check substr
 991                                is float.  Redo checking for "other"=="fixed".
 992                              */
 993                             strpos = t + 1;
 994                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 995                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 996                             goto do_other_anchored;
 997                         }
 998                         /* We don't contradict the found floating substring. */
 999                         /* XXXX Why not check for STCLASS? */
1000                         s = t + 1;
1001                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1002                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1003                         goto set_useful;
1004                     }
1005                     /* Position contradicts check-string */
1006                     /* XXXX probably better to look for check-string
1007                        than for "\n", so one should lower the limit for t? */
1008                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1009                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1010                     other_last = strpos = s = t + 1;
1011                     goto restart;
1012                 }
1013                 t++;
1014             }
1015             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1016                         PL_colors[0], PL_colors[1]));
1017             goto fail_finish;
1018         }
1019         else {
1020             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1021                         PL_colors[0], PL_colors[1]));
1022         }
1023         s = t;
1024       set_useful:
1025         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1026     }
1027     else {
1028         /* The found string does not prohibit matching at strpos,
1029            - no optimization of calling REx engine can be performed,
1030            unless it was an MBOL and we are not after MBOL,
1031            or a future STCLASS check will fail this. */
1032       try_at_start:
1033         /* Even in this situation we may use MBOL flag if strpos is offset
1034            wrt the start of the string. */
1035         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1036             && (strpos != strbeg) && strpos[-1] != '\n'
1037             /* May be due to an implicit anchor of m{.*foo}  */
1038             && !(prog->intflags & PREGf_IMPLICIT))
1039         {
1040             t = strpos;
1041             goto find_anchor;
1042         }
1043         DEBUG_EXECUTE_r( if (ml_anch)
1044             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1045                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1046         );
1047       success_at_start:
1048         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1049             && (utf8_target ? (
1050                 prog->check_utf8                /* Could be deleted already */
1051                 && --BmUSEFUL(prog->check_utf8) < 0
1052                 && (prog->check_utf8 == prog->float_utf8)
1053             ) : (
1054                 prog->check_substr              /* Could be deleted already */
1055                 && --BmUSEFUL(prog->check_substr) < 0
1056                 && (prog->check_substr == prog->float_substr)
1057             )))
1058         {
1059             /* If flags & SOMETHING - do not do it many times on the same match */
1060             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1061             /* XXX Does the destruction order has to change with utf8_target? */
1062             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1063             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1064             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1065             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1066             check = NULL;                       /* abort */
1067             s = strpos;
1068             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1069                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1070             if (prog->intflags & PREGf_IMPLICIT)
1071                 prog->extflags &= ~RXf_ANCH_MBOL;
1072             /* XXXX This is a remnant of the old implementation.  It
1073                     looks wasteful, since now INTUIT can use many
1074                     other heuristics. */
1075             prog->extflags &= ~RXf_USE_INTUIT;
1076             /* XXXX What other flags might need to be cleared in this branch? */
1077         }
1078         else
1079             s = strpos;
1080     }
1081
1082     /* Last resort... */
1083     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1084     /* trie stclasses are too expensive to use here, we are better off to
1085        leave it to regmatch itself */
1086     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1087         /* minlen == 0 is possible if regstclass is \b or \B,
1088            and the fixed substr is ''$.
1089            Since minlen is already taken into account, s+1 is before strend;
1090            accidentally, minlen >= 1 guaranties no false positives at s + 1
1091            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1092            regstclass does not come from lookahead...  */
1093         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1094            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1095         const U8* const str = (U8*)STRING(progi->regstclass);
1096         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1097                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1098                     : 1);
1099         char * endpos;
1100         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1101             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1102         else if (prog->float_substr || prog->float_utf8)
1103             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1104         else
1105             endpos= strend;
1106
1107         if (checked_upto < s)
1108            checked_upto = s;
1109         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1110                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1111
1112         t = s;
1113         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1114         if (s) {
1115             checked_upto = s;
1116         } else {
1117 #ifdef DEBUGGING
1118             const char *what = NULL;
1119 #endif
1120             if (endpos == strend) {
1121                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1122                                 "Could not match STCLASS...\n") );
1123                 goto fail;
1124             }
1125             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126                                    "This position contradicts STCLASS...\n") );
1127             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1128                 goto fail;
1129             checked_upto = HOPBACKc(endpos, start_shift);
1130             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1132             /* Contradict one of substrings */
1133             if (prog->anchored_substr || prog->anchored_utf8) {
1134                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1135                     DEBUG_EXECUTE_r( what = "anchored" );
1136                   hop_and_restart:
1137                     s = HOP3c(t, 1, strend);
1138                     if (s + start_shift + end_shift > strend) {
1139                         /* XXXX Should be taken into account earlier? */
1140                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141                                                "Could not match STCLASS...\n") );
1142                         goto fail;
1143                     }
1144                     if (!check)
1145                         goto giveup;
1146                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147                                 "Looking for %s substr starting at offset %ld...\n",
1148                                  what, (long)(s + start_shift - i_strpos)) );
1149                     goto restart;
1150                 }
1151                 /* Have both, check_string is floating */
1152                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1153                     goto retry_floating_check;
1154                 /* Recheck anchored substring, but not floating... */
1155                 s = check_at;
1156                 if (!check)
1157                     goto giveup;
1158                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1159                           "Looking for anchored substr starting at offset %ld...\n",
1160                           (long)(other_last - i_strpos)) );
1161                 goto do_other_anchored;
1162             }
1163             /* Another way we could have checked stclass at the
1164                current position only: */
1165             if (ml_anch) {
1166                 s = t = t + 1;
1167                 if (!check)
1168                     goto giveup;
1169                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1170                           "Looking for /%s^%s/m starting at offset %ld...\n",
1171                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1172                 goto try_at_offset;
1173             }
1174             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1175                 goto fail;
1176             /* Check is floating substring. */
1177           retry_floating_check:
1178             t = check_at - start_shift;
1179             DEBUG_EXECUTE_r( what = "floating" );
1180             goto hop_and_restart;
1181         }
1182         if (t != s) {
1183             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1184                         "By STCLASS: moving %ld --> %ld\n",
1185                                   (long)(t - i_strpos), (long)(s - i_strpos))
1186                    );
1187         }
1188         else {
1189             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1190                                   "Does not contradict STCLASS...\n");
1191                    );
1192         }
1193     }
1194   giveup:
1195     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1196                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1197                           PL_colors[5], (long)(s - i_strpos)) );
1198     return s;
1199
1200   fail_finish:                          /* Substring not found */
1201     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1202         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1203   fail:
1204     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1205                           PL_colors[4], PL_colors[5]));
1206     return NULL;
1207 }
1208
1209 #define DECL_TRIE_TYPE(scan) \
1210     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1211                     trie_type = ((scan->flags == EXACT) \
1212                               ? (utf8_target ? trie_utf8 : trie_plain) \
1213                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1214
1215 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1216 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1217     STRLEN skiplen;                                                                 \
1218     switch (trie_type) {                                                            \
1219     case trie_utf8_fold:                                                            \
1220         if ( foldlen>0 ) {                                                          \
1221             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1222             foldlen -= len;                                                         \
1223             uscan += len;                                                           \
1224             len=0;                                                                  \
1225         } else {                                                                    \
1226             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1227             len = UTF8SKIP(uc);                                                     \
1228             skiplen = UNISKIP( uvc );                                               \
1229             foldlen -= skiplen;                                                     \
1230             uscan = foldbuf + skiplen;                                              \
1231         }                                                                           \
1232         break;                                                                      \
1233     case trie_latin_utf8_fold:                                                      \
1234         if ( foldlen>0 ) {                                                          \
1235             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236             foldlen -= len;                                                         \
1237             uscan += len;                                                           \
1238             len=0;                                                                  \
1239         } else {                                                                    \
1240             len = 1;                                                                \
1241             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1242             skiplen = UNISKIP( uvc );                                               \
1243             foldlen -= skiplen;                                                     \
1244             uscan = foldbuf + skiplen;                                              \
1245         }                                                                           \
1246         break;                                                                      \
1247     case trie_utf8:                                                                 \
1248         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1249         break;                                                                      \
1250     case trie_plain:                                                                \
1251         uvc = (UV)*uc;                                                              \
1252         len = 1;                                                                    \
1253     }                                                                               \
1254     if (uvc < 256) {                                                                \
1255         charid = trie->charmap[ uvc ];                                              \
1256     }                                                                               \
1257     else {                                                                          \
1258         charid = 0;                                                                 \
1259         if (widecharmap) {                                                          \
1260             SV** const svpp = hv_fetch(widecharmap,                                 \
1261                         (char*)&uvc, sizeof(UV), 0);                                \
1262             if (svpp)                                                               \
1263                 charid = (U16)SvIV(*svpp);                                          \
1264         }                                                                           \
1265     }                                                                               \
1266 } STMT_END
1267
1268 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1269 STMT_START {                                              \
1270     while (s <= e) {                                      \
1271         if ( (CoNd)                                       \
1272              && (ln == 1 || folder(s, pat_string, ln))    \
1273              && (!reginfo || regtry(reginfo, &s)) )       \
1274             goto got_it;                                  \
1275         s++;                                              \
1276     }                                                     \
1277 } STMT_END
1278
1279 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1280 STMT_START {                                          \
1281     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1282         CoDe                                          \
1283         s += uskip;                                   \
1284     }                                                 \
1285 } STMT_END
1286
1287 #define REXEC_FBC_SCAN(CoDe)                          \
1288 STMT_START {                                          \
1289     while (s < strend) {                              \
1290         CoDe                                          \
1291         s++;                                          \
1292     }                                                 \
1293 } STMT_END
1294
1295 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1296 REXEC_FBC_UTF8_SCAN(                                  \
1297     if (CoNd) {                                       \
1298         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1299             goto got_it;                              \
1300         else                                          \
1301             tmp = doevery;                            \
1302     }                                                 \
1303     else                                              \
1304         tmp = 1;                                      \
1305 )
1306
1307 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1308 REXEC_FBC_SCAN(                                       \
1309     if (CoNd) {                                       \
1310         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1311             goto got_it;                              \
1312         else                                          \
1313             tmp = doevery;                            \
1314     }                                                 \
1315     else                                              \
1316         tmp = 1;                                      \
1317 )
1318
1319 #define REXEC_FBC_TRYIT               \
1320 if ((!reginfo || regtry(reginfo, &s))) \
1321     goto got_it
1322
1323 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1324     if (utf8_target) {                                             \
1325         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1326     }                                                          \
1327     else {                                                     \
1328         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1329     }
1330
1331 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1332     if (utf8_target) {                                             \
1333         UtFpReLoAd;                                            \
1334         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1335     }                                                          \
1336     else {                                                     \
1337         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1338     }
1339
1340 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1341     PL_reg_flags |= RF_tainted;                                \
1342     if (utf8_target) {                                             \
1343         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1344     }                                                          \
1345     else {                                                     \
1346         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1347     }
1348
1349 #define DUMP_EXEC_POS(li,s,doutf8) \
1350     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1351
1352
1353 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1354         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1355         tmp = TEST_NON_UTF8(tmp);                                              \
1356         REXEC_FBC_UTF8_SCAN(                                                   \
1357             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1358                 tmp = !tmp;                                                    \
1359                 IF_SUCCESS;                                                    \
1360             }                                                                  \
1361             else {                                                             \
1362                 IF_FAIL;                                                       \
1363             }                                                                  \
1364         );                                                                     \
1365
1366 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1367         if (s == PL_bostr) {                                                   \
1368             tmp = '\n';                                                        \
1369         }                                                                      \
1370         else {                                                                 \
1371             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1372             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1373         }                                                                      \
1374         tmp = TeSt1_UtF8;                                                      \
1375         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1376         REXEC_FBC_UTF8_SCAN(                                                   \
1377             if (tmp == ! (TeSt2_UtF8)) { \
1378                 tmp = !tmp;                                                    \
1379                 IF_SUCCESS;                                                    \
1380             }                                                                  \
1381             else {                                                             \
1382                 IF_FAIL;                                                       \
1383             }                                                                  \
1384         );                                                                     \
1385
1386 /* The only difference between the BOUND and NBOUND cases is that
1387  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1388  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1389  * with the other one being empty */
1390 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1392
1393 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1395
1396 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1397     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1398
1399 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1400     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1401
1402
1403 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1404  * be passed in completely with the variable name being tested, which isn't
1405  * such a clean interface, but this is easier to read than it was before.  We
1406  * are looking for the boundary (or non-boundary between a word and non-word
1407  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1408  * must be different.  Find the "wordness" of the character just prior to this
1409  * one, and compare it with the wordness of this one.  If they differ, we have
1410  * a boundary.  At the beginning of the string, pretend that the previous
1411  * character was a new-line */
1412 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1413     if (utf8_target) {                                                         \
1414                 UTF8_CODE \
1415     }                                                                          \
1416     else {  /* Not utf8 */                                                     \
1417         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1418         tmp = TEST_NON_UTF8(tmp);                                              \
1419         REXEC_FBC_SCAN(                                                        \
1420             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1421                 tmp = !tmp;                                                    \
1422                 IF_SUCCESS;                                                    \
1423             }                                                                  \
1424             else {                                                             \
1425                 IF_FAIL;                                                       \
1426             }                                                                  \
1427         );                                                                     \
1428     }                                                                          \
1429     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1430         goto got_it;
1431
1432 /* We know what class REx starts with.  Try to find this position... */
1433 /* if reginfo is NULL, its a dryrun */
1434 /* annoyingly all the vars in this routine have different names from their counterparts
1435    in regmatch. /grrr */
1436
1437 STATIC char *
1438 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1439     const char *strend, regmatch_info *reginfo)
1440 {
1441         dVAR;
1442         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1443         char *pat_string;   /* The pattern's exactish string */
1444         char *pat_end;      /* ptr to end char of pat_string */
1445         re_fold_t folder;       /* Function for computing non-utf8 folds */
1446         const U8 *fold_array;   /* array for folding ords < 256 */
1447         STRLEN ln;
1448         STRLEN lnc;
1449         STRLEN uskip;
1450         U8 c1;
1451         U8 c2;
1452         char *e;
1453         I32 tmp = 1;    /* Scratch variable? */
1454         const bool utf8_target = PL_reg_match_utf8;
1455         UV utf8_fold_flags = 0;
1456         RXi_GET_DECL(prog,progi);
1457
1458         PERL_ARGS_ASSERT_FIND_BYCLASS;
1459
1460         /* We know what class it must start with. */
1461         switch (OP(c)) {
1462         case ANYOF:
1463             if (utf8_target) {
1464                 REXEC_FBC_UTF8_CLASS_SCAN(
1465                           reginclass(prog, c, (U8*)s, utf8_target));
1466             }
1467             else {
1468                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1469             }
1470             break;
1471         case CANY:
1472             REXEC_FBC_SCAN(
1473                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1474                     goto got_it;
1475                 else
1476                     tmp = doevery;
1477             );
1478             break;
1479
1480         case EXACTFA:
1481             if (UTF_PATTERN || utf8_target) {
1482                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1483                 goto do_exactf_utf8;
1484             }
1485             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1486             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1487             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1488
1489         case EXACTF:
1490             if (utf8_target) {
1491
1492                 /* regcomp.c already folded this if pattern is in UTF-8 */
1493                 utf8_fold_flags = 0;
1494                 goto do_exactf_utf8;
1495             }
1496             fold_array = PL_fold;
1497             folder = foldEQ;
1498             goto do_exactf_non_utf8;
1499
1500         case EXACTFL:
1501             if (UTF_PATTERN || utf8_target) {
1502                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1503                 goto do_exactf_utf8;
1504             }
1505             fold_array = PL_fold_locale;
1506             folder = foldEQ_locale;
1507             goto do_exactf_non_utf8;
1508
1509         case EXACTFU_SS:
1510             if (UTF_PATTERN) {
1511                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1512             }
1513             goto do_exactf_utf8;
1514
1515         case EXACTFU_TRICKYFOLD:
1516         case EXACTFU:
1517             if (UTF_PATTERN || utf8_target) {
1518                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1519                 goto do_exactf_utf8;
1520             }
1521
1522             /* Any 'ss' in the pattern should have been replaced by regcomp,
1523              * so we don't have to worry here about this single special case
1524              * in the Latin1 range */
1525             fold_array = PL_fold_latin1;
1526             folder = foldEQ_latin1;
1527
1528             /* FALL THROUGH */
1529
1530         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1531                                are no glitches with fold-length differences
1532                                between the target string and pattern */
1533
1534             /* The idea in the non-utf8 EXACTF* cases is to first find the
1535              * first character of the EXACTF* node and then, if necessary,
1536              * case-insensitively compare the full text of the node.  c1 is the
1537              * first character.  c2 is its fold.  This logic will not work for
1538              * Unicode semantics and the german sharp ss, which hence should
1539              * not be compiled into a node that gets here. */
1540             pat_string = STRING(c);
1541             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1542
1543             /* We know that we have to match at least 'ln' bytes (which is the
1544              * same as characters, since not utf8).  If we have to match 3
1545              * characters, and there are only 2 availabe, we know without
1546              * trying that it will fail; so don't start a match past the
1547              * required minimum number from the far end */
1548             e = HOP3c(strend, -((I32)ln), s);
1549
1550             if (!reginfo && e < s) {
1551                 e = s;                  /* Due to minlen logic of intuit() */
1552             }
1553
1554             c1 = *pat_string;
1555             c2 = fold_array[c1];
1556             if (c1 == c2) { /* If char and fold are the same */
1557                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1558             }
1559             else {
1560                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1561             }
1562             break;
1563
1564         do_exactf_utf8:
1565         {
1566             unsigned expansion;
1567
1568
1569             /* If one of the operands is in utf8, we can't use the simpler
1570              * folding above, due to the fact that many different characters
1571              * can have the same fold, or portion of a fold, or different-
1572              * length fold */
1573             pat_string = STRING(c);
1574             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1575             pat_end = pat_string + ln;
1576             lnc = (UTF_PATTERN) /* length to match in characters */
1577                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578                     : ln;
1579
1580             /* We have 'lnc' characters to match in the pattern, but because of
1581              * multi-character folding, each character in the target can match
1582              * up to 3 characters (Unicode guarantees it will never exceed
1583              * this) if it is utf8-encoded; and up to 2 if not (based on the
1584              * fact that the Latin 1 folds are already determined, and the
1585              * only multi-char fold in that range is the sharp-s folding to
1586              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1587              * string character.  Adjust lnc accordingly, rounding up, so that
1588              * if we need to match at least 4+1/3 chars, that really is 5. */
1589             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590             lnc = (lnc + expansion - 1) / expansion;
1591
1592             /* As in the non-UTF8 case, if we have to match 3 characters, and
1593              * only 2 are left, it's guaranteed to fail, so don't start a
1594              * match that would require us to go beyond the end of the string
1595              */
1596             e = HOP3c(strend, -((I32)lnc), s);
1597
1598             if (!reginfo && e < s) {
1599                 e = s;                  /* Due to minlen logic of intuit() */
1600             }
1601
1602             /* XXX Note that we could recalculate e to stop the loop earlier,
1603              * as the worst case expansion above will rarely be met, and as we
1604              * go along we would usually find that e moves further to the left.
1605              * This would happen only after we reached the point in the loop
1606              * where if there were no expansion we should fail.  Unclear if
1607              * worth the expense */
1608
1609             while (s <= e) {
1610                 char *my_strend= (char *)strend;
1611                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1612                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1613                     && (!reginfo || regtry(reginfo, &s)) )
1614                 {
1615                     goto got_it;
1616                 }
1617                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1618             }
1619             break;
1620         }
1621         case BOUNDL:
1622             PL_reg_flags |= RF_tainted;
1623             FBC_BOUND(isALNUM_LC,
1624                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625                       isALNUM_LC_utf8((U8*)s));
1626             break;
1627         case NBOUNDL:
1628             PL_reg_flags |= RF_tainted;
1629             FBC_NBOUND(isALNUM_LC,
1630                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631                        isALNUM_LC_utf8((U8*)s));
1632             break;
1633         case BOUND:
1634             FBC_BOUND(isWORDCHAR,
1635                       isALNUM_uni(tmp),
1636                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637             break;
1638         case BOUNDA:
1639             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640                              isWORDCHAR_A(tmp),
1641                              isWORDCHAR_A((U8*)s));
1642             break;
1643         case NBOUND:
1644             FBC_NBOUND(isWORDCHAR,
1645                        isALNUM_uni(tmp),
1646                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647             break;
1648         case NBOUNDA:
1649             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650                               isWORDCHAR_A(tmp),
1651                               isWORDCHAR_A((U8*)s));
1652             break;
1653         case BOUNDU:
1654             FBC_BOUND(isWORDCHAR_L1,
1655                       isALNUM_uni(tmp),
1656                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657             break;
1658         case NBOUNDU:
1659             FBC_NBOUND(isWORDCHAR_L1,
1660                        isALNUM_uni(tmp),
1661                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1662             break;
1663         case ALNUML:
1664             REXEC_FBC_CSCAN_TAINT(
1665                 isALNUM_LC_utf8((U8*)s),
1666                 isALNUM_LC(*s)
1667             );
1668             break;
1669         case ALNUMU:
1670             REXEC_FBC_CSCAN_PRELOAD(
1671                 LOAD_UTF8_CHARCLASS_ALNUM(),
1672                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1673                 isWORDCHAR_L1((U8) *s)
1674             );
1675             break;
1676         case ALNUM:
1677             REXEC_FBC_CSCAN_PRELOAD(
1678                 LOAD_UTF8_CHARCLASS_ALNUM(),
1679                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1680                 isWORDCHAR((U8) *s)
1681             );
1682             break;
1683         case ALNUMA:
1684             /* Don't need to worry about utf8, as it can match only a single
1685              * byte invariant character */
1686             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1687             break;
1688         case NALNUMU:
1689             REXEC_FBC_CSCAN_PRELOAD(
1690                 LOAD_UTF8_CHARCLASS_ALNUM(),
1691                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1692                 ! isWORDCHAR_L1((U8) *s)
1693             );
1694             break;
1695         case NALNUM:
1696             REXEC_FBC_CSCAN_PRELOAD(
1697                 LOAD_UTF8_CHARCLASS_ALNUM(),
1698                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1699                 ! isALNUM(*s)
1700             );
1701             break;
1702         case NALNUMA:
1703             REXEC_FBC_CSCAN(
1704                 !isWORDCHAR_A(*s),
1705                 !isWORDCHAR_A(*s)
1706             );
1707             break;
1708         case NALNUML:
1709             REXEC_FBC_CSCAN_TAINT(
1710                 !isALNUM_LC_utf8((U8*)s),
1711                 !isALNUM_LC(*s)
1712             );
1713             break;
1714         case SPACEU:
1715             REXEC_FBC_CSCAN_PRELOAD(
1716                 LOAD_UTF8_CHARCLASS_SPACE(),
1717                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1718                 isSPACE_L1((U8) *s)
1719             );
1720             break;
1721         case SPACE:
1722             REXEC_FBC_CSCAN_PRELOAD(
1723                 LOAD_UTF8_CHARCLASS_SPACE(),
1724                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1725                 isSPACE((U8) *s)
1726             );
1727             break;
1728         case SPACEA:
1729             /* Don't need to worry about utf8, as it can match only a single
1730              * byte invariant character */
1731             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1732             break;
1733         case SPACEL:
1734             REXEC_FBC_CSCAN_TAINT(
1735                 isSPACE_LC_utf8((U8*)s),
1736                 isSPACE_LC(*s)
1737             );
1738             break;
1739         case NSPACEU:
1740             REXEC_FBC_CSCAN_PRELOAD(
1741                 LOAD_UTF8_CHARCLASS_SPACE(),
1742                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1743                 ! isSPACE_L1((U8) *s)
1744             );
1745             break;
1746         case NSPACE:
1747             REXEC_FBC_CSCAN_PRELOAD(
1748                 LOAD_UTF8_CHARCLASS_SPACE(),
1749                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1750                 ! isSPACE((U8) *s)
1751             );
1752             break;
1753         case NSPACEA:
1754             REXEC_FBC_CSCAN(
1755                 !isSPACE_A(*s),
1756                 !isSPACE_A(*s)
1757             );
1758             break;
1759         case NSPACEL:
1760             REXEC_FBC_CSCAN_TAINT(
1761                 !isSPACE_LC_utf8((U8*)s),
1762                 !isSPACE_LC(*s)
1763             );
1764             break;
1765         case DIGIT:
1766             REXEC_FBC_CSCAN_PRELOAD(
1767                 LOAD_UTF8_CHARCLASS_DIGIT(),
1768                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1769                 isDIGIT(*s)
1770             );
1771             break;
1772         case DIGITA:
1773             /* Don't need to worry about utf8, as it can match only a single
1774              * byte invariant character */
1775             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1776             break;
1777         case DIGITL:
1778             REXEC_FBC_CSCAN_TAINT(
1779                 isDIGIT_LC_utf8((U8*)s),
1780                 isDIGIT_LC(*s)
1781             );
1782             break;
1783         case NDIGIT:
1784             REXEC_FBC_CSCAN_PRELOAD(
1785                 LOAD_UTF8_CHARCLASS_DIGIT(),
1786                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1787                 !isDIGIT(*s)
1788             );
1789             break;
1790         case NDIGITA:
1791             REXEC_FBC_CSCAN(
1792                 !isDIGIT_A(*s),
1793                 !isDIGIT_A(*s)
1794             );
1795             break;
1796         case NDIGITL:
1797             REXEC_FBC_CSCAN_TAINT(
1798                 !isDIGIT_LC_utf8((U8*)s),
1799                 !isDIGIT_LC(*s)
1800             );
1801             break;
1802         case LNBREAK:
1803             REXEC_FBC_CSCAN(
1804                 is_LNBREAK_utf8_safe(s, strend),
1805                 is_LNBREAK_latin1_safe(s, strend)
1806             );
1807             break;
1808         case VERTWS:
1809             REXEC_FBC_CSCAN(
1810                 is_VERTWS_utf8_safe(s, strend),
1811                 is_VERTWS_latin1_safe(s, strend)
1812             );
1813             break;
1814         case NVERTWS:
1815             REXEC_FBC_CSCAN(
1816                 !is_VERTWS_utf8_safe(s, strend),
1817                 !is_VERTWS_latin1_safe(s, strend)
1818             );
1819             break;
1820         case HORIZWS:
1821             REXEC_FBC_CSCAN(
1822                 is_HORIZWS_utf8_safe(s, strend),
1823                 is_HORIZWS_latin1_safe(s, strend)
1824             );
1825             break;
1826         case NHORIZWS:
1827             REXEC_FBC_CSCAN(
1828                 !is_HORIZWS_utf8_safe(s, strend),
1829                 !is_HORIZWS_latin1_safe(s, strend)
1830             );
1831             break;
1832         case POSIXA:
1833             /* Don't need to worry about utf8, as it can match only a single
1834             * byte invariant character.  The flag in this node type is the
1835             * class number to pass to _generic_isCC() to build a mask for
1836             * searching in PL_charclass[] */
1837             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1838             break;
1839         case NPOSIXA:
1840             REXEC_FBC_CSCAN(
1841                 !_generic_isCC_A(*s, FLAGS(c)),
1842                 !_generic_isCC_A(*s, FLAGS(c))
1843             );
1844             break;
1845
1846         case AHOCORASICKC:
1847         case AHOCORASICK:
1848             {
1849                 DECL_TRIE_TYPE(c);
1850                 /* what trie are we using right now */
1851                 reg_ac_data *aho
1852                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1853                 reg_trie_data *trie
1854                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1855                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1856
1857                 const char *last_start = strend - trie->minlen;
1858 #ifdef DEBUGGING
1859                 const char *real_start = s;
1860 #endif
1861                 STRLEN maxlen = trie->maxlen;
1862                 SV *sv_points;
1863                 U8 **points; /* map of where we were in the input string
1864                                 when reading a given char. For ASCII this
1865                                 is unnecessary overhead as the relationship
1866                                 is always 1:1, but for Unicode, especially
1867                                 case folded Unicode this is not true. */
1868                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1869                 U8 *bitmap=NULL;
1870
1871
1872                 GET_RE_DEBUG_FLAGS_DECL;
1873
1874                 /* We can't just allocate points here. We need to wrap it in
1875                  * an SV so it gets freed properly if there is a croak while
1876                  * running the match */
1877                 ENTER;
1878                 SAVETMPS;
1879                 sv_points=newSV(maxlen * sizeof(U8 *));
1880                 SvCUR_set(sv_points,
1881                     maxlen * sizeof(U8 *));
1882                 SvPOK_on(sv_points);
1883                 sv_2mortal(sv_points);
1884                 points=(U8**)SvPV_nolen(sv_points );
1885                 if ( trie_type != trie_utf8_fold
1886                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1887                 {
1888                     if (trie->bitmap)
1889                         bitmap=(U8*)trie->bitmap;
1890                     else
1891                         bitmap=(U8*)ANYOF_BITMAP(c);
1892                 }
1893                 /* this is the Aho-Corasick algorithm modified a touch
1894                    to include special handling for long "unknown char"
1895                    sequences. The basic idea being that we use AC as long
1896                    as we are dealing with a possible matching char, when
1897                    we encounter an unknown char (and we have not encountered
1898                    an accepting state) we scan forward until we find a legal
1899                    starting char.
1900                    AC matching is basically that of trie matching, except
1901                    that when we encounter a failing transition, we fall back
1902                    to the current states "fail state", and try the current char
1903                    again, a process we repeat until we reach the root state,
1904                    state 1, or a legal transition. If we fail on the root state
1905                    then we can either terminate if we have reached an accepting
1906                    state previously, or restart the entire process from the beginning
1907                    if we have not.
1908
1909                  */
1910                 while (s <= last_start) {
1911                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1912                     U8 *uc = (U8*)s;
1913                     U16 charid = 0;
1914                     U32 base = 1;
1915                     U32 state = 1;
1916                     UV uvc = 0;
1917                     STRLEN len = 0;
1918                     STRLEN foldlen = 0;
1919                     U8 *uscan = (U8*)NULL;
1920                     U8 *leftmost = NULL;
1921 #ifdef DEBUGGING
1922                     U32 accepted_word= 0;
1923 #endif
1924                     U32 pointpos = 0;
1925
1926                     while ( state && uc <= (U8*)strend ) {
1927                         int failed=0;
1928                         U32 word = aho->states[ state ].wordnum;
1929
1930                         if( state==1 ) {
1931                             if ( bitmap ) {
1932                                 DEBUG_TRIE_EXECUTE_r(
1933                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1934                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1935                                             (char *)uc, utf8_target );
1936                                         PerlIO_printf( Perl_debug_log,
1937                                             " Scanning for legal start char...\n");
1938                                     }
1939                                 );
1940                                 if (utf8_target) {
1941                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1942                                         uc += UTF8SKIP(uc);
1943                                     }
1944                                 } else {
1945                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1946                                         uc++;
1947                                     }
1948                                 }
1949                                 s= (char *)uc;
1950                             }
1951                             if (uc >(U8*)last_start) break;
1952                         }
1953
1954                         if ( word ) {
1955                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1956                             if (!leftmost || lpos < leftmost) {
1957                                 DEBUG_r(accepted_word=word);
1958                                 leftmost= lpos;
1959                             }
1960                             if (base==0) break;
1961
1962                         }
1963                         points[pointpos++ % maxlen]= uc;
1964                         if (foldlen || uc < (U8*)strend) {
1965                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1966                                              widecharmap, uc,
1967                                              uscan, len, uvc, charid, foldlen,
1968                                              foldbuf, uniflags);
1969                             DEBUG_TRIE_EXECUTE_r({
1970                                 dump_exec_pos( (char *)uc, c, strend,
1971                                             real_start, s, utf8_target);
1972                                 PerlIO_printf(Perl_debug_log,
1973                                     " Charid:%3u CP:%4"UVxf" ",
1974                                      charid, uvc);
1975                             });
1976                         }
1977                         else {
1978                             len = 0;
1979                             charid = 0;
1980                         }
1981
1982
1983                         do {
1984 #ifdef DEBUGGING
1985                             word = aho->states[ state ].wordnum;
1986 #endif
1987                             base = aho->states[ state ].trans.base;
1988
1989                             DEBUG_TRIE_EXECUTE_r({
1990                                 if (failed)
1991                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1992                                         s,   utf8_target );
1993                                 PerlIO_printf( Perl_debug_log,
1994                                     "%sState: %4"UVxf", word=%"UVxf,
1995                                     failed ? " Fail transition to " : "",
1996                                     (UV)state, (UV)word);
1997                             });
1998                             if ( base ) {
1999                                 U32 tmp;
2000                                 I32 offset;
2001                                 if (charid &&
2002                                      ( ((offset = base + charid
2003                                         - 1 - trie->uniquecharcount)) >= 0)
2004                                      && ((U32)offset < trie->lasttrans)
2005                                      && trie->trans[offset].check == state
2006                                      && (tmp=trie->trans[offset].next))
2007                                 {
2008                                     DEBUG_TRIE_EXECUTE_r(
2009                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2010                                     state = tmp;
2011                                     break;
2012                                 }
2013                                 else {
2014                                     DEBUG_TRIE_EXECUTE_r(
2015                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2016                                     failed = 1;
2017                                     state = aho->fail[state];
2018                                 }
2019                             }
2020                             else {
2021                                 /* we must be accepting here */
2022                                 DEBUG_TRIE_EXECUTE_r(
2023                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2024                                 failed = 1;
2025                                 break;
2026                             }
2027                         } while(state);
2028                         uc += len;
2029                         if (failed) {
2030                             if (leftmost)
2031                                 break;
2032                             if (!state) state = 1;
2033                         }
2034                     }
2035                     if ( aho->states[ state ].wordnum ) {
2036                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2037                         if (!leftmost || lpos < leftmost) {
2038                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2039                             leftmost = lpos;
2040                         }
2041                     }
2042                     if (leftmost) {
2043                         s = (char*)leftmost;
2044                         DEBUG_TRIE_EXECUTE_r({
2045                             PerlIO_printf(
2046                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2047                                 (UV)accepted_word, (IV)(s - real_start)
2048                             );
2049                         });
2050                         if (!reginfo || regtry(reginfo, &s)) {
2051                             FREETMPS;
2052                             LEAVE;
2053                             goto got_it;
2054                         }
2055                         s = HOPc(s,1);
2056                         DEBUG_TRIE_EXECUTE_r({
2057                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2058                         });
2059                     } else {
2060                         DEBUG_TRIE_EXECUTE_r(
2061                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2062                         break;
2063                     }
2064                 }
2065                 FREETMPS;
2066                 LEAVE;
2067             }
2068             break;
2069         default:
2070             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2071             break;
2072         }
2073         return 0;
2074       got_it:
2075         return s;
2076 }
2077
2078
2079 /*
2080  - regexec_flags - match a regexp against a string
2081  */
2082 I32
2083 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2084               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2085 /* stringarg: the point in the string at which to begin matching */
2086 /* strend:    pointer to null at end of string */
2087 /* strbeg:    real beginning of string */
2088 /* minend:    end of match must be >= minend bytes after stringarg. */
2089 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2090  *            itself is accessed via the pointers above */
2091 /* data:      May be used for some additional optimizations.
2092               Currently its only used, with a U32 cast, for transmitting
2093               the ganch offset when doing a /g match. This will change */
2094 /* nosave:    For optimizations. */
2095
2096 {
2097     dVAR;
2098     struct regexp *const prog = ReANY(rx);
2099     /*register*/ char *s;
2100     regnode *c;
2101     /*register*/ char *startpos = stringarg;
2102     I32 minlen;         /* must match at least this many chars */
2103     I32 dontbother = 0; /* how many characters not to try at end */
2104     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2105     I32 scream_pos = -1;                /* Internal iterator of scream. */
2106     char *scream_olds = NULL;
2107     const bool utf8_target = cBOOL(DO_UTF8(sv));
2108     I32 multiline;
2109     RXi_GET_DECL(prog,progi);
2110     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2111     regexp_paren_pair *swap = NULL;
2112     GET_RE_DEBUG_FLAGS_DECL;
2113
2114     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2115     PERL_UNUSED_ARG(data);
2116
2117     /* Be paranoid... */
2118     if (prog == NULL || startpos == NULL) {
2119         Perl_croak(aTHX_ "NULL regexp parameter");
2120         return 0;
2121     }
2122
2123     multiline = prog->extflags & RXf_PMf_MULTILINE;
2124     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2125
2126     RX_MATCH_UTF8_set(rx, utf8_target);
2127     DEBUG_EXECUTE_r(
2128         debug_start_match(rx, utf8_target, startpos, strend,
2129         "Matching");
2130     );
2131
2132     minlen = prog->minlen;
2133
2134     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2135         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2136                               "String too short [regexec_flags]...\n"));
2137         goto phooey;
2138     }
2139
2140
2141     /* Check validity of program. */
2142     if (UCHARAT(progi->program) != REG_MAGIC) {
2143         Perl_croak(aTHX_ "corrupted regexp program");
2144     }
2145
2146     PL_reg_flags = 0;
2147     PL_reg_state.re_state_eval_setup_done = FALSE;
2148     PL_reg_maxiter = 0;
2149
2150     if (RX_UTF8(rx))
2151         PL_reg_flags |= RF_utf8;
2152
2153     /* Mark beginning of line for ^ and lookbehind. */
2154     reginfo.bol = startpos; /* XXX not used ??? */
2155     PL_bostr  = strbeg;
2156     reginfo.sv = sv;
2157
2158     /* Mark end of line for $ (and such) */
2159     PL_regeol = strend;
2160
2161     /* see how far we have to get to not match where we matched before */
2162     reginfo.till = startpos+minend;
2163
2164     /* If there is a "must appear" string, look for it. */
2165     s = startpos;
2166
2167     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2168         MAGIC *mg;
2169         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2170             reginfo.ganch = startpos + prog->gofs;
2171             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2172               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2173         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2174                   && SvMAGIC(sv)
2175                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2176                   && mg->mg_len >= 0) {
2177             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2178             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2179                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2180
2181             if (prog->extflags & RXf_ANCH_GPOS) {
2182                 if (s > reginfo.ganch)
2183                     goto phooey;
2184                 s = reginfo.ganch - prog->gofs;
2185                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2186                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2187                 if (s < strbeg)
2188                     goto phooey;
2189             }
2190         }
2191         else if (data) {
2192             reginfo.ganch = strbeg + PTR2UV(data);
2193             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2195
2196         } else {                                /* pos() not defined */
2197             reginfo.ganch = strbeg;
2198             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2199                  "GPOS: reginfo.ganch = strbeg\n"));
2200         }
2201     }
2202     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2203         /* We have to be careful. If the previous successful match
2204            was from this regex we don't want a subsequent partially
2205            successful match to clobber the old results.
2206            So when we detect this possibility we add a swap buffer
2207            to the re, and switch the buffer each match. If we fail
2208            we switch it back, otherwise we leave it swapped.
2209         */
2210         swap = prog->offs;
2211         /* do we need a save destructor here for eval dies? */
2212         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2213         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2214             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2215             PTR2UV(prog),
2216             PTR2UV(swap),
2217             PTR2UV(prog->offs)
2218         ));
2219     }
2220     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2221         re_scream_pos_data d;
2222
2223         d.scream_olds = &scream_olds;
2224         d.scream_pos = &scream_pos;
2225         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2226         if (!s) {
2227             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2228             goto phooey;        /* not present */
2229         }
2230     }
2231
2232
2233
2234     /* Simplest case:  anchored match need be tried only once. */
2235     /*  [unless only anchor is BOL and multiline is set] */
2236     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2237         if (s == startpos && regtry(&reginfo, &startpos))
2238             goto got_it;
2239         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2240                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2241         {
2242             char *end;
2243
2244             if (minlen)
2245                 dontbother = minlen - 1;
2246             end = HOP3c(strend, -dontbother, strbeg) - 1;
2247             /* for multiline we only have to try after newlines */
2248             if (prog->check_substr || prog->check_utf8) {
2249                 /* because of the goto we can not easily reuse the macros for bifurcating the
2250                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2251                 if (utf8_target) {
2252                     if (s == startpos)
2253                         goto after_try_utf8;
2254                     while (1) {
2255                         if (regtry(&reginfo, &s)) {
2256                             goto got_it;
2257                         }
2258                       after_try_utf8:
2259                         if (s > end) {
2260                             goto phooey;
2261                         }
2262                         if (prog->extflags & RXf_USE_INTUIT) {
2263                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2264                             if (!s) {
2265                                 goto phooey;
2266                             }
2267                         }
2268                         else {
2269                             s += UTF8SKIP(s);
2270                         }
2271                     }
2272                 } /* end search for check string in unicode */
2273                 else {
2274                     if (s == startpos) {
2275                         goto after_try_latin;
2276                     }
2277                     while (1) {
2278                         if (regtry(&reginfo, &s)) {
2279                             goto got_it;
2280                         }
2281                       after_try_latin:
2282                         if (s > end) {
2283                             goto phooey;
2284                         }
2285                         if (prog->extflags & RXf_USE_INTUIT) {
2286                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2287                             if (!s) {
2288                                 goto phooey;
2289                             }
2290                         }
2291                         else {
2292                             s++;
2293                         }
2294                     }
2295                 } /* end search for check string in latin*/
2296             } /* end search for check string */
2297             else { /* search for newline */
2298                 if (s > startpos) {
2299                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2300                     s--;
2301                 }
2302                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2303                 while (s <= end) { /* note it could be possible to match at the end of the string */
2304                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2305                         if (regtry(&reginfo, &s))
2306                             goto got_it;
2307                     }
2308                 }
2309             } /* end search for newline */
2310         } /* end anchored/multiline check string search */
2311         goto phooey;
2312     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2313     {
2314         /* the warning about reginfo.ganch being used without initialization
2315            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2316            and we only enter this block when the same bit is set. */
2317         char *tmp_s = reginfo.ganch - prog->gofs;
2318
2319         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2320             goto got_it;
2321         goto phooey;
2322     }
2323
2324     /* Messy cases:  unanchored match. */
2325     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2326         /* we have /x+whatever/ */
2327         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2328         char ch;
2329 #ifdef DEBUGGING
2330         int did_match = 0;
2331 #endif
2332         if (utf8_target) {
2333             if (! prog->anchored_utf8) {
2334                 to_utf8_substr(prog);
2335             }
2336             ch = SvPVX_const(prog->anchored_utf8)[0];
2337             REXEC_FBC_SCAN(
2338                 if (*s == ch) {
2339                     DEBUG_EXECUTE_r( did_match = 1 );
2340                     if (regtry(&reginfo, &s)) goto got_it;
2341                     s += UTF8SKIP(s);
2342                     while (s < strend && *s == ch)
2343                         s += UTF8SKIP(s);
2344                 }
2345             );
2346
2347         }
2348         else {
2349             if (! prog->anchored_substr) {
2350                 if (! to_byte_substr(prog)) {
2351                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2352                 }
2353             }
2354             ch = SvPVX_const(prog->anchored_substr)[0];
2355             REXEC_FBC_SCAN(
2356                 if (*s == ch) {
2357                     DEBUG_EXECUTE_r( did_match = 1 );
2358                     if (regtry(&reginfo, &s)) goto got_it;
2359                     s++;
2360                     while (s < strend && *s == ch)
2361                         s++;
2362                 }
2363             );
2364         }
2365         DEBUG_EXECUTE_r(if (!did_match)
2366                 PerlIO_printf(Perl_debug_log,
2367                                   "Did not find anchored character...\n")
2368                );
2369     }
2370     else if (prog->anchored_substr != NULL
2371               || prog->anchored_utf8 != NULL
2372               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2373                   && prog->float_max_offset < strend - s)) {
2374         SV *must;
2375         I32 back_max;
2376         I32 back_min;
2377         char *last;
2378         char *last1;            /* Last position checked before */
2379 #ifdef DEBUGGING
2380         int did_match = 0;
2381 #endif
2382         if (prog->anchored_substr || prog->anchored_utf8) {
2383             if (utf8_target) {
2384                 if (! prog->anchored_utf8) {
2385                     to_utf8_substr(prog);
2386                 }
2387                 must = prog->anchored_utf8;
2388             }
2389             else {
2390                 if (! prog->anchored_substr) {
2391                     if (! to_byte_substr(prog)) {
2392                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2393                     }
2394                 }
2395                 must = prog->anchored_substr;
2396             }
2397             back_max = back_min = prog->anchored_offset;
2398         } else {
2399             if (utf8_target) {
2400                 if (! prog->float_utf8) {
2401                     to_utf8_substr(prog);
2402                 }
2403                 must = prog->float_utf8;
2404             }
2405             else {
2406                 if (! prog->float_substr) {
2407                     if (! to_byte_substr(prog)) {
2408                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2409                     }
2410                 }
2411                 must = prog->float_substr;
2412             }
2413             back_max = prog->float_max_offset;
2414             back_min = prog->float_min_offset;
2415         }
2416
2417         if (back_min<0) {
2418             last = strend;
2419         } else {
2420             last = HOP3c(strend,        /* Cannot start after this */
2421                   -(I32)(CHR_SVLEN(must)
2422                          - (SvTAIL(must) != 0) + back_min), strbeg);
2423         }
2424         if (s > PL_bostr)
2425             last1 = HOPc(s, -1);
2426         else
2427             last1 = s - 1;      /* bogus */
2428
2429         /* XXXX check_substr already used to find "s", can optimize if
2430            check_substr==must. */
2431         scream_pos = -1;
2432         dontbother = end_shift;
2433         strend = HOPc(strend, -dontbother);
2434         while ( (s <= last) &&
2435                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2436                                   (unsigned char*)strend, must,
2437                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2438             DEBUG_EXECUTE_r( did_match = 1 );
2439             if (HOPc(s, -back_max) > last1) {
2440                 last1 = HOPc(s, -back_min);
2441                 s = HOPc(s, -back_max);
2442             }
2443             else {
2444                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2445
2446                 last1 = HOPc(s, -back_min);
2447                 s = t;
2448             }
2449             if (utf8_target) {
2450                 while (s <= last1) {
2451                     if (regtry(&reginfo, &s))
2452                         goto got_it;
2453                     if (s >= last1) {
2454                         s++; /* to break out of outer loop */
2455                         break;
2456                     }
2457                     s += UTF8SKIP(s);
2458                 }
2459             }
2460             else {
2461                 while (s <= last1) {
2462                     if (regtry(&reginfo, &s))
2463                         goto got_it;
2464                     s++;
2465                 }
2466             }
2467         }
2468         DEBUG_EXECUTE_r(if (!did_match) {
2469             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2470                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2472                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2473                                ? "anchored" : "floating"),
2474                 quoted, RE_SV_TAIL(must));
2475         });
2476         goto phooey;
2477     }
2478     else if ( (c = progi->regstclass) ) {
2479         if (minlen) {
2480             const OPCODE op = OP(progi->regstclass);
2481             /* don't bother with what can't match */
2482             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2483                 strend = HOPc(strend, -(minlen - 1));
2484         }
2485         DEBUG_EXECUTE_r({
2486             SV * const prop = sv_newmortal();
2487             regprop(prog, prop, c);
2488             {
2489                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2490                     s,strend-s,60);
2491                 PerlIO_printf(Perl_debug_log,
2492                     "Matching stclass %.*s against %s (%d bytes)\n",
2493                     (int)SvCUR(prop), SvPVX_const(prop),
2494                      quoted, (int)(strend - s));
2495             }
2496         });
2497         if (find_byclass(prog, c, s, strend, &reginfo))
2498             goto got_it;
2499         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2500     }
2501     else {
2502         dontbother = 0;
2503         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2504             /* Trim the end. */
2505             char *last= NULL;
2506             SV* float_real;
2507             STRLEN len;
2508             const char *little;
2509
2510             if (utf8_target) {
2511                 if (! prog->float_utf8) {
2512                     to_utf8_substr(prog);
2513                 }
2514                 float_real = prog->float_utf8;
2515             }
2516             else {
2517                 if (! prog->float_substr) {
2518                     if (! to_byte_substr(prog)) {
2519                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2520                     }
2521                 }
2522                 float_real = prog->float_substr;
2523             }
2524
2525             little = SvPV_const(float_real, len);
2526             if (SvTAIL(float_real)) {
2527                     /* This means that float_real contains an artificial \n on
2528                      * the end due to the presence of something like this:
2529                      * /foo$/ where we can match both "foo" and "foo\n" at the
2530                      * end of the string.  So we have to compare the end of the
2531                      * string first against the float_real without the \n and
2532                      * then against the full float_real with the string.  We
2533                      * have to watch out for cases where the string might be
2534                      * smaller than the float_real or the float_real without
2535                      * the \n. */
2536                     char *checkpos= strend - len;
2537                     DEBUG_OPTIMISE_r(
2538                         PerlIO_printf(Perl_debug_log,
2539                             "%sChecking for float_real.%s\n",
2540                             PL_colors[4], PL_colors[5]));
2541                     if (checkpos + 1 < strbeg) {
2542                         /* can't match, even if we remove the trailing \n
2543                          * string is too short to match */
2544                         DEBUG_EXECUTE_r(
2545                             PerlIO_printf(Perl_debug_log,
2546                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2547                                 PL_colors[4], PL_colors[5]));
2548                         goto phooey;
2549                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2550                         /* can match, the end of the string matches without the
2551                          * "\n" */
2552                         last = checkpos + 1;
2553                     } else if (checkpos < strbeg) {
2554                         /* cant match, string is too short when the "\n" is
2555                          * included */
2556                         DEBUG_EXECUTE_r(
2557                             PerlIO_printf(Perl_debug_log,
2558                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2559                                 PL_colors[4], PL_colors[5]));
2560                         goto phooey;
2561                     } else if (!multiline) {
2562                         /* non multiline match, so compare with the "\n" at the
2563                          * end of the string */
2564                         if (memEQ(checkpos, little, len)) {
2565                             last= checkpos;
2566                         } else {
2567                             DEBUG_EXECUTE_r(
2568                                 PerlIO_printf(Perl_debug_log,
2569                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2570                                     PL_colors[4], PL_colors[5]));
2571                             goto phooey;
2572                         }
2573                     } else {
2574                         /* multiline match, so we have to search for a place
2575                          * where the full string is located */
2576                         goto find_last;
2577                     }
2578             } else {
2579                   find_last:
2580                     if (len)
2581                         last = rninstr(s, strend, little, little + len);
2582                     else
2583                         last = strend;  /* matching "$" */
2584             }
2585             if (!last) {
2586                 /* at one point this block contained a comment which was
2587                  * probably incorrect, which said that this was a "should not
2588                  * happen" case.  Even if it was true when it was written I am
2589                  * pretty sure it is not anymore, so I have removed the comment
2590                  * and replaced it with this one. Yves */
2591                 DEBUG_EXECUTE_r(
2592                     PerlIO_printf(Perl_debug_log,
2593                         "String does not contain required substring, cannot match.\n"
2594                     ));
2595                 goto phooey;
2596             }
2597             dontbother = strend - last + prog->float_min_offset;
2598         }
2599         if (minlen && (dontbother < minlen))
2600             dontbother = minlen - 1;
2601         strend -= dontbother;              /* this one's always in bytes! */
2602         /* We don't know much -- general case. */
2603         if (utf8_target) {
2604             for (;;) {
2605                 if (regtry(&reginfo, &s))
2606                     goto got_it;
2607                 if (s >= strend)
2608                     break;
2609                 s += UTF8SKIP(s);
2610             };
2611         }
2612         else {
2613             do {
2614                 if (regtry(&reginfo, &s))
2615                     goto got_it;
2616             } while (s++ < strend);
2617         }
2618     }
2619
2620     /* Failure. */
2621     goto phooey;
2622
2623 got_it:
2624     DEBUG_BUFFERS_r(
2625         if (swap)
2626             PerlIO_printf(Perl_debug_log,
2627                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2628                 PTR2UV(prog),
2629                 PTR2UV(swap)
2630             );
2631     );
2632     Safefree(swap);
2633     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2634
2635     if (PL_reg_state.re_state_eval_setup_done)
2636         restore_pos(aTHX_ prog);
2637     if (RXp_PAREN_NAMES(prog))
2638         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2639
2640     /* make sure $`, $&, $', and $digit will work later */
2641     if ( !(flags & REXEC_NOT_FIRST) ) {
2642         if (flags & REXEC_COPY_STR) {
2643 #ifdef PERL_OLD_COPY_ON_WRITE
2644             if ((SvIsCOW(sv)
2645                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2646                 if (DEBUG_C_TEST) {
2647                     PerlIO_printf(Perl_debug_log,
2648                                   "Copy on write: regexp capture, type %d\n",
2649                                   (int) SvTYPE(sv));
2650                 }
2651                 RX_MATCH_COPY_FREE(rx);
2652                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2653                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2654                 assert (SvPOKp(prog->saved_copy));
2655                 prog->sublen  = PL_regeol - strbeg;
2656                 prog->suboffset = 0;
2657                 prog->subcoffset = 0;
2658             } else
2659 #endif
2660             {
2661                 I32 min = 0;
2662                 I32 max = PL_regeol - strbeg;
2663                 I32 sublen;
2664
2665                 if (    (flags & REXEC_COPY_SKIP_POST)
2666                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2667                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2668                 ) { /* don't copy $' part of string */
2669                     U32 n = 0;
2670                     max = -1;
2671                     /* calculate the right-most part of the string covered
2672                      * by a capture. Due to look-ahead, this may be to
2673                      * the right of $&, so we have to scan all captures */
2674                     while (n <= prog->lastparen) {
2675                         if (prog->offs[n].end > max)
2676                             max = prog->offs[n].end;
2677                         n++;
2678                     }
2679                     if (max == -1)
2680                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2681                                 ? prog->offs[0].start
2682                                 : 0;
2683                     assert(max >= 0 && max <= PL_regeol - strbeg);
2684                 }
2685
2686                 if (    (flags & REXEC_COPY_SKIP_PRE)
2687                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2688                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2689                 ) { /* don't copy $` part of string */
2690                     U32 n = 0;
2691                     min = max;
2692                     /* calculate the left-most part of the string covered
2693                      * by a capture. Due to look-behind, this may be to
2694                      * the left of $&, so we have to scan all captures */
2695                     while (min && n <= prog->lastparen) {
2696                         if (   prog->offs[n].start != -1
2697                             && prog->offs[n].start < min)
2698                         {
2699                             min = prog->offs[n].start;
2700                         }
2701                         n++;
2702                     }
2703                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2704                         && min >  prog->offs[0].end
2705                     )
2706                         min = prog->offs[0].end;
2707
2708                 }
2709
2710                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2711                 sublen = max - min;
2712
2713                 if (RX_MATCH_COPIED(rx)) {
2714                     if (sublen > prog->sublen)
2715                         prog->subbeg =
2716                                 (char*)saferealloc(prog->subbeg, sublen+1);
2717                 }
2718                 else
2719                     prog->subbeg = (char*)safemalloc(sublen+1);
2720                 Copy(strbeg + min, prog->subbeg, sublen, char);
2721                 prog->subbeg[sublen] = '\0';
2722                 prog->suboffset = min;
2723                 prog->sublen = sublen;
2724                 RX_MATCH_COPIED_on(rx);
2725             }
2726             prog->subcoffset = prog->suboffset;
2727             if (prog->suboffset && utf8_target) {
2728                 /* Convert byte offset to chars.
2729                  * XXX ideally should only compute this if @-/@+
2730                  * has been seen, a la PL_sawampersand ??? */
2731
2732                 /* If there's a direct correspondence between the
2733                  * string which we're matching and the original SV,
2734                  * then we can use the utf8 len cache associated with
2735                  * the SV. In particular, it means that under //g,
2736                  * sv_pos_b2u() will use the previously cached
2737                  * position to speed up working out the new length of
2738                  * subcoffset, rather than counting from the start of
2739                  * the string each time. This stops
2740                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2741                  * from going quadratic */
2742                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2743                     sv_pos_b2u(sv, &(prog->subcoffset));
2744                 else
2745                     prog->subcoffset = utf8_length((U8*)strbeg,
2746                                         (U8*)(strbeg+prog->suboffset));
2747             }
2748         }
2749         else {
2750             RX_MATCH_COPY_FREE(rx);
2751             prog->subbeg = strbeg;
2752             prog->suboffset = 0;
2753             prog->subcoffset = 0;
2754             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2755         }
2756     }
2757
2758     return 1;
2759
2760 phooey:
2761     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2762                           PL_colors[4], PL_colors[5]));
2763     if (PL_reg_state.re_state_eval_setup_done)
2764         restore_pos(aTHX_ prog);
2765     if (swap) {
2766         /* we failed :-( roll it back */
2767         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2768             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2769             PTR2UV(prog),
2770             PTR2UV(prog->offs),
2771             PTR2UV(swap)
2772         ));
2773         Safefree(prog->offs);
2774         prog->offs = swap;
2775     }
2776     return 0;
2777 }
2778
2779
2780 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2781  * Do inc before dec, in case old and new rex are the same */
2782 #define SET_reg_curpm(Re2) \
2783     if (PL_reg_state.re_state_eval_setup_done) {    \
2784         (void)ReREFCNT_inc(Re2);                    \
2785         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2786         PM_SETRE((PL_reg_curpm), (Re2));            \
2787     }
2788
2789
2790 /*
2791  - regtry - try match at specific point
2792  */
2793 STATIC I32                      /* 0 failure, 1 success */
2794 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2795 {
2796     dVAR;
2797     CHECKPOINT lastcp;
2798     REGEXP *const rx = reginfo->prog;
2799     regexp *const prog = ReANY(rx);
2800     I32 result;
2801     RXi_GET_DECL(prog,progi);
2802     GET_RE_DEBUG_FLAGS_DECL;
2803
2804     PERL_ARGS_ASSERT_REGTRY;
2805
2806     reginfo->cutpoint=NULL;
2807
2808     if ((prog->extflags & RXf_EVAL_SEEN)
2809         && !PL_reg_state.re_state_eval_setup_done)
2810     {
2811         MAGIC *mg;
2812
2813         PL_reg_state.re_state_eval_setup_done = TRUE;
2814         if (reginfo->sv) {
2815             /* Make $_ available to executed code. */
2816             if (reginfo->sv != DEFSV) {
2817                 SAVE_DEFSV;
2818                 DEFSV_set(reginfo->sv);
2819             }
2820
2821             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2822                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2823                 /* prepare for quick setting of pos */
2824 #ifdef PERL_OLD_COPY_ON_WRITE
2825                 if (SvIsCOW(reginfo->sv))
2826                     sv_force_normal_flags(reginfo->sv, 0);
2827 #endif
2828                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2829                                  &PL_vtbl_mglob, NULL, 0);
2830                 mg->mg_len = -1;
2831             }
2832             PL_reg_magic    = mg;
2833             PL_reg_oldpos   = mg->mg_len;
2834             SAVEDESTRUCTOR_X(restore_pos, prog);
2835         }
2836         if (!PL_reg_curpm) {
2837             Newxz(PL_reg_curpm, 1, PMOP);
2838 #ifdef USE_ITHREADS
2839             {
2840                 SV* const repointer = &PL_sv_undef;
2841                 /* this regexp is also owned by the new PL_reg_curpm, which
2842                    will try to free it.  */
2843                 av_push(PL_regex_padav, repointer);
2844                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2845                 PL_regex_pad = AvARRAY(PL_regex_padav);
2846             }
2847 #endif
2848         }
2849         SET_reg_curpm(rx);
2850         PL_reg_oldcurpm = PL_curpm;
2851         PL_curpm = PL_reg_curpm;
2852         if (RXp_MATCH_COPIED(prog)) {
2853             /*  Here is a serious problem: we cannot rewrite subbeg,
2854                 since it may be needed if this match fails.  Thus
2855                 $` inside (?{}) could fail... */
2856             PL_reg_oldsaved = prog->subbeg;
2857             PL_reg_oldsavedlen = prog->sublen;
2858             PL_reg_oldsavedoffset = prog->suboffset;
2859             PL_reg_oldsavedcoffset = prog->suboffset;
2860 #ifdef PERL_OLD_COPY_ON_WRITE
2861             PL_nrs = prog->saved_copy;
2862 #endif
2863             RXp_MATCH_COPIED_off(prog);
2864         }
2865         else
2866             PL_reg_oldsaved = NULL;
2867         prog->subbeg = PL_bostr;
2868         prog->suboffset = 0;
2869         prog->subcoffset = 0;
2870         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2871     }
2872 #ifdef DEBUGGING
2873     PL_reg_starttry = *startposp;
2874 #endif
2875     prog->offs[0].start = *startposp - PL_bostr;
2876     prog->lastparen = 0;
2877     prog->lastcloseparen = 0;
2878     PL_regsize = 0;
2879
2880     /* XXXX What this code is doing here?!!!  There should be no need
2881        to do this again and again, prog->lastparen should take care of
2882        this!  --ilya*/
2883
2884     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2885      * Actually, the code in regcppop() (which Ilya may be meaning by
2886      * prog->lastparen), is not needed at all by the test suite
2887      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2888      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2889      * Meanwhile, this code *is* needed for the
2890      * above-mentioned test suite tests to succeed.  The common theme
2891      * on those tests seems to be returning null fields from matches.
2892      * --jhi updated by dapm */
2893 #if 1
2894     if (prog->nparens) {
2895         regexp_paren_pair *pp = prog->offs;
2896         I32 i;
2897         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2898             ++pp;
2899             pp->start = -1;
2900             pp->end = -1;
2901         }
2902     }
2903 #endif
2904     REGCP_SET(lastcp);
2905     result = regmatch(reginfo, *startposp, progi->program + 1);
2906     if (result != -1) {
2907         prog->offs[0].end = result;
2908         return 1;
2909     }
2910     if (reginfo->cutpoint)
2911         *startposp= reginfo->cutpoint;
2912     REGCP_UNWIND(lastcp);
2913     return 0;
2914 }
2915
2916
2917 #define sayYES goto yes
2918 #define sayNO goto no
2919 #define sayNO_SILENT goto no_silent
2920
2921 /* we dont use STMT_START/END here because it leads to
2922    "unreachable code" warnings, which are bogus, but distracting. */
2923 #define CACHEsayNO \
2924     if (ST.cache_mask) \
2925        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2926     sayNO
2927
2928 /* this is used to determine how far from the left messages like
2929    'failed...' are printed. It should be set such that messages
2930    are inline with the regop output that created them.
2931 */
2932 #define REPORT_CODE_OFF 32
2933
2934
2935 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2936 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2937 #define CHRTEST_NOT_A_CP_1 -999
2938 #define CHRTEST_NOT_A_CP_2 -998
2939
2940 #define SLAB_FIRST(s) (&(s)->states[0])
2941 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2942
2943 /* grab a new slab and return the first slot in it */
2944
2945 STATIC regmatch_state *
2946 S_push_slab(pTHX)
2947 {
2948 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2949     dMY_CXT;
2950 #endif
2951     regmatch_slab *s = PL_regmatch_slab->next;
2952     if (!s) {
2953         Newx(s, 1, regmatch_slab);
2954         s->prev = PL_regmatch_slab;
2955         s->next = NULL;
2956         PL_regmatch_slab->next = s;
2957     }
2958     PL_regmatch_slab = s;
2959     return SLAB_FIRST(s);
2960 }
2961
2962
2963 /* push a new state then goto it */
2964
2965 #define PUSH_STATE_GOTO(state, node, input) \
2966     pushinput = input; \
2967     scan = node; \
2968     st->resume_state = state; \
2969     goto push_state;
2970
2971 /* push a new state with success backtracking, then goto it */
2972
2973 #define PUSH_YES_STATE_GOTO(state, node, input) \
2974     pushinput = input; \
2975     scan = node; \
2976     st->resume_state = state; \
2977     goto push_yes_state;
2978
2979
2980
2981
2982 /*
2983
2984 regmatch() - main matching routine
2985
2986 This is basically one big switch statement in a loop. We execute an op,
2987 set 'next' to point the next op, and continue. If we come to a point which
2988 we may need to backtrack to on failure such as (A|B|C), we push a
2989 backtrack state onto the backtrack stack. On failure, we pop the top
2990 state, and re-enter the loop at the state indicated. If there are no more
2991 states to pop, we return failure.
2992
2993 Sometimes we also need to backtrack on success; for example /A+/, where
2994 after successfully matching one A, we need to go back and try to
2995 match another one; similarly for lookahead assertions: if the assertion
2996 completes successfully, we backtrack to the state just before the assertion
2997 and then carry on.  In these cases, the pushed state is marked as
2998 'backtrack on success too'. This marking is in fact done by a chain of
2999 pointers, each pointing to the previous 'yes' state. On success, we pop to
3000 the nearest yes state, discarding any intermediate failure-only states.
3001 Sometimes a yes state is pushed just to force some cleanup code to be
3002 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3003 it to free the inner regex.
3004
3005 Note that failure backtracking rewinds the cursor position, while
3006 success backtracking leaves it alone.
3007
3008 A pattern is complete when the END op is executed, while a subpattern
3009 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3010 ops trigger the "pop to last yes state if any, otherwise return true"
3011 behaviour.
3012
3013 A common convention in this function is to use A and B to refer to the two
3014 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3015 the subpattern to be matched possibly multiple times, while B is the entire
3016 rest of the pattern. Variable and state names reflect this convention.
3017
3018 The states in the main switch are the union of ops and failure/success of
3019 substates associated with with that op.  For example, IFMATCH is the op
3020 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3021 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3022 successfully matched A and IFMATCH_A_fail is a state saying that we have
3023 just failed to match A. Resume states always come in pairs. The backtrack
3024 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3025 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3026 on success or failure.
3027
3028 The struct that holds a backtracking state is actually a big union, with
3029 one variant for each major type of op. The variable st points to the
3030 top-most backtrack struct. To make the code clearer, within each
3031 block of code we #define ST to alias the relevant union.
3032
3033 Here's a concrete example of a (vastly oversimplified) IFMATCH
3034 implementation:
3035
3036     switch (state) {
3037     ....
3038
3039 #define ST st->u.ifmatch
3040
3041     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3042         ST.foo = ...; // some state we wish to save
3043         ...
3044         // push a yes backtrack state with a resume value of
3045         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3046         // first node of A:
3047         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3048         // NOTREACHED
3049
3050     case IFMATCH_A: // we have successfully executed A; now continue with B
3051         next = B;
3052         bar = ST.foo; // do something with the preserved value
3053         break;
3054
3055     case IFMATCH_A_fail: // A failed, so the assertion failed
3056         ...;   // do some housekeeping, then ...
3057         sayNO; // propagate the failure
3058
3059 #undef ST
3060
3061     ...
3062     }
3063
3064 For any old-timers reading this who are familiar with the old recursive
3065 approach, the code above is equivalent to:
3066
3067     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3068     {
3069         int foo = ...
3070         ...
3071         if (regmatch(A)) {
3072             next = B;
3073             bar = foo;
3074             break;
3075         }
3076         ...;   // do some housekeeping, then ...
3077         sayNO; // propagate the failure
3078     }
3079
3080 The topmost backtrack state, pointed to by st, is usually free. If you
3081 want to claim it, populate any ST.foo fields in it with values you wish to
3082 save, then do one of
3083
3084         PUSH_STATE_GOTO(resume_state, node, newinput);
3085         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3086
3087 which sets that backtrack state's resume value to 'resume_state', pushes a
3088 new free entry to the top of the backtrack stack, then goes to 'node'.
3089 On backtracking, the free slot is popped, and the saved state becomes the
3090 new free state. An ST.foo field in this new top state can be temporarily
3091 accessed to retrieve values, but once the main loop is re-entered, it
3092 becomes available for reuse.
3093
3094 Note that the depth of the backtrack stack constantly increases during the
3095 left-to-right execution of the pattern, rather than going up and down with
3096 the pattern nesting. For example the stack is at its maximum at Z at the
3097 end of the pattern, rather than at X in the following:
3098
3099     /(((X)+)+)+....(Y)+....Z/
3100
3101 The only exceptions to this are lookahead/behind assertions and the cut,
3102 (?>A), which pop all the backtrack states associated with A before
3103 continuing.
3104
3105 Backtrack state structs are allocated in slabs of about 4K in size.
3106 PL_regmatch_state and st always point to the currently active state,
3107 and PL_regmatch_slab points to the slab currently containing
3108 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3109 allocated, and is never freed until interpreter destruction. When the slab
3110 is full, a new one is allocated and chained to the end. At exit from
3111 regmatch(), slabs allocated since entry are freed.
3112
3113 */
3114
3115
3116 #define DEBUG_STATE_pp(pp)                                  \
3117     DEBUG_STATE_r({                                         \
3118         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3119         PerlIO_printf(Perl_debug_log,                       \
3120             "    %*s"pp" %s%s%s%s%s\n",                     \
3121             depth*2, "",                                    \
3122             PL_reg_name[st->resume_state],                     \
3123             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3124             ((st==yes_state) ? "Y" : ""),                   \
3125             ((st==mark_state) ? "M" : ""),                  \
3126             ((st==yes_state||st==mark_state) ? "]" : "")    \
3127         );                                                  \
3128     });
3129
3130
3131 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3132
3133 #ifdef DEBUGGING
3134
3135 STATIC void
3136 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3137     const char *start, const char *end, const char *blurb)
3138 {
3139     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3140
3141     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3142
3143     if (!PL_colorset)
3144             reginitcolors();
3145     {
3146         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3147             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3148
3149         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3150             start, end - start, 60);
3151
3152         PerlIO_printf(Perl_debug_log,
3153             "%s%s REx%s %s against %s\n",
3154                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3155
3156         if (utf8_target||utf8_pat)
3157             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3158                 utf8_pat ? "pattern" : "",
3159                 utf8_pat && utf8_target ? " and " : "",
3160                 utf8_target ? "string" : ""
3161             );
3162     }
3163 }
3164
3165 STATIC void
3166 S_dump_exec_pos(pTHX_ const char *locinput,
3167                       const regnode *scan,
3168                       const char *loc_regeol,
3169                       const char *loc_bostr,
3170                       const char *loc_reg_starttry,
3171                       const bool utf8_target)
3172 {
3173     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3174     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3175     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3176     /* The part of the string before starttry has one color
3177        (pref0_len chars), between starttry and current
3178        position another one (pref_len - pref0_len chars),
3179        after the current position the third one.
3180        We assume that pref0_len <= pref_len, otherwise we
3181        decrease pref0_len.  */
3182     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3183         ? (5 + taill) - l : locinput - loc_bostr;
3184     int pref0_len;
3185
3186     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3187
3188     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3189         pref_len++;
3190     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3191     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3192         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3193               ? (5 + taill) - pref_len : loc_regeol - locinput);
3194     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3195         l--;
3196     if (pref0_len < 0)
3197         pref0_len = 0;
3198     if (pref0_len > pref_len)
3199         pref0_len = pref_len;
3200     {
3201         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3202
3203         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3204             (locinput - pref_len),pref0_len, 60, 4, 5);
3205
3206         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3207                     (locinput - pref_len + pref0_len),
3208                     pref_len - pref0_len, 60, 2, 3);
3209
3210         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3211                     locinput, loc_regeol - locinput, 10, 0, 1);
3212
3213         const STRLEN tlen=len0+len1+len2;
3214         PerlIO_printf(Perl_debug_log,
3215                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3216                     (IV)(locinput - loc_bostr),
3217                     len0, s0,
3218                     len1, s1,
3219                     (docolor ? "" : "> <"),
3220                     len2, s2,
3221                     (int)(tlen > 19 ? 0 :  19 - tlen),
3222                     "");
3223     }
3224 }
3225
3226 #endif
3227
3228 /* reg_check_named_buff_matched()
3229  * Checks to see if a named buffer has matched. The data array of
3230  * buffer numbers corresponding to the buffer is expected to reside
3231  * in the regexp->data->data array in the slot stored in the ARG() of
3232  * node involved. Note that this routine doesn't actually care about the
3233  * name, that information is not preserved from compilation to execution.
3234  * Returns the index of the leftmost defined buffer with the given name
3235  * or 0 if non of the buffers matched.
3236  */
3237 STATIC I32
3238 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3239 {
3240     I32 n;
3241     RXi_GET_DECL(rex,rexi);
3242     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3243     I32 *nums=(I32*)SvPVX(sv_dat);
3244
3245     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3246
3247     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3248         if ((I32)rex->lastparen >= nums[n] &&
3249             rex->offs[nums[n]].end != -1)
3250         {
3251             return nums[n];
3252         }
3253     }
3254     return 0;
3255 }
3256
3257
3258 /* free all slabs above current one  - called during LEAVE_SCOPE */
3259
3260 STATIC void
3261 S_clear_backtrack_stack(pTHX_ void *p)
3262 {
3263     regmatch_slab *s = PL_regmatch_slab->next;
3264     PERL_UNUSED_ARG(p);
3265
3266     if (!s)
3267         return;
3268     PL_regmatch_slab->next = NULL;
3269     while (s) {
3270         regmatch_slab * const osl = s;
3271         s = s->next;
3272         Safefree(osl);
3273     }
3274 }
3275 static bool
3276 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3277 {
3278     /* This function determines if there are one or two characters that match
3279      * the first character of the passed-in EXACTish node <text_node>, and if
3280      * so, returns them in the passed-in pointers.
3281      *
3282      * If it determines that no possible character in the target string can
3283      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3284      * the first character in <text_node> requires UTF-8 to represent, and the
3285      * target string isn't in UTF-8.)
3286      *
3287      * If there are more than two characters that could match the beginning of
3288      * <text_node>, or if more context is required to determine a match or not,
3289      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3290      *
3291      * The motiviation behind this function is to allow the caller to set up
3292      * tight loops for matching.  If <text_node> is of type EXACT, there is
3293      * only one possible character that can match its first character, and so
3294      * the situation is quite simple.  But things get much more complicated if
3295      * folding is involved.  It may be that the first character of an EXACTFish
3296      * node doesn't participate in any possible fold, e.g., punctuation, so it
3297      * can be matched only by itself.  The vast majority of characters that are
3298      * in folds match just two things, their lower and upper-case equivalents.
3299      * But not all are like that; some have multiple possible matches, or match
3300      * sequences of more than one character.  This function sorts all that out.
3301      *
3302      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3303      * loop of trying to match A*, we know we can't exit where the thing
3304      * following it isn't a B.  And something can't be a B unless it is the
3305      * beginning of B.  By putting a quick test for that beginning in a tight
3306      * loop, we can rule out things that can't possibly be B without having to
3307      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3308      * character, we can make a tight loop matching A*, using the outputs of
3309      * this function.
3310      *
3311      * If the target string to match isn't in UTF-8, and there aren't
3312      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3313      * the one or two possible octets (which are characters in this situation)
3314      * that can match.  In all cases, if there is only one character that can
3315      * match, *<c1p> and *<c2p> will be identical.
3316      *
3317      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3318      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3319      * can match the beginning of <text_node>.  They should be declared with at
3320      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3321      * undefined what these contain.)  If one or both of the buffers are
3322      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3323      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3324      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3325      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3326      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3327
3328     const bool utf8_target = PL_reg_match_utf8;
3329
3330     UV c1 = CHRTEST_NOT_A_CP_1;
3331     UV c2 = CHRTEST_NOT_A_CP_2;
3332     bool use_chrtest_void = FALSE;
3333
3334     /* Used when we have both utf8 input and utf8 output, to avoid converting
3335      * to/from code points */
3336     bool utf8_has_been_setup = FALSE;
3337
3338     dVAR;
3339
3340     U8 *pat = (U8*)STRING(text_node);
3341
3342     if (OP(text_node) == EXACT) {
3343
3344         /* In an exact node, only one thing can be matched, that first
3345          * character.  If both the pat and the target are UTF-8, we can just
3346          * copy the input to the output, avoiding finding the code point of
3347          * that character */
3348         if (! UTF_PATTERN) {
3349             c2 = c1 = *pat;
3350         }
3351         else if (utf8_target) {
3352             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3353             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3354             utf8_has_been_setup = TRUE;
3355         }
3356         else {
3357             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3358         }
3359     }
3360     else /* an EXACTFish node */
3361          if ((UTF_PATTERN
3362                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3363                                                     pat + STR_LEN(text_node)))
3364              || (! UTF_PATTERN
3365                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3366                                                     pat + STR_LEN(text_node))))
3367     {
3368         /* Multi-character folds require more context to sort out.  Also
3369          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3370          * handled outside this routine */
3371         use_chrtest_void = TRUE;
3372     }
3373     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3374         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3375         if (c1 > 256) {
3376             /* Load the folds hash, if not already done */
3377             SV** listp;
3378             if (! PL_utf8_foldclosures) {
3379                 if (! PL_utf8_tofold) {
3380                     U8 dummy[UTF8_MAXBYTES+1];
3381
3382                     /* Force loading this by folding an above-Latin1 char */
3383                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3384                     assert(PL_utf8_tofold); /* Verify that worked */
3385                 }
3386                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3387             }
3388
3389             /* The fold closures data structure is a hash with the keys being
3390              * the UTF-8 of every character that is folded to, like 'k', and
3391              * the values each an array of all code points that fold to its
3392              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3393              * not included */
3394             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3395                                      (char *) pat,
3396                                      UTF8SKIP(pat),
3397                                      FALSE))))
3398             {
3399                 /* Not found in the hash, therefore there are no folds
3400                  * containing it, so there is only a single character that
3401                  * could match */
3402                 c2 = c1;
3403             }
3404             else {  /* Does participate in folds */
3405                 AV* list = (AV*) *listp;
3406                 if (av_len(list) != 1) {
3407
3408                     /* If there aren't exactly two folds to this, it is outside
3409                      * the scope of this function */
3410                     use_chrtest_void = TRUE;
3411                 }
3412                 else {  /* There are two.  Get them */
3413                     SV** c_p = av_fetch(list, 0, FALSE);
3414                     if (c_p == NULL) {
3415                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3416                     }
3417                     c1 = SvUV(*c_p);
3418
3419                     c_p = av_fetch(list, 1, FALSE);
3420                     if (c_p == NULL) {
3421                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3422                     }
3423                     c2 = SvUV(*c_p);
3424
3425                     /* Folds that cross the 255/256 boundary are forbidden if
3426                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3427                      * pattern character is above 256, and its only other match
3428                      * is below 256, the only legal match will be to itself.
3429                      * We have thrown away the original, so have to compute
3430                      * which is the one above 255 */
3431                     if ((c1 < 256) != (c2 < 256)) {
3432                         if (OP(text_node) == EXACTFL
3433                             || (OP(text_node) == EXACTFA
3434                                 && (isASCII(c1) || isASCII(c2))))
3435                         {
3436                             if (c1 < 256) {
3437                                 c1 = c2;
3438                             }
3439                             else {
3440                                 c2 = c1;
3441                             }
3442                         }
3443                     }
3444                 }
3445             }
3446         }
3447         else /* Here, c1 is < 255 */
3448              if (utf8_target
3449                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3450                  && OP(text_node) != EXACTFL
3451                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3452         {
3453             /* Here, there could be something above Latin1 in the target which
3454              * folds to this character in the pattern.  All such cases except
3455              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3456              * involved in their folds, so are outside the scope of this
3457              * function */
3458             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3459                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3460             }
3461             else {
3462                 use_chrtest_void = TRUE;
3463             }
3464         }
3465         else { /* Here nothing above Latin1 can fold to the pattern character */
3466             switch (OP(text_node)) {
3467
3468                 case EXACTFL:   /* /l rules */
3469                     c2 = PL_fold_locale[c1];
3470                     break;
3471
3472                 case EXACTF:
3473                     if (! utf8_target) {    /* /d rules */
3474                         c2 = PL_fold[c1];
3475                         break;
3476                     }
3477                     /* FALLTHROUGH */
3478                     /* /u rules for all these.  This happens to work for
3479                      * EXACTFA as nothing in Latin1 folds to ASCII */
3480                 case EXACTFA:
3481                 case EXACTFU_TRICKYFOLD:
3482                 case EXACTFU_SS:
3483                 case EXACTFU:
3484                     c2 = PL_fold_latin1[c1];
3485                     break;
3486
3487                 default:
3488                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3489                     assert(0); /* NOTREACHED */
3490             }
3491         }
3492     }
3493
3494     /* Here have figured things out.  Set up the returns */
3495     if (use_chrtest_void) {
3496         *c2p = *c1p = CHRTEST_VOID;
3497     }
3498     else if (utf8_target) {
3499         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3500             uvchr_to_utf8(c1_utf8, c1);
3501             uvchr_to_utf8(c2_utf8, c2);
3502         }
3503
3504         /* Invariants are stored in both the utf8 and byte outputs; Use
3505          * negative numbers otherwise for the byte ones.  Make sure that the
3506          * byte ones are the same iff the utf8 ones are the same */
3507         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3508         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3509                 ? *c2_utf8
3510                 : (c1 == c2)
3511                   ? CHRTEST_NOT_A_CP_1
3512                   : CHRTEST_NOT_A_CP_2;
3513     }
3514     else if (c1 > 255) {
3515        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3516                            can represent */
3517            return FALSE;
3518        }
3519
3520        *c1p = *c2p = c2;    /* c2 is the only representable value */
3521     }
3522     else {  /* c1 is representable; see about c2 */
3523        *c1p = c1;
3524        *c2p = (c2 < 256) ? c2 : c1;
3525     }
3526
3527     return TRUE;
3528 }
3529
3530 /* returns -1 on failure, $+[0] on success */
3531 STATIC I32
3532 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3533 {
3534 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3535     dMY_CXT;
3536 #endif
3537     dVAR;
3538     const bool utf8_target = PL_reg_match_utf8;
3539     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3540     REGEXP *rex_sv = reginfo->prog;
3541     regexp *rex = ReANY(rex_sv);
3542     RXi_GET_DECL(rex,rexi);
3543     I32 oldsave;
3544     /* the current state. This is a cached copy of PL_regmatch_state */
3545     regmatch_state *st;
3546     /* cache heavy used fields of st in registers */
3547     regnode *scan;
3548     regnode *next;
3549     U32 n = 0;  /* general value; init to avoid compiler warning */
3550     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3551     char *locinput = startpos;
3552     char *pushinput; /* where to continue after a PUSH */
3553     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3554
3555     bool result = 0;        /* return value of S_regmatch */
3556     int depth = 0;          /* depth of backtrack stack */
3557     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3558     const U32 max_nochange_depth =
3559         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3560         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3561     regmatch_state *yes_state = NULL; /* state to pop to on success of
3562                                                             subpattern */
3563     /* mark_state piggy backs on the yes_state logic so that when we unwind
3564        the stack on success we can update the mark_state as we go */
3565     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3566     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3567     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3568     U32 state_num;
3569     bool no_final = 0;      /* prevent failure from backtracking? */
3570     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3571     char *startpoint = locinput;
3572     SV *popmark = NULL;     /* are we looking for a mark? */
3573     SV *sv_commit = NULL;   /* last mark name seen in failure */
3574     SV *sv_yes_mark = NULL; /* last mark name we have seen
3575                                during a successful match */
3576     U32 lastopen = 0;       /* last open we saw */
3577     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3578     SV* const oreplsv = GvSV(PL_replgv);
3579     /* these three flags are set by various ops to signal information to
3580      * the very next op. They have a useful lifetime of exactly one loop
3581      * iteration, and are not preserved or restored by state pushes/pops
3582      */
3583     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3584     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3585     int logical = 0;        /* the following EVAL is:
3586                                 0: (?{...})
3587                                 1: (?(?{...})X|Y)
3588                                 2: (??{...})
3589                                or the following IFMATCH/UNLESSM is:
3590                                 false: plain (?=foo)
3591                                 true:  used as a condition: (?(?=foo))
3592                             */
3593     PAD* last_pad = NULL;
3594     dMULTICALL;
3595     I32 gimme = G_SCALAR;
3596     CV *caller_cv = NULL;       /* who called us */
3597     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3598     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3599
3600 #ifdef DEBUGGING
3601     GET_RE_DEBUG_FLAGS_DECL;
3602 #endif
3603
3604     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3605     multicall_oldcatch = 0;
3606     multicall_cv = NULL;
3607     cx = NULL;
3608     PERL_UNUSED_VAR(multicall_cop);
3609     PERL_UNUSED_VAR(newsp);
3610
3611
3612     PERL_ARGS_ASSERT_REGMATCH;
3613
3614     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3615             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3616     }));
3617     /* on first ever call to regmatch, allocate first slab */
3618     if (!PL_regmatch_slab) {
3619         Newx(PL_regmatch_slab, 1, regmatch_slab);
3620         PL_regmatch_slab->prev = NULL;
3621         PL_regmatch_slab->next = NULL;
3622         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3623     }
3624
3625     oldsave = PL_savestack_ix;
3626     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3627     SAVEVPTR(PL_regmatch_slab);
3628     SAVEVPTR(PL_regmatch_state);
3629
3630     /* grab next free state slot */
3631     st = ++PL_regmatch_state;
3632     if (st >  SLAB_LAST(PL_regmatch_slab))
3633         st = PL_regmatch_state = S_push_slab(aTHX);
3634
3635     /* Note that nextchr is a byte even in UTF */
3636     SET_nextchr;
3637     scan = prog;
3638     while (scan != NULL) {
3639
3640         DEBUG_EXECUTE_r( {
3641             SV * const prop = sv_newmortal();
3642             regnode *rnext=regnext(scan);
3643             DUMP_EXEC_POS( locinput, scan, utf8_target );
3644             regprop(rex, prop, scan);
3645
3646             PerlIO_printf(Perl_debug_log,
3647                     "%3"IVdf":%*s%s(%"IVdf")\n",
3648                     (IV)(scan - rexi->program), depth*2, "",
3649                     SvPVX_const(prop),
3650                     (PL_regkind[OP(scan)] == END || !rnext) ?
3651                         0 : (IV)(rnext - rexi->program));
3652         });
3653
3654         next = scan + NEXT_OFF(scan);
3655         if (next == scan)
3656             next = NULL;
3657         state_num = OP(scan);
3658
3659       reenter_switch:
3660
3661         SET_nextchr;
3662         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3663
3664         switch (state_num) {
3665         case BOL: /*  /^../  */
3666             if (locinput == PL_bostr)
3667             {
3668                 /* reginfo->till = reginfo->bol; */
3669                 break;
3670             }
3671             sayNO;
3672
3673         case MBOL: /*  /^../m  */
3674             if (locinput == PL_bostr ||
3675                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3676             {
3677                 break;
3678             }
3679             sayNO;
3680
3681         case SBOL: /*  /^../s  */
3682             if (locinput == PL_bostr)
3683                 break;
3684             sayNO;
3685
3686         case GPOS: /*  \G  */
3687             if (locinput == reginfo->ganch)
3688                 break;
3689             sayNO;
3690
3691         case KEEPS: /*   \K  */
3692             /* update the startpoint */
3693             st->u.keeper.val = rex->offs[0].start;
3694             rex->offs[0].start = locinput - PL_bostr;
3695             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3696             assert(0); /*NOTREACHED*/
3697         case KEEPS_next_fail:
3698             /* rollback the start point change */
3699             rex->offs[0].start = st->u.keeper.val;
3700             sayNO_SILENT;
3701             assert(0); /*NOTREACHED*/
3702
3703         case EOL: /* /..$/  */
3704                 goto seol;
3705
3706         case MEOL: /* /..$/m  */
3707             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3708                 sayNO;
3709             break;
3710
3711         case SEOL: /* /..$/s  */
3712           seol:
3713             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3714                 sayNO;
3715             if (PL_regeol - locinput > 1)
3716                 sayNO;
3717             break;
3718
3719         case EOS: /*  \z  */
3720             if (!NEXTCHR_IS_EOS)
3721                 sayNO;
3722             break;
3723
3724         case SANY: /*  /./s  */
3725             if (NEXTCHR_IS_EOS)
3726                 sayNO;
3727             goto increment_locinput;
3728
3729         case CANY: /*  \C  */
3730             if (NEXTCHR_IS_EOS)
3731                 sayNO;
3732             locinput++;
3733             break;
3734
3735         case REG_ANY: /*  /./  */
3736             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3737                 sayNO;
3738             goto increment_locinput;
3739
3740
3741 #undef  ST
3742 #define ST st->u.trie
3743         case TRIEC: /* (ab|cd) with known charclass */
3744             /* In this case the charclass data is available inline so
3745                we can fail fast without a lot of extra overhead.
3746              */
3747             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3748                 DEBUG_EXECUTE_r(
3749                     PerlIO_printf(Perl_debug_log,
3750                               "%*s  %sfailed to match trie start class...%s\n",
3751                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3752                 );
3753                 sayNO_SILENT;
3754                 assert(0); /* NOTREACHED */
3755             }
3756             /* FALL THROUGH */
3757         case TRIE:  /* (ab|cd)  */
3758             /* the basic plan of execution of the trie is:
3759              * At the beginning, run though all the states, and
3760              * find the longest-matching word. Also remember the position
3761              * of the shortest matching word. For example, this pattern:
3762              *    1  2 3 4    5
3763              *    ab|a|x|abcd|abc
3764              * when matched against the string "abcde", will generate
3765              * accept states for all words except 3, with the longest
3766              * matching word being 4, and the shortest being 2 (with
3767              * the position being after char 1 of the string).
3768              *
3769              * Then for each matching word, in word order (i.e. 1,2,4,5),
3770              * we run the remainder of the pattern; on each try setting
3771              * the current position to the character following the word,
3772              * returning to try the next word on failure.
3773              *
3774              * We avoid having to build a list of words at runtime by
3775              * using a compile-time structure, wordinfo[].prev, which
3776              * gives, for each word, the previous accepting word (if any).
3777              * In the case above it would contain the mappings 1->2, 2->0,
3778              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3779              * the longest word (4 above), a list of all words, by
3780              * following the list of prev pointers; this gives us the
3781              * unordered list 4,5,1,2. Then given the current word we have
3782              * just tried, we can go through the list and find the
3783              * next-biggest word to try (so if we just failed on word 2,
3784              * the next in the list is 4).
3785              *
3786              * Since at runtime we don't record the matching position in
3787              * the string for each word, we have to work that out for
3788              * each word we're about to process. The wordinfo table holds
3789              * the character length of each word; given that we recorded
3790              * at the start: the position of the shortest word and its
3791              * length in chars, we just need to move the pointer the
3792              * difference between the two char lengths. Depending on
3793              * Unicode status and folding, that's cheap or expensive.
3794              *
3795              * This algorithm is optimised for the case where are only a
3796              * small number of accept states, i.e. 0,1, or maybe 2.
3797              * With lots of accepts states, and having to try all of them,
3798              * it becomes quadratic on number of accept states to find all
3799              * the next words.
3800              */
3801
3802             {
3803                 /* what type of TRIE am I? (utf8 makes this contextual) */
3804                 DECL_TRIE_TYPE(scan);
3805
3806                 /* what trie are we using right now */
3807                 reg_trie_data * const trie
3808                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3809                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3810                 U32 state = trie->startstate;
3811
3812                 if (   trie->bitmap
3813                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3814                 {
3815                     if (trie->states[ state ].wordnum) {
3816                          DEBUG_EXECUTE_r(
3817                             PerlIO_printf(Perl_debug_log,
3818                                           "%*s  %smatched empty string...%s\n",
3819                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3820                         );
3821                         if (!trie->jump)
3822                             break;
3823                     } else {
3824                         DEBUG_EXECUTE_r(
3825                             PerlIO_printf(Perl_debug_log,
3826                                           "%*s  %sfailed to match trie start class...%s\n",
3827                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3828                         );
3829                         sayNO_SILENT;
3830                    }
3831                 }
3832
3833             {
3834                 U8 *uc = ( U8* )locinput;
3835
3836                 STRLEN len = 0;
3837                 STRLEN foldlen = 0;
3838                 U8 *uscan = (U8*)NULL;
3839                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3840                 U32 charcount = 0; /* how many input chars we have matched */
3841                 U32 accepted = 0; /* have we seen any accepting states? */
3842
3843                 ST.jump = trie->jump;
3844                 ST.me = scan;
3845                 ST.firstpos = NULL;
3846                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3847                 ST.nextword = 0;
3848
3849                 /* fully traverse the TRIE; note the position of the
3850                    shortest accept state and the wordnum of the longest
3851                    accept state */
3852
3853                 while ( state && uc <= (U8*)PL_regeol ) {
3854                     U32 base = trie->states[ state ].trans.base;
3855                     UV uvc = 0;
3856                     U16 charid = 0;
3857                     U16 wordnum;
3858                     wordnum = trie->states[ state ].wordnum;
3859
3860                     if (wordnum) { /* it's an accept state */
3861                         if (!accepted) {
3862                             accepted = 1;
3863                             /* record first match position */
3864                             if (ST.longfold) {
3865                                 ST.firstpos = (U8*)locinput;
3866                                 ST.firstchars = 0;
3867                             }
3868                             else {
3869                                 ST.firstpos = uc;
3870                                 ST.firstchars = charcount;
3871                             }
3872                         }
3873                         if (!ST.nextword || wordnum < ST.nextword)
3874                             ST.nextword = wordnum;
3875                         ST.topword = wordnum;
3876                     }
3877
3878                     DEBUG_TRIE_EXECUTE_r({
3879                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3880                                 PerlIO_printf( Perl_debug_log,
3881                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3882                                     2+depth * 2, "", PL_colors[4],
3883                                     (UV)state, (accepted ? 'Y' : 'N'));
3884                     });
3885
3886                     /* read a char and goto next state */
3887                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3888                         I32 offset;
3889                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3890                                              uscan, len, uvc, charid, foldlen,
3891                                              foldbuf, uniflags);
3892                         charcount++;
3893                         if (foldlen>0)
3894                             ST.longfold = TRUE;
3895                         if (charid &&
3896                              ( ((offset =
3897                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3898
3899                              && ((U32)offset < trie->lasttrans)
3900                              && trie->trans[offset].check == state)
3901                         {
3902                             state = trie->trans[offset].next;
3903                         }
3904                         else {
3905                             state = 0;
3906                         }
3907                         uc += len;
3908
3909                     }
3910                     else {
3911                         state = 0;
3912                     }
3913                     DEBUG_TRIE_EXECUTE_r(
3914                         PerlIO_printf( Perl_debug_log,
3915                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3916                             charid, uvc, (UV)state, PL_colors[5] );
3917                     );
3918                 }
3919                 if (!accepted)
3920                    sayNO;
3921
3922                 /* calculate total number of accept states */
3923                 {
3924                     U16 w = ST.topword;
3925                     accepted = 0;
3926                     while (w) {
3927                         w = trie->wordinfo[w].prev;
3928                         accepted++;
3929                     }
3930                     ST.accepted = accepted;
3931                 }
3932
3933                 DEBUG_EXECUTE_r(
3934                     PerlIO_printf( Perl_debug_log,
3935                         "%*s  %sgot %"IVdf" possible matches%s\n",
3936                         REPORT_CODE_OFF + depth * 2, "",
3937                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3938                 );
3939                 goto trie_first_try; /* jump into the fail handler */
3940             }}
3941             assert(0); /* NOTREACHED */
3942
3943         case TRIE_next_fail: /* we failed - try next alternative */
3944         {
3945             U8 *uc;
3946             if ( ST.jump) {
3947                 REGCP_UNWIND(ST.cp);
3948                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3949             }
3950             if (!--ST.accepted) {
3951                 DEBUG_EXECUTE_r({
3952                     PerlIO_printf( Perl_debug_log,
3953                         "%*s  %sTRIE failed...%s\n",
3954                         REPORT_CODE_OFF+depth*2, "",
3955                         PL_colors[4],
3956                         PL_colors[5] );
3957                 });
3958                 sayNO_SILENT;
3959             }
3960             {
3961                 /* Find next-highest word to process.  Note that this code
3962                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3963                 U16 min = 0;
3964                 U16 word;
3965                 U16 const nextword = ST.nextword;
3966                 reg_trie_wordinfo * const wordinfo
3967                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3968                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3969                     if (word > nextword && (!min || word < min))
3970                         min = word;
3971                 }
3972                 ST.nextword = min;
3973             }
3974
3975           trie_first_try:
3976             if (do_cutgroup) {
3977                 do_cutgroup = 0;
3978                 no_final = 0;
3979             }
3980
3981             if ( ST.jump) {
3982                 ST.lastparen = rex->lastparen;
3983                 ST.lastcloseparen = rex->lastcloseparen;
3984                 REGCP_SET(ST.cp);
3985             }
3986
3987             /* find start char of end of current word */
3988             {
3989                 U32 chars; /* how many chars to skip */
3990                 reg_trie_data * const trie
3991                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3992
3993                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3994                             >=  ST.firstchars);
3995                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3996                             - ST.firstchars;
3997                 uc = ST.firstpos;
3998
3999                 if (ST.longfold) {
4000                     /* the hard option - fold each char in turn and find
4001                      * its folded length (which may be different */
4002                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4003                     STRLEN foldlen;
4004                     STRLEN len;
4005                     UV uvc;
4006                     U8 *uscan;
4007
4008                     while (chars) {
4009                         if (utf8_target) {
4010                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4011                                                     uniflags);
4012                             uc += len;
4013                         }
4014                         else {
4015                             uvc = *uc;
4016                             uc++;
4017                         }
4018                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4019                         uscan = foldbuf;
4020                         while (foldlen) {
4021                             if (!--chars)
4022                                 break;
4023                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4024                                             uniflags);
4025                             uscan += len;
4026                             foldlen -= len;
4027                         }
4028                     }
4029                 }
4030                 else {
4031                     if (utf8_target)
4032                         while (chars--)
4033                             uc += UTF8SKIP(uc);
4034                     else
4035                         uc += chars;
4036                 }
4037             }
4038
4039             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4040                             ? ST.jump[ST.nextword]
4041                             : NEXT_OFF(ST.me));
4042
4043             DEBUG_EXECUTE_r({
4044                 PerlIO_printf( Perl_debug_log,
4045                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4046                     REPORT_CODE_OFF+depth*2, "",
4047                     PL_colors[4],
4048                     ST.nextword,
4049                     PL_colors[5]
4050                     );
4051             });
4052
4053             if (ST.accepted > 1 || has_cutgroup) {
4054                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4055                 assert(0); /* NOTREACHED */
4056             }
4057             /* only one choice left - just continue */
4058             DEBUG_EXECUTE_r({
4059                 AV *const trie_words
4060                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4061                 SV ** const tmp = av_fetch( trie_words,
4062                     ST.nextword-1, 0 );
4063                 SV *sv= tmp ? sv_newmortal() : NULL;
4064
4065                 PerlIO_printf( Perl_debug_log,
4066                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4067                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4068                     ST.nextword,
4069                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4070                             PL_colors[0], PL_colors[1],
4071                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4072                         )
4073                     : "not compiled under -Dr",
4074                     PL_colors[5] );
4075             });
4076
4077             locinput = (char*)uc;
4078             continue; /* execute rest of RE */
4079             assert(0); /* NOTREACHED */
4080         }
4081 #undef  ST
4082
4083         case EXACT: {            /*  /abc/        */
4084             char *s = STRING(scan);
4085             ln = STR_LEN(scan);
4086             if (utf8_target != UTF_PATTERN) {
4087                 /* The target and the pattern have differing utf8ness. */
4088                 char *l = locinput;
4089                 const char * const e = s + ln;
4090
4091                 if (utf8_target) {
4092                     /* The target is utf8, the pattern is not utf8.
4093                      * Above-Latin1 code points can't match the pattern;
4094                      * invariants match exactly, and the other Latin1 ones need
4095                      * to be downgraded to a single byte in order to do the
4096                      * comparison.  (If we could be confident that the target
4097                      * is not malformed, this could be refactored to have fewer
4098                      * tests by just assuming that if the first bytes match, it
4099                      * is an invariant, but there are tests in the test suite
4100                      * dealing with (??{...}) which violate this) */
4101                     while (s < e) {
4102                         if (l >= PL_regeol)
4103                              sayNO;
4104                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4105                             sayNO;
4106                         }
4107                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4108                             if (*l != *s) {
4109                                 sayNO;
4110                             }
4111                             l++;
4112                         }
4113                         else {
4114                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4115                                 sayNO;
4116                             }
4117                             l += 2;
4118                         }
4119                         s++;
4120                     }
4121                 }
4122                 else {
4123                     /* The target is not utf8, the pattern is utf8. */
4124                     while (s < e) {
4125                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4126                         {
4127                             sayNO;
4128                         }
4129                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4130                             if (*s != *l) {
4131                                 sayNO;
4132                             }
4133                             s++;
4134                         }
4135                         else {
4136                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4137                                 sayNO;
4138                             }
4139                             s += 2;
4140                         }
4141                         l++;
4142                     }
4143                 }
4144                 locinput = l;
4145                 break;
4146             }
4147             /* The target and the pattern have the same utf8ness. */
4148             /* Inline the first character, for speed. */
4149             if (UCHARAT(s) != nextchr)
4150                 sayNO;
4151             if (PL_regeol - locinput < ln)
4152                 sayNO;
4153             if (ln > 1 && memNE(s, locinput, ln))
4154                 sayNO;
4155             locinput += ln;
4156             break;
4157             }
4158
4159         case EXACTFL: {          /*  /abc/il      */
4160             re_fold_t folder;
4161             const U8 * fold_array;
4162             const char * s;
4163             U32 fold_utf8_flags;
4164
4165             PL_reg_flags |= RF_tainted;
4166             folder = foldEQ_locale;
4167             fold_array = PL_fold_locale;
4168             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4169             goto do_exactf;
4170
4171         case EXACTFU_SS:         /*  /\x{df}/iu   */
4172         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4173         case EXACTFU:            /*  /abc/iu      */
4174             folder = foldEQ_latin1;
4175             fold_array = PL_fold_latin1;
4176             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4177             goto do_exactf;
4178
4179         case EXACTFA:            /*  /abc/iaa     */
4180             folder = foldEQ_latin1;
4181             fold_array = PL_fold_latin1;
4182             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4183             goto do_exactf;
4184
4185         case EXACTF:             /*  /abc/i       */
4186             folder = foldEQ;
4187             fold_array = PL_fold;
4188             fold_utf8_flags = 0;
4189
4190           do_exactf:
4191             s = STRING(scan);
4192             ln = STR_LEN(scan);
4193
4194             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4195               /* Either target or the pattern are utf8, or has the issue where
4196                * the fold lengths may differ. */
4197                 const char * const l = locinput;
4198                 char *e = PL_regeol;
4199
4200                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4201                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4202                 {
4203                     sayNO;
4204                 }
4205                 locinput = e;
4206                 break;
4207             }
4208
4209             /* Neither the target nor the pattern are utf8 */
4210             if (UCHARAT(s) != nextchr
4211                 && !NEXTCHR_IS_EOS
4212                 && UCHARAT(s) != fold_array[nextchr])
4213             {
4214                 sayNO;
4215             }
4216             if (PL_regeol - locinput < ln)
4217                 sayNO;
4218             if (ln > 1 && ! folder(s, locinput, ln))
4219                 sayNO;
4220             locinput += ln;
4221             break;
4222         }
4223
4224         /* XXX Could improve efficiency by separating these all out using a
4225          * macro or in-line function.  At that point regcomp.c would no longer
4226          * have to set the FLAGS fields of these */
4227         case BOUNDL:  /*  /\b/l  */
4228         case NBOUNDL: /*  /\B/l  */
4229             PL_reg_flags |= RF_tainted;
4230             /* FALL THROUGH */
4231         case BOUND:   /*  /\b/   */
4232         case BOUNDU:  /*  /\b/u  */
4233         case BOUNDA:  /*  /\b/a  */
4234         case NBOUND:  /*  /\B/   */
4235         case NBOUNDU: /*  /\B/u  */
4236         case NBOUNDA: /*  /\B/a  */
4237             /* was last char in word? */
4238             if (utf8_target
4239                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4240                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4241             {
4242                 if (locinput == PL_bostr)
4243                     ln = '\n';
4244                 else {
4245                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4246
4247                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4248                 }
4249                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4250                     ln = isALNUM_uni(ln);
4251                     if (NEXTCHR_IS_EOS)
4252                         n = 0;
4253                     else {
4254                         LOAD_UTF8_CHARCLASS_ALNUM();
4255                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4256                                                                 utf8_target);
4257                     }
4258                 }
4259                 else {
4260                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4261                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4262                 }
4263             }
4264             else {
4265
4266                 /* Here the string isn't utf8, or is utf8 and only ascii
4267                  * characters are to match \w.  In the latter case looking at
4268                  * the byte just prior to the current one may be just the final
4269                  * byte of a multi-byte character.  This is ok.  There are two
4270                  * cases:
4271                  * 1) it is a single byte character, and then the test is doing
4272                  *      just what it's supposed to.
4273                  * 2) it is a multi-byte character, in which case the final
4274                  *      byte is never mistakable for ASCII, and so the test
4275                  *      will say it is not a word character, which is the
4276                  *      correct answer. */
4277                 ln = (locinput != PL_bostr) ?
4278                     UCHARAT(locinput - 1) : '\n';
4279                 switch (FLAGS(scan)) {
4280                     case REGEX_UNICODE_CHARSET:
4281                         ln = isWORDCHAR_L1(ln);
4282                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4283                         break;
4284                     case REGEX_LOCALE_CHARSET:
4285                         ln = isALNUM_LC(ln);
4286                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4287                         break;
4288                     case REGEX_DEPENDS_CHARSET:
4289                         ln = isALNUM(ln);
4290                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4291                         break;
4292                     case REGEX_ASCII_RESTRICTED_CHARSET:
4293                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4294                         ln = isWORDCHAR_A(ln);
4295                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4296                         break;
4297                     default:
4298                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4299                         break;
4300                 }
4301             }
4302             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4303              * regcomp.sym */
4304             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4305                     sayNO;
4306             break;
4307
4308         case ANYOF:  /*  /[abc]/       */
4309             if (NEXTCHR_IS_EOS)
4310                 sayNO;
4311             if (utf8_target) {
4312                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4313                     sayNO;
4314                 locinput += UTF8SKIP(locinput);
4315                 break;
4316             }
4317             else {
4318                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4319                     sayNO;
4320                 locinput++;
4321                 break;
4322             }
4323             break;
4324
4325         /* Special char classes: \d, \w etc.
4326          * The defines start on line 166 or so */
4327         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4328                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4329                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4330                   ALNUMA, NALNUMA, isWORDCHAR_A,
4331                   alnum, "a");
4332
4333         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4334                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4335                   SPACEU, NSPACEU, isSPACE_L1,
4336                   SPACEA, NSPACEA, isSPACE_A,
4337                   space, " ");
4338
4339         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4340                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4341                 DIGITA, NDIGITA, isDIGIT_A,
4342                 digit, "0");
4343
4344         case POSIXA: /* /[[:ascii:]]/ etc */
4345             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4346                 sayNO;
4347             }
4348             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4349             locinput++;
4350             break;
4351
4352         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4353             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4354                 sayNO;
4355             }
4356             goto increment_locinput;
4357
4358         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4359                        a Unicode extended Grapheme Cluster */
4360             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4361               extended Grapheme Cluster is:
4362
4363                CR LF
4364                | Prepend* Begin Extend*
4365                | .
4366
4367                Begin is:           ( Special_Begin | ! Control )
4368                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4369                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4370                Control is:         [ GCB_Control  CR  LF ]
4371                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4372
4373                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4374                we can rewrite
4375
4376                    Begin is ( Regular_Begin + Special Begin )
4377
4378                It turns out that 98.4% of all Unicode code points match
4379                Regular_Begin.  Doing it this way eliminates a table match in
4380                the previous implementation for almost all Unicode code points.
4381
4382                There is a subtlety with Prepend* which showed up in testing.
4383                Note that the Begin, and only the Begin is required in:
4384                 | Prepend* Begin Extend*
4385                Also, Begin contains '! Control'.  A Prepend must be a
4386                '!  Control', which means it must also be a Begin.  What it
4387                comes down to is that if we match Prepend* and then find no
4388                suitable Begin afterwards, that if we backtrack the last
4389                Prepend, that one will be a suitable Begin.
4390             */
4391
4392             if (NEXTCHR_IS_EOS)
4393                 sayNO;
4394             if  (! utf8_target) {
4395
4396                 /* Match either CR LF  or '.', as all the other possibilities
4397                  * require utf8 */
4398                 locinput++;         /* Match the . or CR */
4399                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4400                                        match the LF */
4401                     && locinput < PL_regeol
4402                     && UCHARAT(locinput) == '\n') locinput++;
4403             }
4404             else {
4405
4406                 /* Utf8: See if is ( CR LF ); already know that locinput <
4407                  * PL_regeol, so locinput+1 is in bounds */
4408                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4409                         && UCHARAT(locinput + 1) == '\n')
4410                 {
4411                     locinput += 2;
4412                 }
4413                 else {
4414                     STRLEN len;
4415
4416                     /* In case have to backtrack to beginning, then match '.' */
4417                     char *starting = locinput;
4418
4419                     /* In case have to backtrack the last prepend */
4420                     char *previous_prepend = 0;
4421
4422                     LOAD_UTF8_CHARCLASS_GCB();
4423
4424                     /* Match (prepend)*   */
4425                     while (locinput < PL_regeol
4426                            && (len = is_GCB_Prepend_utf8(locinput)))
4427                     {
4428                         previous_prepend = locinput;
4429                         locinput += len;
4430                     }
4431
4432                     /* As noted above, if we matched a prepend character, but
4433                      * the next thing won't match, back off the last prepend we
4434                      * matched, as it is guaranteed to match the begin */
4435                     if (previous_prepend
4436                         && (locinput >=  PL_regeol
4437                             || (! swash_fetch(PL_utf8_X_regular_begin,
4438                                              (U8*)locinput, utf8_target)
4439                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4440                         )
4441                     {
4442                         locinput = previous_prepend;
4443                     }
4444
4445                     /* Note that here we know PL_regeol > locinput, as we
4446                      * tested that upon input to this switch case, and if we
4447                      * moved locinput forward, we tested the result just above
4448                      * and it either passed, or we backed off so that it will
4449                      * now pass */
4450                     if (swash_fetch(PL_utf8_X_regular_begin,
4451                                     (U8*)locinput, utf8_target)) {
4452                         locinput += UTF8SKIP(locinput);
4453                     }
4454                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4455
4456                         /* Here did not match the required 'Begin' in the
4457                          * second term.  So just match the very first
4458                          * character, the '.' of the final term of the regex */
4459                         locinput = starting + UTF8SKIP(starting);
4460                         goto exit_utf8;
4461                     } else {
4462
4463                         /* Here is a special begin.  It can be composed of
4464                          * several individual characters.  One possibility is
4465                          * RI+ */
4466                         if ((len = is_GCB_RI_utf8(locinput))) {
4467                             locinput += len;
4468                             while (locinput < PL_regeol
4469                                    && (len = is_GCB_RI_utf8(locinput)))
4470                             {
4471                                 locinput += len;
4472                             }
4473                         } else if ((len = is_GCB_T_utf8(locinput))) {
4474                             /* Another possibility is T+ */
4475                             locinput += len;
4476                             while (locinput < PL_regeol
4477                                 && (len = is_GCB_T_utf8(locinput)))
4478                             {
4479                                 locinput += len;
4480                             }
4481                         } else {
4482
4483                             /* Here, neither RI+ nor T+; must be some other
4484                              * Hangul.  That means it is one of the others: L,
4485                              * LV, LVT or V, and matches:
4486                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4487
4488                             /* Match L*           */
4489                             while (locinput < PL_regeol
4490                                    && (len = is_GCB_L_utf8(locinput)))
4491                             {
4492                                 locinput += len;
4493                             }
4494
4495                             /* Here, have exhausted L*.  If the next character
4496                              * is not an LV, LVT nor V, it means we had to have
4497                              * at least one L, so matches L+ in the original
4498                              * equation, we have a complete hangul syllable.
4499                              * Are done. */
4500
4501                             if (locinput < PL_regeol
4502                                 && is_GCB_LV_LVT_V_utf8(locinput))
4503                             {
4504
4505                                 /* Otherwise keep going.  Must be LV, LVT or V.
4506                                  * See if LVT */
4507                                 if (is_utf8_X_LVT((U8*)locinput)) {
4508                                     locinput += UTF8SKIP(locinput);
4509                                 } else {
4510
4511                                     /* Must be  V or LV.  Take it, then match
4512                                      * V*     */
4513                                     locinput += UTF8SKIP(locinput);
4514                                     while (locinput < PL_regeol
4515                                            && (len = is_GCB_V_utf8(locinput)))
4516                                     {
4517                                         locinput += len;
4518                                     }
4519                                 }
4520
4521                                 /* And any of LV, LVT, or V can be followed
4522                                  * by T*            */
4523                                 while (locinput < PL_regeol
4524                                        && (len = is_GCB_T_utf8(locinput)))
4525                                 {
4526                                     locinput += len;
4527                                 }
4528                             }
4529                         }
4530                     }
4531
4532                     /* Match any extender */
4533                     while (locinput < PL_regeol
4534                             && swash_fetch(PL_utf8_X_extend,
4535                                             (U8*)locinput, utf8_target))
4536                     {
4537                         locinput += UTF8SKIP(locinput);
4538                     }
4539                 }
4540             exit_utf8:
4541                 if (locinput > PL_regeol) sayNO;
4542             }
4543             break;
4544
4545         case NREFFL:  /*  /\g{name}/il  */
4546         {   /* The capture buffer cases.  The ones beginning with N for the
4547                named buffers just convert to the equivalent numbered and
4548                pretend they were called as the corresponding numbered buffer
4549                op.  */
4550             /* don't initialize these in the declaration, it makes C++
4551                unhappy */
4552             char *s;
4553             char type;
4554             re_fold_t folder;
4555             const U8 *fold_array;
4556             UV utf8_fold_flags;
4557
4558             PL_reg_flags |= RF_tainted;
4559             folder = foldEQ_locale;
4560             fold_array = PL_fold_locale;
4561             type = REFFL;
4562             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4563             goto do_nref;
4564
4565         case NREFFA:  /*  /\g{name}/iaa  */
4566             folder = foldEQ_latin1;
4567             fold_array = PL_fold_latin1;
4568             type = REFFA;
4569             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4570             goto do_nref;
4571
4572         case NREFFU:  /*  /\g{name}/iu  */
4573             folder = foldEQ_latin1;
4574             fold_array = PL_fold_latin1;
4575             type = REFFU;
4576             utf8_fold_flags = 0;
4577             goto do_nref;
4578
4579         case NREFF:  /*  /\g{name}/i  */
4580             folder = foldEQ;
4581             fold_array = PL_fold;
4582             type = REFF;
4583             utf8_fold_flags = 0;
4584             goto do_nref;
4585
4586         case NREF:  /*  /\g{name}/   */
4587             type = REF;
4588             folder = NULL;
4589             fold_array = NULL;
4590             utf8_fold_flags = 0;
4591           do_nref:
4592
4593             /* For the named back references, find the corresponding buffer
4594              * number */
4595             n = reg_check_named_buff_matched(rex,scan);
4596
4597             if ( ! n ) {
4598                 sayNO;
4599             }
4600             goto do_nref_ref_common;
4601
4602         case REFFL:  /*  /\1/il  */
4603             PL_reg_flags |= RF_tainted;
4604             folder = foldEQ_locale;
4605             fold_array = PL_fold_locale;
4606             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4607             goto do_ref;
4608
4609         case REFFA:  /*  /\1/iaa  */
4610             folder = foldEQ_latin1;
4611             fold_array = PL_fold_latin1;
4612             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4613             goto do_ref;
4614
4615         case REFFU:  /*  /\1/iu  */
4616             folder = foldEQ_latin1;
4617             fold_array = PL_fold_latin1;
4618             utf8_fold_flags = 0;
4619             goto do_ref;
4620
4621         case REFF:  /*  /\1/i  */
4622             folder = foldEQ;
4623             fold_array = PL_fold;
4624             utf8_fold_flags = 0;
4625             goto do_ref;
4626
4627         case REF:  /*  /\1/    */
4628             folder = NULL;
4629             fold_array = NULL;
4630             utf8_fold_flags = 0;
4631
4632           do_ref:
4633             type = OP(scan);
4634             n = ARG(scan);  /* which paren pair */
4635
4636           do_nref_ref_common:
4637             ln = rex->offs[n].start;
4638             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4639             if (rex->lastparen < n || ln == -1)
4640                 sayNO;                  /* Do not match unless seen CLOSEn. */
4641             if (ln == rex->offs[n].end)
4642                 break;
4643
4644             s = PL_bostr + ln;
4645             if (type != REF     /* REF can do byte comparison */
4646                 && (utf8_target || type == REFFU))
4647             { /* XXX handle REFFL better */
4648                 char * limit = PL_regeol;
4649
4650                 /* This call case insensitively compares the entire buffer
4651                     * at s, with the current input starting at locinput, but
4652                     * not going off the end given by PL_regeol, and returns in
4653                     * <limit> upon success, how much of the current input was
4654                     * matched */
4655                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4656                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4657                 {
4658                     sayNO;
4659                 }
4660                 locinput = limit;
4661                 break;
4662             }
4663
4664             /* Not utf8:  Inline the first character, for speed. */
4665             if (!NEXTCHR_IS_EOS &&
4666                 UCHARAT(s) != nextchr &&
4667                 (type == REF ||
4668                  UCHARAT(s) != fold_array[nextchr]))
4669                 sayNO;
4670             ln = rex->offs[n].end - ln;
4671             if (locinput + ln > PL_regeol)
4672                 sayNO;
4673             if (ln > 1 && (type == REF
4674                            ? memNE(s, locinput, ln)
4675                            : ! folder(s, locinput, ln)))
4676                 sayNO;
4677             locinput += ln;
4678             break;
4679         }
4680
4681         case NOTHING: /* null op; e.g. the 'nothing' following
4682                        * the '*' in m{(a+|b)*}' */
4683             break;
4684         case TAIL: /* placeholder while compiling (A|B|C) */
4685             break;
4686
4687         case BACK: /* ??? doesn't appear to be used ??? */
4688             break;
4689
4690 #undef  ST
4691 #define ST st->u.eval
4692         {
4693             SV *ret;
4694             REGEXP *re_sv;
4695             regexp *re;
4696             regexp_internal *rei;
4697             regnode *startpoint;
4698
4699         case GOSTART: /*  (?R)  */
4700         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4701             if (cur_eval && cur_eval->locinput==locinput) {
4702                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4703                     Perl_croak(aTHX_ "Infinite recursion in regex");
4704                 if ( ++nochange_depth > max_nochange_depth )
4705                     Perl_croak(aTHX_
4706                         "Pattern subroutine nesting without pos change"
4707                         " exceeded limit in regex");
4708             } else {
4709                 nochange_depth = 0;
4710             }
4711             re_sv = rex_sv;
4712             re = rex;
4713             rei = rexi;
4714             if (OP(scan)==GOSUB) {
4715                 startpoint = scan + ARG2L(scan);
4716                 ST.close_paren = ARG(scan);
4717             } else {
4718                 startpoint = rei->program+1;
4719                 ST.close_paren = 0;
4720             }
4721             goto eval_recurse_doit;
4722             assert(0); /* NOTREACHED */
4723
4724         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4725             if (cur_eval && cur_eval->locinput==locinput) {
4726                 if ( ++nochange_depth > max_nochange_depth )
4727                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4728             } else {
4729                 nochange_depth = 0;
4730             }
4731             {
4732                 /* execute the code in the {...} */
4733
4734                 dSP;
4735                 SV ** before;
4736                 OP * const oop = PL_op;
4737                 COP * const ocurcop = PL_curcop;
4738                 OP *nop;
4739                 char *saved_regeol = PL_regeol;
4740                 struct re_save_state saved_state;
4741                 CV *newcv;
4742
4743                 /* save *all* paren positions */
4744                 regcppush(rex, 0);
4745                 REGCP_SET(runops_cp);
4746
4747                 /* To not corrupt the existing regex state while executing the
4748                  * eval we would normally put it on the save stack, like with
4749                  * save_re_context. However, re-evals have a weird scoping so we
4750                  * can't just add ENTER/LEAVE here. With that, things like
4751                  *
4752                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4753                  *
4754                  * would break, as they expect the localisation to be unwound
4755                  * only when the re-engine backtracks through the bit that
4756                  * localised it.
4757                  *
4758                  * What we do instead is just saving the state in a local c
4759                  * variable.
4760                  */
4761                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4762
4763                 PL_reg_state.re_reparsing = FALSE;
4764
4765                 if (!caller_cv)
4766                     caller_cv = find_runcv(NULL);
4767
4768                 n = ARG(scan);
4769
4770                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4771                     newcv = (ReANY(
4772                                                 (REGEXP*)(rexi->data->data[n])
4773                                             ))->qr_anoncv
4774                                         ;
4775                     nop = (OP*)rexi->data->data[n+1];
4776                 }
4777                 else if (rexi->data->what[n] == 'l') { /* literal code */
4778                     newcv = caller_cv;
4779                     nop = (OP*)rexi->data->data[n];
4780                     assert(CvDEPTH(newcv));
4781                 }
4782                 else {
4783                     /* literal with own CV */
4784                     assert(rexi->data->what[n] == 'L');
4785                     newcv = rex->qr_anoncv;
4786                     nop = (OP*)rexi->data->data[n];
4787                 }
4788
4789                 /* normally if we're about to execute code from the same
4790                  * CV that we used previously, we just use the existing
4791                  * CX stack entry. However, its possible that in the
4792                  * meantime we may have backtracked, popped from the save
4793                  * stack, and undone the SAVECOMPPAD(s) associated with
4794                  * PUSH_MULTICALL; in which case PL_comppad no longer
4795                  * points to newcv's pad. */
4796                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4797                 {
4798                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4799                     if (last_pushed_cv) {
4800                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4801                     }
4802                     else {
4803                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4804                     }
4805                     last_pushed_cv = newcv;
4806                 }
4807                 last_pad = PL_comppad;
4808
4809                 /* the initial nextstate you would normally execute
4810                  * at the start of an eval (which would cause error
4811                  * messages to come from the eval), may be optimised
4812                  * away from the execution path in the regex code blocks;
4813                  * so manually set PL_curcop to it initially */
4814                 {
4815                     OP *o = cUNOPx(nop)->op_first;
4816                     assert(o->op_type == OP_NULL);
4817                     if (o->op_targ == OP_SCOPE) {
4818                         o = cUNOPo->op_first;
4819                     }
4820                     else {
4821                         assert(o->op_targ == OP_LEAVE);
4822                         o = cUNOPo->op_first;
4823                         assert(o->op_type == OP_ENTER);
4824                         o = o->op_sibling;
4825                     }
4826
4827                     if (o->op_type != OP_STUB) {
4828                         assert(    o->op_type == OP_NEXTSTATE
4829                                 || o->op_type == OP_DBSTATE
4830                                 || (o->op_type == OP_NULL
4831                                     &&  (  o->op_targ == OP_NEXTSTATE
4832                                         || o->op_targ == OP_DBSTATE
4833                                         )
4834                                     )
4835                         );
4836                         PL_curcop = (COP*)o;
4837                     }
4838                 }
4839                 nop = nop->op_next;
4840
4841                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4842                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4843
4844                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4845
4846                 if (sv_yes_mark) {
4847                     SV *sv_mrk = get_sv("REGMARK", 1);
4848                     sv_setsv(sv_mrk, sv_yes_mark);
4849                 }
4850
4851                 /* we don't use MULTICALL here as we want to call the
4852                  * first op of the block of interest, rather than the
4853                  * first op of the sub */
4854                 before = SP;
4855                 PL_op = nop;
4856                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4857                 SPAGAIN;
4858                 if (SP == before)
4859                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4860                 else {
4861                     ret = POPs;
4862                     PUTBACK;
4863                 }
4864
4865                 /* before restoring everything, evaluate the returned
4866                  * value, so that 'uninit' warnings don't use the wrong
4867                  * PL_op or pad. Also need to process any magic vars
4868                  * (e.g. $1) *before* parentheses are restored */
4869
4870                 PL_op = NULL;
4871
4872                 re_sv = NULL;
4873                 if (logical == 0)        /*   (?{})/   */
4874                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4875                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4876                     sw = cBOOL(SvTRUE(ret));
4877                     logical = 0;
4878                 }
4879                 else {                   /*  /(??{})  */
4880                     /*  if its overloaded, let the regex compiler handle
4881                      *  it; otherwise extract regex, or stringify  */
4882                     if (!SvAMAGIC(ret)) {
4883                         SV *sv = ret;
4884                         if (SvROK(sv))
4885                             sv = SvRV(sv);
4886                         if (SvTYPE(sv) == SVt_REGEXP)
4887                             re_sv = (REGEXP*) sv;
4888                         else if (SvSMAGICAL(sv)) {
4889                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4890                             if (mg)
4891                                 re_sv = (REGEXP *) mg->mg_obj;
4892                         }
4893
4894                         /* force any magic, undef warnings here */
4895                         if (!re_sv) {
4896                             ret = sv_mortalcopy(ret);
4897                             (void) SvPV_force_nolen(ret);
4898                         }
4899                     }
4900
4901                 }
4902
4903                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4904
4905                 /* *** Note that at this point we don't restore
4906                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4907                  * be used again soon. This is safe as long as nothing
4908                  * in the regexp code uses the pad ! */
4909                 PL_op = oop;
4910                 PL_curcop = ocurcop;
4911                 PL_regeol = saved_regeol;
4912                 S_regcp_restore(aTHX_ rex, runops_cp);
4913
4914                 if (logical != 2)
4915                     break;
4916             }
4917
4918                 /* only /(??{})/  from now on */
4919                 logical = 0;
4920                 {
4921                     /* extract RE object from returned value; compiling if
4922                      * necessary */
4923
4924                     if (re_sv) {
4925                         re_sv = reg_temp_copy(NULL, re_sv);
4926                     }
4927                     else {
4928                         U32 pm_flags = 0;
4929                         const I32 osize = PL_regsize;
4930
4931                         if (SvUTF8(ret) && IN_BYTES) {
4932                             /* In use 'bytes': make a copy of the octet
4933                              * sequence, but without the flag on */
4934                             STRLEN len;
4935                             const char *const p = SvPV(ret, len);
4936                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4937                         }
4938                         if (rex->intflags & PREGf_USE_RE_EVAL)
4939                             pm_flags |= PMf_USE_RE_EVAL;
4940
4941                         /* if we got here, it should be an engine which
4942                          * supports compiling code blocks and stuff */
4943                         assert(rex->engine && rex->engine->op_comp);
4944                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4945                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4946                                     rex->engine, NULL, NULL,
4947                                     /* copy /msix etc to inner pattern */
4948                                     scan->flags,
4949                                     pm_flags);
4950
4951                         if (!(SvFLAGS(ret)
4952                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4953                                  | SVs_GMG))) {
4954                             /* This isn't a first class regexp. Instead, it's
4955                                caching a regexp onto an existing, Perl visible
4956                                scalar.  */
4957                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4958                         }
4959                         PL_regsize = osize;
4960                         /* safe to do now that any $1 etc has been
4961                          * interpolated into the new pattern string and
4962                          * compiled */
4963                         S_regcp_restore(aTHX_ rex, runops_cp);
4964                     }
4965                     re = ReANY(re_sv);
4966                 }
4967                 RXp_MATCH_COPIED_off(re);
4968                 re->subbeg = rex->subbeg;
4969                 re->sublen = rex->sublen;
4970                 re->suboffset = rex->suboffset;
4971                 re->subcoffset = rex->subcoffset;
4972                 rei = RXi_GET(re);
4973                 DEBUG_EXECUTE_r(
4974                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4975                         "Matching embedded");
4976                 );
4977                 startpoint = rei->program + 1;
4978                 ST.close_paren = 0; /* only used for GOSUB */
4979
4980         eval_recurse_doit: /* Share code with GOSUB below this line */
4981                 /* run the pattern returned from (??{...}) */
4982                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4983                 REGCP_SET(ST.lastcp);
4984
4985                 re->lastparen = 0;
4986                 re->lastcloseparen = 0;
4987
4988                 PL_regsize = 0;
4989
4990                 /* XXXX This is too dramatic a measure... */
4991                 PL_reg_maxiter = 0;
4992
4993                 ST.toggle_reg_flags = PL_reg_flags;
4994                 if (RX_UTF8(re_sv))
4995                     PL_reg_flags |= RF_utf8;
4996                 else
4997                     PL_reg_flags &= ~RF_utf8;
4998                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4999
5000                 ST.prev_rex = rex_sv;
5001                 ST.prev_curlyx = cur_curlyx;
5002                 rex_sv = re_sv;
5003                 SET_reg_curpm(rex_sv);
5004                 rex = re;
5005                 rexi = rei;
5006                 cur_curlyx = NULL;
5007                 ST.B = next;
5008                 ST.prev_eval = cur_eval;
5009                 cur_eval = st;
5010                 /* now continue from first node in postoned RE */
5011                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5012                 assert(0); /* NOTREACHED */
5013         }
5014
5015         case EVAL_AB: /* cleanup after a successful (??{A})B */
5016             /* note: this is called twice; first after popping B, then A */
5017             PL_reg_flags ^= ST.toggle_reg_flags;
5018             rex_sv = ST.prev_rex;
5019             SET_reg_curpm(rex_sv);
5020             rex = ReANY(rex_sv);
5021             rexi = RXi_GET(rex);
5022             regcpblow(ST.cp);
5023             cur_eval = ST.prev_eval;
5024             cur_curlyx = ST.prev_curlyx;
5025
5026             /* XXXX This is too dramatic a measure... */
5027             PL_reg_maxiter = 0;
5028             if ( nochange_depth )
5029                 nochange_depth--;
5030             sayYES;
5031
5032
5033         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5034             /* note: this is called twice; first after popping B, then A */
5035             PL_reg_flags ^= ST.toggle_reg_flags;
5036             rex_sv = ST.prev_rex;
5037             SET_reg_curpm(rex_sv);
5038             rex = ReANY(rex_sv);
5039             rexi = RXi_GET(rex);
5040
5041             REGCP_UNWIND(ST.lastcp);
5042             regcppop(rex);
5043             cur_eval = ST.prev_eval;
5044             cur_curlyx = ST.prev_curlyx;
5045             /* XXXX This is too dramatic a measure... */
5046             PL_reg_maxiter = 0;
5047             if ( nochange_depth )
5048                 nochange_depth--;
5049             sayNO_SILENT;
5050 #undef ST
5051
5052         case OPEN: /*  (  */
5053             n = ARG(scan);  /* which paren pair */
5054             rex->offs[n].start_tmp = locinput - PL_bostr;
5055             if (n > PL_regsize)
5056                 PL_regsize = n;
5057             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5058                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5059                 PTR2UV(rex),
5060                 PTR2UV(rex->offs),
5061                 (UV)n,
5062                 (IV)rex->offs[n].start_tmp,
5063                 (UV)PL_regsize
5064             ));
5065             lastopen = n;
5066             break;
5067
5068 /* XXX really need to log other places start/end are set too */
5069 #define CLOSE_CAPTURE \
5070     rex->offs[n].start = rex->offs[n].start_tmp; \
5071     rex->offs[n].end = locinput - PL_bostr; \
5072     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5073         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5074         PTR2UV(rex), \
5075         PTR2UV(rex->offs), \
5076         (UV)n, \
5077         (IV)rex->offs[n].start, \
5078         (IV)rex->offs[n].end \
5079     ))
5080
5081         case CLOSE:  /*  )  */
5082             n = ARG(scan);  /* which paren pair */
5083             CLOSE_CAPTURE;
5084             /*if (n > PL_regsize)
5085                 PL_regsize = n;*/
5086             if (n > rex->lastparen)
5087                 rex->lastparen = n;
5088             rex->lastcloseparen = n;
5089             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5090                 goto fake_end;
5091             }
5092             break;
5093
5094         case ACCEPT:  /*  (*ACCEPT)  */
5095             if (ARG(scan)){
5096                 regnode *cursor;
5097                 for (cursor=scan;
5098                      cursor && OP(cursor)!=END;
5099                      cursor=regnext(cursor))
5100                 {
5101                     if ( OP(cursor)==CLOSE ){
5102                         n = ARG(cursor);
5103                         if ( n <= lastopen ) {
5104                             CLOSE_CAPTURE;
5105                             /*if (n > PL_regsize)
5106                             PL_regsize = n;*/
5107                             if (n > rex->lastparen)
5108                                 rex->lastparen = n;
5109                             rex->lastcloseparen = n;
5110                             if ( n == ARG(scan) || (cur_eval &&
5111                                 cur_eval->u.eval.close_paren == n))
5112                                 break;
5113                         }
5114                     }
5115                 }
5116             }
5117             goto fake_end;
5118             /*NOTREACHED*/
5119
5120         case GROUPP:  /*  (?(1))  */
5121             n = ARG(scan);  /* which paren pair */
5122             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5123             break;
5124
5125         case NGROUPP:  /*  (?(<name>))  */
5126             /* reg_check_named_buff_matched returns 0 for no match */
5127             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5128             break;
5129
5130         case INSUBP:   /*  (?(R))  */
5131             n = ARG(scan);
5132             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5133             break;
5134
5135         case DEFINEP:  /*  (?(DEFINE))  */
5136             sw = 0;
5137             break;
5138
5139         case IFTHEN:   /*  (?(cond)A|B)  */
5140             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5141             if (sw)
5142                 next = NEXTOPER(NEXTOPER(scan));
5143             else {
5144                 next = scan + ARG(scan);
5145                 if (OP(next) == IFTHEN) /* Fake one. */
5146                     next = NEXTOPER(NEXTOPER(next));
5147             }
5148             break;
5149
5150         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5151             logical = scan->flags;
5152             break;
5153
5154 /*******************************************************************
5155
5156 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5157 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5158 STAR/PLUS/CURLY/CURLYN are used instead.)
5159
5160 A*B is compiled as <CURLYX><A><WHILEM><B>
5161
5162 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5163 state, which contains the current count, initialised to -1. It also sets
5164 cur_curlyx to point to this state, with any previous value saved in the
5165 state block.
5166
5167 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5168 since the pattern may possibly match zero times (i.e. it's a while {} loop
5169 rather than a do {} while loop).
5170
5171 Each entry to WHILEM represents a successful match of A. The count in the
5172 CURLYX block is incremented, another WHILEM state is pushed, and execution
5173 passes to A or B depending on greediness and the current count.
5174
5175 For example, if matching against the string a1a2a3b (where the aN are
5176 substrings that match /A/), then the match progresses as follows: (the
5177 pushed states are interspersed with the bits of strings matched so far):
5178
5179     <CURLYX cnt=-1>
5180     <CURLYX cnt=0><WHILEM>
5181     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5182     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5183     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5184     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5185
5186 (Contrast this with something like CURLYM, which maintains only a single
5187 backtrack state:
5188
5189     <CURLYM cnt=0> a1
5190     a1 <CURLYM cnt=1> a2
5191     a1 a2 <CURLYM cnt=2> a3
5192     a1 a2 a3 <CURLYM cnt=3> b
5193 )
5194
5195 Each WHILEM state block marks a point to backtrack to upon partial failure
5196 of A or B, and also contains some minor state data related to that
5197 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5198 overall state, such as the count, and pointers to the A and B ops.
5199
5200 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5201 must always point to the *current* CURLYX block, the rules are:
5202
5203 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5204 and set cur_curlyx to point the new block.
5205
5206 When popping the CURLYX block after a successful or unsuccessful match,
5207 restore the previous cur_curlyx.
5208
5209 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5210 to the outer one saved in the CURLYX block.
5211
5212 When popping the WHILEM block after a successful or unsuccessful B match,
5213 restore the previous cur_curlyx.
5214
5215 Here's an example for the pattern (AI* BI)*BO
5216 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5217
5218 cur_
5219 curlyx backtrack stack
5220 ------ ---------------
5221 NULL
5222 CO     <CO prev=NULL> <WO>
5223 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5224 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5225 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5226
5227 At this point the pattern succeeds, and we work back down the stack to
5228 clean up, restoring as we go:
5229
5230 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5231 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5232 CO     <CO prev=NULL> <WO>
5233 NULL
5234
5235 *******************************************************************/
5236
5237 #define ST st->u.curlyx
5238
5239         case CURLYX:    /* start of /A*B/  (for complex A) */
5240         {
5241             /* No need to save/restore up to this paren */
5242             I32 parenfloor = scan->flags;
5243
5244             assert(next); /* keep Coverity happy */
5245             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5246                 next += ARG(next);
5247
5248             /* XXXX Probably it is better to teach regpush to support
5249                parenfloor > PL_regsize... */
5250             if (parenfloor > (I32)rex->lastparen)
5251                 parenfloor = rex->lastparen; /* Pessimization... */
5252
5253             ST.prev_curlyx= cur_curlyx;
5254             cur_curlyx = st;
5255             ST.cp = PL_savestack_ix;
5256
5257             /* these fields contain the state of the current curly.
5258              * they are accessed by subsequent WHILEMs */
5259             ST.parenfloor = parenfloor;
5260             ST.me = scan;
5261             ST.B = next;
5262             ST.minmod = minmod;
5263             minmod = 0;
5264             ST.count = -1;      /* this will be updated by WHILEM */
5265             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5266
5267             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5268             assert(0); /* NOTREACHED */
5269         }
5270
5271         case CURLYX_end: /* just finished matching all of A*B */
5272             cur_curlyx = ST.prev_curlyx;
5273             sayYES;
5274             assert(0); /* NOTREACHED */
5275
5276         case CURLYX_end_fail: /* just failed to match all of A*B */
5277             regcpblow(ST.cp);
5278             cur_curlyx = ST.prev_curlyx;
5279             sayNO;
5280             assert(0); /* NOTREACHED */
5281
5282
5283 #undef ST
5284 #define ST st->u.whilem
5285
5286         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5287         {
5288             /* see the discussion above about CURLYX/WHILEM */
5289             I32 n;
5290             int min = ARG1(cur_curlyx->u.curlyx.me);
5291             int max = ARG2(cur_curlyx->u.curlyx.me);
5292             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5293
5294             assert(cur_curlyx); /* keep Coverity happy */
5295             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5296             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5297             ST.cache_offset = 0;
5298             ST.cache_mask = 0;
5299
5300
5301             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5302                   "%*s  whilem: matched %ld out of %d..%d\n",
5303                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5304             );
5305
5306             /* First just match a string of min A's. */
5307
5308             if (n < min) {
5309                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5310                 cur_curlyx->u.curlyx.lastloc = locinput;
5311                 REGCP_SET(ST.lastcp);
5312
5313                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5314                 assert(0); /* NOTREACHED */
5315             }
5316
5317             /* If degenerate A matches "", assume A done. */
5318
5319             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5320                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5321                    "%*s  whilem: empty match detected, trying continuation...\n",
5322                    REPORT_CODE_OFF+depth*2, "")
5323                 );
5324                 goto do_whilem_B_max;
5325             }
5326
5327             /* super-linear cache processing */
5328
5329             if (scan->flags) {
5330
5331                 if (!PL_reg_maxiter) {
5332                     /* start the countdown: Postpone detection until we
5333                      * know the match is not *that* much linear. */
5334                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5335                     /* possible overflow for long strings and many CURLYX's */
5336                     if (PL_reg_maxiter < 0)
5337                         PL_reg_maxiter = I32_MAX;
5338                     PL_reg_leftiter = PL_reg_maxiter;
5339                 }
5340
5341                 if (PL_reg_leftiter-- == 0) {
5342                     /* initialise cache */
5343                     const I32 size = (PL_reg_maxiter + 7)/8;
5344                     if (PL_reg_poscache) {
5345                         if ((I32)PL_reg_poscache_size < size) {
5346                             Renew(PL_reg_poscache, size, char);
5347                             PL_reg_poscache_size = size;
5348                         }
5349                         Zero(PL_reg_poscache, size, char);
5350                     }
5351                     else {
5352                         PL_reg_poscache_size = size;
5353                         Newxz(PL_reg_poscache, size, char);
5354                     }
5355                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5356       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5357                               PL_colors[4], PL_colors[5])
5358                     );
5359                 }
5360
5361                 if (PL_reg_leftiter < 0) {
5362                     /* have we already failed at this position? */
5363                     I32 offset, mask;
5364                     offset  = (scan->flags & 0xf) - 1
5365                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5366                     mask    = 1 << (offset % 8);
5367                     offset /= 8;
5368                     if (PL_reg_poscache[offset] & mask) {
5369                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5370                             "%*s  whilem: (cache) already tried at this position...\n",
5371                             REPORT_CODE_OFF+depth*2, "")
5372                         );
5373                         sayNO; /* cache records failure */
5374                     }
5375                     ST.cache_offset = offset;
5376                     ST.cache_mask   = mask;
5377                 }
5378             }
5379
5380             /* Prefer B over A for minimal matching. */
5381
5382             if (cur_curlyx->u.curlyx.minmod) {
5383                 ST.save_curlyx = cur_curlyx;
5384                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5385                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5386                 REGCP_SET(ST.lastcp);
5387                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5388                                     locinput);
5389                 assert(0); /* NOTREACHED */
5390             }
5391
5392             /* Prefer A over B for maximal matching. */
5393
5394             if (n < max) { /* More greed allowed? */
5395                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5396                 cur_curlyx->u.curlyx.lastloc = locinput;
5397                 REGCP_SET(ST.lastcp);
5398                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5399                 assert(0); /* NOTREACHED */
5400             }
5401             goto do_whilem_B_max;
5402         }
5403         assert(0); /* NOTREACHED */
5404
5405         case WHILEM_B_min: /* just matched B in a minimal match */
5406         case WHILEM_B_max: /* just matched B in a maximal match */
5407             cur_curlyx = ST.save_curlyx;
5408             sayYES;
5409             assert(0); /* NOTREACHED */
5410
5411         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5412             cur_curlyx = ST.save_curlyx;
5413             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5414             cur_curlyx->u.curlyx.count--;
5415             CACHEsayNO;
5416             assert(0); /* NOTREACHED */
5417
5418         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5419             /* FALL THROUGH */
5420         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5421             REGCP_UNWIND(ST.lastcp);
5422             regcppop(rex);
5423             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5424             cur_curlyx->u.curlyx.count--;
5425             CACHEsayNO;
5426             assert(0); /* NOTREACHED */
5427
5428         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5429             REGCP_UNWIND(ST.lastcp);
5430             regcppop(rex);      /* Restore some previous $<digit>s? */
5431             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5432                 "%*s  whilem: failed, trying continuation...\n",
5433                 REPORT_CODE_OFF+depth*2, "")
5434             );
5435           do_whilem_B_max:
5436             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5437                 && ckWARN(WARN_REGEXP)
5438                 && !(PL_reg_flags & RF_warned))
5439             {
5440                 PL_reg_flags |= RF_warned;
5441                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5442                      "Complex regular subexpression recursion limit (%d) "
5443                      "exceeded",
5444                      REG_INFTY - 1);
5445             }
5446
5447             /* now try B */
5448             ST.save_curlyx = cur_curlyx;
5449             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5450             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5451                                 locinput);
5452             assert(0); /* NOTREACHED */
5453
5454         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5455             cur_curlyx = ST.save_curlyx;
5456             REGCP_UNWIND(ST.lastcp);
5457             regcppop(rex);
5458
5459             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5460                 /* Maximum greed exceeded */
5461                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5462                     && ckWARN(WARN_REGEXP)
5463                     && !(PL_reg_flags & RF_warned))
5464                 {
5465                     PL_reg_flags |= RF_warned;
5466                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5467                         "Complex regular subexpression recursion "
5468                         "limit (%d) exceeded",
5469                         REG_INFTY - 1);
5470                 }
5471                 cur_curlyx->u.curlyx.count--;
5472                 CACHEsayNO;
5473             }
5474
5475             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5476                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5477             );
5478             /* Try grabbing another A and see if it helps. */
5479             cur_curlyx->u.curlyx.lastloc = locinput;
5480             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5481             REGCP_SET(ST.lastcp);
5482             PUSH_STATE_GOTO(WHILEM_A_min,
5483                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5484                 locinput);
5485             assert(0); /* NOTREACHED */
5486
5487 #undef  ST
5488 #define ST st->u.branch
5489
5490         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5491             next = scan + ARG(scan);
5492             if (next == scan)
5493                 next = NULL;
5494             scan = NEXTOPER(scan);
5495             /* FALL THROUGH */
5496
5497         case BRANCH:        /*  /(...|A|...)/ */
5498             scan = NEXTOPER(scan); /* scan now points to inner node */
5499             ST.lastparen = rex->lastparen;
5500             ST.lastcloseparen = rex->lastcloseparen;
5501             ST.next_branch = next;
5502             REGCP_SET(ST.cp);
5503
5504             /* Now go into the branch */
5505             if (has_cutgroup) {
5506                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5507             } else {
5508                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5509             }
5510             assert(0); /* NOTREACHED */
5511
5512         case CUTGROUP:  /*  /(*THEN)/  */
5513             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5514                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5515             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5516             assert(0); /* NOTREACHED */
5517
5518         case CUTGROUP_next_fail:
5519             do_cutgroup = 1;
5520             no_final = 1;
5521             if (st->u.mark.mark_name)
5522                 sv_commit = st->u.mark.mark_name;
5523             sayNO;
5524             assert(0); /* NOTREACHED */
5525
5526         case BRANCH_next:
5527             sayYES;
5528             assert(0); /* NOTREACHED */
5529
5530         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5531             if (do_cutgroup) {
5532                 do_cutgroup = 0;
5533                 no_final = 0;
5534             }
5535             REGCP_UNWIND(ST.cp);
5536             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5537             scan = ST.next_branch;
5538             /* no more branches? */
5539             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5540                 DEBUG_EXECUTE_r({
5541                     PerlIO_printf( Perl_debug_log,
5542                         "%*s  %sBRANCH failed...%s\n",
5543                         REPORT_CODE_OFF+depth*2, "",
5544                         PL_colors[4],
5545                         PL_colors[5] );
5546                 });
5547                 sayNO_SILENT;
5548             }
5549             continue; /* execute next BRANCH[J] op */
5550             assert(0); /* NOTREACHED */
5551
5552         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5553             minmod = 1;
5554             break;
5555
5556 #undef  ST
5557 #define ST st->u.curlym
5558
5559         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5560
5561             /* This is an optimisation of CURLYX that enables us to push
5562              * only a single backtracking state, no matter how many matches
5563              * there are in {m,n}. It relies on the pattern being constant
5564              * length, with no parens to influence future backrefs
5565              */
5566
5567             ST.me = scan;
5568             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5569
5570             ST.lastparen      = rex->lastparen;
5571             ST.lastcloseparen = rex->lastcloseparen;
5572
5573             /* if paren positive, emulate an OPEN/CLOSE around A */
5574             if (ST.me->flags) {
5575                 U32 paren = ST.me->flags;
5576                 if (paren > PL_regsize)
5577                     PL_regsize = paren;
5578                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5579             }
5580             ST.A = scan;
5581             ST.B = next;
5582             ST.alen = 0;
5583             ST.count = 0;
5584             ST.minmod = minmod;
5585             minmod = 0;
5586             ST.c1 = CHRTEST_UNINIT;
5587             REGCP_SET(ST.cp);
5588
5589             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5590                 goto curlym_do_B;
5591
5592           curlym_do_A: /* execute the A in /A{m,n}B/  */
5593             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5594             assert(0); /* NOTREACHED */
5595
5596         case CURLYM_A: /* we've just matched an A */
5597             ST.count++;
5598             /* after first match, determine A's length: u.curlym.alen */
5599             if (ST.count == 1) {
5600                 if (PL_reg_match_utf8) {
5601                     char *s = st->locinput;
5602                     while (s < locinput) {
5603                         ST.alen++;
5604                         s += UTF8SKIP(s);
5605                     }
5606                 }
5607                 else {
5608                     ST.alen = locinput - st->locinput;
5609                 }
5610                 if (ST.alen == 0)
5611                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5612             }
5613             DEBUG_EXECUTE_r(
5614                 PerlIO_printf(Perl_debug_log,
5615                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5616                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5617                           (IV) ST.count, (IV)ST.alen)
5618             );
5619
5620             if (cur_eval && cur_eval->u.eval.close_paren &&
5621                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5622                 goto fake_end;
5623
5624             {
5625                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5626                 if ( max == REG_INFTY || ST.count < max )
5627                     goto curlym_do_A; /* try to match another A */
5628             }
5629             goto curlym_do_B; /* try to match B */
5630
5631         case CURLYM_A_fail: /* just failed to match an A */
5632             REGCP_UNWIND(ST.cp);
5633
5634             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5635                 || (cur_eval && cur_eval->u.eval.close_paren &&
5636                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5637                 sayNO;
5638
5639           curlym_do_B: /* execute the B in /A{m,n}B/  */
5640             if (ST.c1 == CHRTEST_UNINIT) {
5641                 /* calculate c1 and c2 for possible match of 1st char
5642                  * following curly */
5643                 ST.c1 = ST.c2 = CHRTEST_VOID;
5644                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5645                     regnode *text_node = ST.B;
5646                     if (! HAS_TEXT(text_node))
5647                         FIND_NEXT_IMPT(text_node);
5648                     /* this used to be
5649
5650                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5651
5652                         But the former is redundant in light of the latter.
5653
5654                         if this changes back then the macro for
5655                         IS_TEXT and friends need to change.
5656                      */
5657                     if (PL_regkind[OP(text_node)] == EXACT) {
5658                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5659                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5660                         {
5661                             sayNO;
5662                         }
5663                     }
5664                 }
5665             }
5666
5667             DEBUG_EXECUTE_r(
5668                 PerlIO_printf(Perl_debug_log,
5669                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5670                     (int)(REPORT_CODE_OFF+(depth*2)),
5671                     "", (IV)ST.count)
5672                 );
5673             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5674                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5675                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5676                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5677                     {
5678                         /* simulate B failing */
5679                         DEBUG_OPTIMISE_r(
5680                             PerlIO_printf(Perl_debug_log,
5681                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5682                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5683                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5684                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5685                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5686                         );
5687                         state_num = CURLYM_B_fail;
5688                         goto reenter_switch;
5689                     }
5690                 }
5691                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5692                     /* simulate B failing */
5693                     DEBUG_OPTIMISE_r(
5694                         PerlIO_printf(Perl_debug_log,
5695                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5696                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5697                             (int) nextchr, ST.c1, ST.c2)
5698                     );
5699                     state_num = CURLYM_B_fail;
5700                     goto reenter_switch;
5701                 }
5702             }
5703
5704             if (ST.me->flags) {
5705                 /* emulate CLOSE: mark current A as captured */
5706                 I32 paren = ST.me->flags;
5707                 if (ST.count) {
5708                     rex->offs[paren].start
5709                         = HOPc(locinput, -ST.alen) - PL_bostr;
5710                     rex->offs[paren].end = locinput - PL_bostr;
5711                     if ((U32)paren > rex->lastparen)
5712                         rex->lastparen = paren;
5713                     rex->lastcloseparen = paren;
5714                 }
5715                 else
5716                     rex->offs[paren].end = -1;
5717                 if (cur_eval && cur_eval->u.eval.close_paren &&
5718                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5719                 {
5720                     if (ST.count)
5721                         goto fake_end;
5722                     else
5723                         sayNO;
5724                 }
5725             }
5726
5727             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5728             assert(0); /* NOTREACHED */
5729
5730         case CURLYM_B_fail: /* just failed to match a B */
5731             REGCP_UNWIND(ST.cp);
5732             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5733             if (ST.minmod) {
5734                 I32 max = ARG2(ST.me);
5735                 if (max != REG_INFTY && ST.count == max)
5736                     sayNO;
5737                 goto curlym_do_A; /* try to match a further A */
5738             }
5739             /* backtrack one A */
5740             if (ST.count == ARG1(ST.me) /* min */)
5741                 sayNO;
5742             ST.count--;
5743             SET_locinput(HOPc(locinput, -ST.alen));
5744             goto curlym_do_B; /* try to match B */
5745
5746 #undef ST
5747 #define ST st->u.curly
5748
5749 #define CURLY_SETPAREN(paren, success) \
5750     if (paren) { \
5751         if (success) { \
5752             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5753             rex->offs[paren].end = locinput - PL_bostr; \
5754             if (paren > rex->lastparen) \
5755                 rex->lastparen = paren; \
5756             rex->lastcloseparen = paren; \
5757         } \
5758         else { \
5759             rex->offs[paren].end = -1; \
5760             rex->lastparen      = ST.lastparen; \
5761             rex->lastcloseparen = ST.lastcloseparen; \
5762         } \
5763     }
5764
5765         case STAR:              /*  /A*B/ where A is width 1 char */
5766             ST.paren = 0;
5767             ST.min = 0;
5768             ST.max = REG_INFTY;
5769             scan = NEXTOPER(scan);
5770             goto repeat;
5771
5772         case PLUS:              /*  /A+B/ where A is width 1 char */
5773             ST.paren = 0;
5774             ST.min = 1;
5775             ST.max = REG_INFTY;
5776             scan = NEXTOPER(scan);
5777             goto repeat;
5778
5779         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5780             ST.paren = scan->flags;     /* Which paren to set */
5781             ST.lastparen      = rex->lastparen;
5782             ST.lastcloseparen = rex->lastcloseparen;
5783             if (ST.paren > PL_regsize)
5784                 PL_regsize = ST.paren;
5785             ST.min = ARG1(scan);  /* min to match */
5786             ST.max = ARG2(scan);  /* max to match */
5787             if (cur_eval && cur_eval->u.eval.close_paren &&
5788                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5789                 ST.min=1;
5790                 ST.max=1;
5791             }
5792             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5793             goto repeat;
5794
5795         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5796             ST.paren = 0;
5797             ST.min = ARG1(scan);  /* min to match */
5798             ST.max = ARG2(scan);  /* max to match */
5799             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5800           repeat:
5801             /*
5802             * Lookahead to avoid useless match attempts
5803             * when we know what character comes next.
5804             *
5805             * Used to only do .*x and .*?x, but now it allows
5806             * for )'s, ('s and (?{ ... })'s to be in the way
5807             * of the quantifier and the EXACT-like node.  -- japhy
5808             */
5809
5810             assert(ST.min <= ST.max);
5811             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5812                 ST.c1 = ST.c2 = CHRTEST_VOID;
5813             }
5814             else {
5815                 regnode *text_node = next;
5816
5817                 if (! HAS_TEXT(text_node))
5818                     FIND_NEXT_IMPT(text_node);
5819
5820                 if (! HAS_TEXT(text_node))
5821                     ST.c1 = ST.c2 = CHRTEST_VOID;
5822                 else {
5823                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5824                         ST.c1 = ST.c2 = CHRTEST_VOID;
5825                     }
5826                     else {
5827
5828                     /*  Currently we only get here when
5829
5830                         PL_rekind[OP(text_node)] == EXACT
5831
5832                         if this changes back then the macro for IS_TEXT and
5833                         friends need to change. */
5834                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5835                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5836                         {
5837                             sayNO;
5838                         }
5839                     }
5840                 }
5841             }
5842
5843             ST.A = scan;
5844             ST.B = next;
5845             if (minmod) {
5846                 char *li = locinput;
5847                 minmod = 0;
5848                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5849                     sayNO;
5850                 SET_locinput(li);
5851                 ST.count = ST.min;
5852                 REGCP_SET(ST.cp);
5853                 if (ST.c1 == CHRTEST_VOID)
5854                     goto curly_try_B_min;
5855
5856                 ST.oldloc = locinput;
5857
5858                 /* set ST.maxpos to the furthest point along the
5859                  * string that could possibly match */
5860                 if  (ST.max == REG_INFTY) {
5861                     ST.maxpos = PL_regeol - 1;
5862                     if (utf8_target)
5863                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5864                             ST.maxpos--;
5865                 }
5866                 else if (utf8_target) {
5867                     int m = ST.max - ST.min;
5868                     for (ST.maxpos = locinput;
5869                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5870                         ST.maxpos += UTF8SKIP(ST.maxpos);
5871                 }
5872                 else {
5873                     ST.maxpos = locinput + ST.max - ST.min;
5874                     if (ST.maxpos >= PL_regeol)
5875                         ST.maxpos = PL_regeol - 1;
5876                 }
5877                 goto curly_try_B_min_known;
5878
5879             }
5880             else {
5881                 /* avoid taking address of locinput, so it can remain
5882                  * a register var */
5883                 char *li = locinput;
5884                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5885                 if (ST.count < ST.min)
5886                     sayNO;
5887                 SET_locinput(li);
5888                 if ((ST.count > ST.min)
5889                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5890                 {
5891                     /* A{m,n} must come at the end of the string, there's
5892                      * no point in backing off ... */
5893                     ST.min = ST.count;
5894                     /* ...except that $ and \Z can match before *and* after
5895                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5896                        We may back off by one in this case. */
5897                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5898                         ST.min--;
5899                 }
5900                 REGCP_SET(ST.cp);
5901                 goto curly_try_B_max;
5902             }
5903             assert(0); /* NOTREACHED */
5904
5905
5906         case CURLY_B_min_known_fail:
5907             /* failed to find B in a non-greedy match where c1,c2 valid */
5908
5909             REGCP_UNWIND(ST.cp);
5910             if (ST.paren) {
5911                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5912             }
5913             /* Couldn't or didn't -- move forward. */
5914             ST.oldloc = locinput;
5915             if (utf8_target)
5916                 locinput += UTF8SKIP(locinput);
5917             else
5918                 locinput++;
5919             ST.count++;
5920           curly_try_B_min_known:
5921              /* find the next place where 'B' could work, then call B */
5922             {
5923                 int n;
5924                 if (utf8_target) {
5925                     n = (ST.oldloc == locinput) ? 0 : 1;
5926                     if (ST.c1 == ST.c2) {
5927                         /* set n to utf8_distance(oldloc, locinput) */
5928                         while (locinput <= ST.maxpos
5929                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5930                         {
5931                             locinput += UTF8SKIP(locinput);
5932                             n++;
5933                         }
5934                     }
5935                     else {
5936                         /* set n to utf8_distance(oldloc, locinput) */
5937                         while (locinput <= ST.maxpos
5938                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5939                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5940                         {
5941                             locinput += UTF8SKIP(locinput);
5942                             n++;
5943                         }
5944                     }
5945                 }
5946                 else {  /* Not utf8_target */
5947                     if (ST.c1 == ST.c2) {
5948                         while (locinput <= ST.maxpos &&
5949                                UCHARAT(locinput) != ST.c1)
5950                             locinput++;
5951                     }
5952                     else {
5953                         while (locinput <= ST.maxpos
5954                                && UCHARAT(locinput) != ST.c1
5955                                && UCHARAT(locinput) != ST.c2)
5956                             locinput++;
5957                     }
5958                     n = locinput - ST.oldloc;
5959                 }
5960                 if (locinput > ST.maxpos)
5961                     sayNO;
5962                 if (n) {
5963                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5964                      * at b; check that everything between oldloc and
5965                      * locinput matches */
5966                     char *li = ST.oldloc;
5967                     ST.count += n;
5968                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5969                         sayNO;
5970                     assert(n == REG_INFTY || locinput == li);
5971                 }
5972                 CURLY_SETPAREN(ST.paren, ST.count);
5973                 if (cur_eval && cur_eval->u.eval.close_paren &&
5974                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5975                     goto fake_end;
5976                 }
5977                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5978             }
5979             assert(0); /* NOTREACHED */
5980
5981
5982         case CURLY_B_min_fail:
5983             /* failed to find B in a non-greedy match where c1,c2 invalid */
5984
5985             REGCP_UNWIND(ST.cp);
5986             if (ST.paren) {
5987                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5988             }
5989             /* failed -- move forward one */
5990             {
5991                 char *li = locinput;
5992                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5993                     sayNO;
5994                 }
5995                 locinput = li;
5996             }
5997             {
5998                 ST.count++;
5999                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6000                         ST.count > 0)) /* count overflow ? */
6001                 {
6002                   curly_try_B_min:
6003                     CURLY_SETPAREN(ST.paren, ST.count);
6004                     if (cur_eval && cur_eval->u.eval.close_paren &&
6005                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6006                         goto fake_end;
6007                     }
6008                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6009                 }
6010             }
6011             sayNO;
6012             assert(0); /* NOTREACHED */
6013
6014
6015         curly_try_B_max:
6016             /* a successful greedy match: now try to match B */
6017             if (cur_eval && cur_eval->u.eval.close_paren &&
6018                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6019                 goto fake_end;
6020             }
6021             {
6022                 bool could_match = locinput < PL_regeol;
6023
6024                 /* If it could work, try it. */
6025                 if (ST.c1 != CHRTEST_VOID && could_match) {
6026                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6027                     {
6028                         could_match = memEQ(locinput,
6029                                             ST.c1_utf8,
6030                                             UTF8SKIP(locinput))
6031                                     || memEQ(locinput,
6032                                              ST.c2_utf8,
6033                                              UTF8SKIP(locinput));
6034                     }
6035                     else {
6036                         could_match = UCHARAT(locinput) == ST.c1
6037                                       || UCHARAT(locinput) == ST.c2;
6038                     }
6039                 }
6040                 if (ST.c1 == CHRTEST_VOID || could_match) {
6041                     CURLY_SETPAREN(ST.paren, ST.count);
6042                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6043                     assert(0); /* NOTREACHED */
6044                 }
6045             }
6046             /* FALL THROUGH */
6047
6048         case CURLY_B_max_fail:
6049             /* failed to find B in a greedy match */
6050
6051             REGCP_UNWIND(ST.cp);
6052             if (ST.paren) {
6053                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6054             }
6055             /*  back up. */
6056             if (--ST.count < ST.min)
6057                 sayNO;
6058             locinput = HOPc(locinput, -1);
6059             goto curly_try_B_max;
6060
6061 #undef ST
6062
6063         case END: /*  last op of main pattern  */
6064             fake_end:
6065             if (cur_eval) {
6066                 /* we've just finished A in /(??{A})B/; now continue with B */
6067                 st->u.eval.toggle_reg_flags
6068                             = cur_eval->u.eval.toggle_reg_flags;
6069                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6070
6071                 st->u.eval.prev_rex = rex_sv;           /* inner */
6072                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6073                 rex_sv = cur_eval->u.eval.prev_rex;
6074                 SET_reg_curpm(rex_sv);
6075                 rex = ReANY(rex_sv);
6076                 rexi = RXi_GET(rex);
6077                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6078
6079                 REGCP_SET(st->u.eval.lastcp);
6080
6081                 /* Restore parens of the outer rex without popping the
6082                  * savestack */
6083                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6084
6085                 st->u.eval.prev_eval = cur_eval;
6086                 cur_eval = cur_eval->u.eval.prev_eval;
6087                 DEBUG_EXECUTE_r(
6088                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6089                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6090                 if ( nochange_depth )
6091                     nochange_depth--;
6092
6093                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6094                                     locinput); /* match B */
6095             }
6096
6097             if (locinput < reginfo->till) {
6098                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6099                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6100                                       PL_colors[4],
6101                                       (long)(locinput - PL_reg_starttry),
6102                                       (long)(reginfo->till - PL_reg_starttry),
6103                                       PL_colors[5]));
6104
6105                 sayNO_SILENT;           /* Cannot match: too short. */
6106             }
6107             sayYES;                     /* Success! */
6108
6109         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6110             DEBUG_EXECUTE_r(
6111             PerlIO_printf(Perl_debug_log,
6112                 "%*s  %ssubpattern success...%s\n",
6113                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6114             sayYES;                     /* Success! */
6115
6116 #undef  ST
6117 #define ST st->u.ifmatch
6118
6119         {
6120             char *newstart;
6121
6122         case SUSPEND:   /* (?>A) */
6123             ST.wanted = 1;
6124             newstart = locinput;
6125             goto do_ifmatch;
6126
6127         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6128             ST.wanted = 0;
6129             goto ifmatch_trivial_fail_test;
6130
6131         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6132             ST.wanted = 1;
6133           ifmatch_trivial_fail_test:
6134             if (scan->flags) {
6135                 char * const s = HOPBACKc(locinput, scan->flags);
6136                 if (!s) {
6137                     /* trivial fail */
6138                     if (logical) {
6139                         logical = 0;
6140                         sw = 1 - cBOOL(ST.wanted);
6141                     }
6142                     else if (ST.wanted)
6143                         sayNO;
6144                     next = scan + ARG(scan);
6145                     if (next == scan)
6146                         next = NULL;
6147                     break;
6148                 }
6149                 newstart = s;
6150             }
6151             else
6152                 newstart = locinput;
6153
6154           do_ifmatch:
6155             ST.me = scan;
6156             ST.logical = logical;
6157             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6158
6159             /* execute body of (?...A) */
6160             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6161             assert(0); /* NOTREACHED */
6162         }
6163
6164         case IFMATCH_A_fail: /* body of (?...A) failed */
6165             ST.wanted = !ST.wanted;
6166             /* FALL THROUGH */
6167
6168         case IFMATCH_A: /* body of (?...A) succeeded */
6169             if (ST.logical) {
6170                 sw = cBOOL(ST.wanted);
6171             }
6172             else if (!ST.wanted)
6173                 sayNO;
6174
6175             if (OP(ST.me) != SUSPEND) {
6176                 /* restore old position except for (?>...) */
6177                 locinput = st->locinput;
6178             }
6179             scan = ST.me + ARG(ST.me);
6180             if (scan == ST.me)
6181                 scan = NULL;
6182             continue; /* execute B */
6183
6184 #undef ST
6185
6186         case LONGJMP: /*  alternative with many branches compiles to
6187                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6188             next = scan + ARG(scan);
6189             if (next == scan)
6190                 next = NULL;
6191             break;
6192
6193         case COMMIT:  /*  (*COMMIT)  */
6194             reginfo->cutpoint = PL_regeol;
6195             /* FALLTHROUGH */
6196
6197         case PRUNE:   /*  (*PRUNE)   */
6198             if (!scan->flags)
6199                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6200             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6201             assert(0); /* NOTREACHED */
6202
6203         case COMMIT_next_fail:
6204             no_final = 1;
6205             /* FALLTHROUGH */
6206
6207         case OPFAIL:   /* (*FAIL)  */
6208             sayNO;
6209             assert(0); /* NOTREACHED */
6210
6211 #define ST st->u.mark
6212         case MARKPOINT: /*  (*MARK:foo)  */
6213             ST.prev_mark = mark_state;
6214             ST.mark_name = sv_commit = sv_yes_mark
6215                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6216             mark_state = st;
6217             ST.mark_loc = locinput;
6218             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6219             assert(0); /* NOTREACHED */
6220
6221         case MARKPOINT_next:
6222             mark_state = ST.prev_mark;
6223             sayYES;
6224             assert(0); /* NOTREACHED */
6225
6226         case MARKPOINT_next_fail:
6227             if (popmark && sv_eq(ST.mark_name,popmark))
6228             {
6229                 if (ST.mark_loc > startpoint)
6230                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6231                 popmark = NULL; /* we found our mark */
6232                 sv_commit = ST.mark_name;
6233
6234                 DEBUG_EXECUTE_r({
6235                         PerlIO_printf(Perl_debug_log,
6236                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6237                             REPORT_CODE_OFF+depth*2, "",
6238                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6239                 });
6240             }
6241             mark_state = ST.prev_mark;
6242             sv_yes_mark = mark_state ?
6243                 mark_state->u.mark.mark_name : NULL;
6244             sayNO;
6245             assert(0); /* NOTREACHED */
6246
6247         case SKIP:  /*  (*SKIP)  */
6248             if (scan->flags) {
6249                 /* (*SKIP) : if we fail we cut here*/
6250                 ST.mark_name = NULL;
6251                 ST.mark_loc = locinput;
6252                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6253             } else {
6254                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6255                    otherwise do nothing.  Meaning we need to scan
6256                  */
6257                 regmatch_state *cur = mark_state;
6258                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6259
6260                 while (cur) {
6261                     if ( sv_eq( cur->u.mark.mark_name,
6262                                 find ) )
6263                     {
6264                         ST.mark_name = find;
6265                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6266                     }
6267                     cur = cur->u.mark.prev_mark;
6268                 }
6269             }
6270             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6271             break;
6272
6273         case SKIP_next_fail:
6274             if (ST.mark_name) {
6275                 /* (*CUT:NAME) - Set up to search for the name as we
6276                    collapse the stack*/
6277                 popmark = ST.mark_name;
6278             } else {
6279                 /* (*CUT) - No name, we cut here.*/
6280                 if (ST.mark_loc > startpoint)
6281                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6282                 /* but we set sv_commit to latest mark_name if there
6283                    is one so they can test to see how things lead to this
6284                    cut */
6285                 if (mark_state)
6286                     sv_commit=mark_state->u.mark.mark_name;
6287             }
6288             no_final = 1;
6289             sayNO;
6290             assert(0); /* NOTREACHED */
6291 #undef ST
6292
6293         case LNBREAK: /* \R */
6294             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6295                 locinput += n;
6296             } else
6297                 sayNO;
6298             break;
6299
6300 #define CASE_CLASS(nAmE)                              \
6301         case nAmE:                                    \
6302             if (NEXTCHR_IS_EOS)                       \
6303                 sayNO;                                \
6304             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6305                 locinput += n;                        \
6306             } else                                    \
6307                 sayNO;                                \
6308             break;                                    \
6309         case N##nAmE:                                 \
6310             if (NEXTCHR_IS_EOS)                       \
6311                 sayNO;                                \
6312             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6313                 sayNO;                                \
6314             } else {                                  \
6315                 locinput += UTF8SKIP(locinput);       \
6316             }                                         \
6317             break
6318
6319         CASE_CLASS(VERTWS);  /*  \v \V  */
6320         CASE_CLASS(HORIZWS); /*  \h \H  */
6321 #undef CASE_CLASS
6322
6323         default:
6324             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6325                           PTR2UV(scan), OP(scan));
6326             Perl_croak(aTHX_ "regexp memory corruption");
6327
6328         /* this is a point to jump to in order to increment
6329          * locinput by one character */
6330         increment_locinput:
6331             assert(!NEXTCHR_IS_EOS);
6332             if (utf8_target) {
6333                 locinput += PL_utf8skip[nextchr];
6334                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6335                 if (locinput > PL_regeol)
6336                     sayNO;
6337             }
6338             else
6339                 locinput++;
6340             break;
6341
6342         } /* end switch */
6343
6344         /* switch break jumps here */
6345         scan = next; /* prepare to execute the next op and ... */
6346         continue;    /* ... jump back to the top, reusing st */
6347         assert(0); /* NOTREACHED */
6348
6349       push_yes_state:
6350         /* push a state that backtracks on success */
6351         st->u.yes.prev_yes_state = yes_state;
6352         yes_state = st;
6353         /* FALL THROUGH */
6354       push_state:
6355         /* push a new regex state, then continue at scan  */
6356         {
6357             regmatch_state *newst;
6358
6359             DEBUG_STACK_r({
6360                 regmatch_state *cur = st;
6361                 regmatch_state *curyes = yes_state;
6362                 int curd = depth;
6363                 regmatch_slab *slab = PL_regmatch_slab;
6364                 for (;curd > -1;cur--,curd--) {
6365                     if (cur < SLAB_FIRST(slab)) {
6366                         slab = slab->prev;
6367                         cur = SLAB_LAST(slab);
6368                     }
6369                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6370                         REPORT_CODE_OFF + 2 + depth * 2,"",
6371                         curd, PL_reg_name[cur->resume_state],
6372                         (curyes == cur) ? "yes" : ""
6373                     );
6374                     if (curyes == cur)
6375                         curyes = cur->u.yes.prev_yes_state;
6376                 }
6377             } else
6378                 DEBUG_STATE_pp("push")
6379             );
6380             depth++;
6381             st->locinput = locinput;
6382             newst = st+1;
6383             if (newst >  SLAB_LAST(PL_regmatch_slab))
6384                 newst = S_push_slab(aTHX);
6385             PL_regmatch_state = newst;
6386
6387             locinput = pushinput;
6388             st = newst;
6389             continue;
6390             assert(0); /* NOTREACHED */
6391         }
6392     }
6393
6394     /*
6395     * We get here only if there's trouble -- normally "case END" is
6396     * the terminating point.
6397     */
6398     Perl_croak(aTHX_ "corrupted regexp pointers");
6399     /*NOTREACHED*/
6400     sayNO;
6401
6402 yes:
6403     if (yes_state) {
6404         /* we have successfully completed a subexpression, but we must now
6405          * pop to the state marked by yes_state and continue from there */
6406         assert(st != yes_state);
6407 #ifdef DEBUGGING
6408         while (st != yes_state) {
6409             st--;
6410             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6411                 PL_regmatch_slab = PL_regmatch_slab->prev;
6412                 st = SLAB_LAST(PL_regmatch_slab);
6413             }
6414             DEBUG_STATE_r({
6415                 if (no_final) {
6416                     DEBUG_STATE_pp("pop (no final)");
6417                 } else {
6418                     DEBUG_STATE_pp("pop (yes)");
6419                 }
6420             });
6421             depth--;
6422         }
6423 #else
6424         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6425             || yes_state > SLAB_LAST(PL_regmatch_slab))
6426         {
6427             /* not in this slab, pop slab */
6428             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6429             PL_regmatch_slab = PL_regmatch_slab->prev;
6430             st = SLAB_LAST(PL_regmatch_slab);
6431         }
6432         depth -= (st - yes_state);
6433 #endif
6434         st = yes_state;
6435         yes_state = st->u.yes.prev_yes_state;
6436         PL_regmatch_state = st;
6437
6438         if (no_final)
6439             locinput= st->locinput;
6440         state_num = st->resume_state + no_final;
6441         goto reenter_switch;
6442     }
6443
6444     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6445                           PL_colors[4], PL_colors[5]));
6446
6447     if (PL_reg_state.re_state_eval_setup_done) {
6448         /* each successfully executed (?{...}) block does the equivalent of
6449          *   local $^R = do {...}
6450          * When popping the save stack, all these locals would be undone;
6451          * bypass this by setting the outermost saved $^R to the latest
6452          * value */
6453         if (oreplsv != GvSV(PL_replgv))
6454             sv_setsv(oreplsv, GvSV(PL_replgv));
6455     }
6456     result = 1;
6457     goto final_exit;
6458
6459 no:
6460     DEBUG_EXECUTE_r(
6461         PerlIO_printf(Perl_debug_log,
6462             "%*s  %sfailed...%s\n",
6463             REPORT_CODE_OFF+depth*2, "",
6464             PL_colors[4], PL_colors[5])
6465         );
6466
6467 no_silent:
6468     if (no_final) {
6469         if (yes_state) {
6470             goto yes;
6471         } else {
6472             goto final_exit;
6473         }
6474     }
6475     if (depth) {
6476         /* there's a previous state to backtrack to */
6477         st--;
6478         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6479             PL_regmatch_slab = PL_regmatch_slab->prev;
6480             st = SLAB_LAST(PL_regmatch_slab);
6481         }
6482         PL_regmatch_state = st;
6483         locinput= st->locinput;
6484
6485         DEBUG_STATE_pp("pop");
6486         depth--;
6487         if (yes_state == st)
6488             yes_state = st->u.yes.prev_yes_state;
6489
6490         state_num = st->resume_state + 1; /* failure = success + 1 */
6491         goto reenter_switch;
6492     }
6493     result = 0;
6494
6495   final_exit:
6496     if (rex->intflags & PREGf_VERBARG_SEEN) {
6497         SV *sv_err = get_sv("REGERROR", 1);
6498         SV *sv_mrk = get_sv("REGMARK", 1);
6499         if (result) {
6500             sv_commit = &PL_sv_no;
6501             if (!sv_yes_mark)
6502                 sv_yes_mark = &PL_sv_yes;
6503         } else {
6504             if (!sv_commit)
6505                 sv_commit = &PL_sv_yes;
6506             sv_yes_mark = &PL_sv_no;
6507         }
6508         sv_setsv(sv_err, sv_commit);
6509         sv_setsv(sv_mrk, sv_yes_mark);
6510     }
6511
6512
6513     if (last_pushed_cv) {
6514         dSP;
6515         POP_MULTICALL;
6516         PERL_UNUSED_VAR(SP);
6517     }
6518
6519     /* clean up; in particular, free all slabs above current one */
6520     LEAVE_SCOPE(oldsave);
6521
6522     assert(!result ||  locinput - PL_bostr >= 0);
6523     return result ?  locinput - PL_bostr : -1;
6524 }
6525
6526 /*
6527  - regrepeat - repeatedly match something simple, report how many
6528  *
6529  * What 'simple' means is a node which can be the operand of a quantifier like
6530  * '+', or {1,3}
6531  *
6532  * startposp - pointer a pointer to the start position.  This is updated
6533  *             to point to the byte following the highest successful
6534  *             match.
6535  * p         - the regnode to be repeatedly matched against.
6536  * max       - maximum number of things to match.
6537  * depth     - (for debugging) backtracking depth.
6538  */
6539 STATIC I32
6540 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6541 {
6542     dVAR;
6543     char *scan;     /* Pointer to current position in target string */
6544     I32 c;
6545     char *loceol = PL_regeol;   /* local version */
6546     I32 hardcount = 0;  /* How many matches so far */
6547     bool utf8_target = PL_reg_match_utf8;
6548     UV utf8_flags;
6549 #ifndef DEBUGGING
6550     PERL_UNUSED_ARG(depth);
6551 #endif
6552
6553     PERL_ARGS_ASSERT_REGREPEAT;
6554
6555     scan = *startposp;
6556     if (max == REG_INFTY)
6557         max = I32_MAX;
6558     else if (! utf8_target && scan + max < loceol)
6559         loceol = scan + max;
6560
6561     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6562      * to the maximum of how far we should go in it (leaving it set to the real
6563      * end, if the maximum permissible would take us beyond that).  This allows
6564      * us to make the loop exit condition that we haven't gone past <loceol> to
6565      * also mean that we haven't exceeded the max permissible count, saving a
6566      * test each time through the loop.  But it assumes that the OP matches a
6567      * single byte, which is true for most of the OPs below when applied to a
6568      * non-UTF-8 target.  Those relatively few OPs that don't have this
6569      * characteristic will have to compensate.
6570      *
6571      * There is no adjustment for UTF-8 targets, as the number of bytes per
6572      * character varies.  OPs will have to test both that the count is less
6573      * than the max permissible (using <hardcount> to keep track), and that we
6574      * are still within the bounds of the string (using <loceol>.  A few OPs
6575      * match a single byte no matter what the encoding.  They can omit the max
6576      * test if, for the UTF-8 case, they do the adjustment that was skipped
6577      * above.
6578      *
6579      * Thus, the code above sets things up for the common case; and exceptional
6580      * cases need extra work; the common case is to make sure <scan> doesn't
6581      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6582      * count doesn't exceed the maximum permissible */
6583
6584     switch (OP(p)) {
6585     case REG_ANY:
6586         if (utf8_target) {
6587             while (scan < loceol && hardcount < max && *scan != '\n') {
6588                 scan += UTF8SKIP(scan);
6589                 hardcount++;
6590             }
6591         } else {
6592             while (scan < loceol && *scan != '\n')
6593                 scan++;
6594         }
6595         break;
6596     case SANY:
6597         if (utf8_target) {
6598             while (scan < loceol && hardcount < max) {
6599                 scan += UTF8SKIP(scan);
6600                 hardcount++;
6601             }
6602         }
6603         else
6604             scan = loceol;
6605         break;
6606     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6607         if (utf8_target && scan + max < loceol) {
6608
6609             /* <loceol> hadn't been adjusted in the UTF-8 case */
6610             scan +=  max;
6611         }
6612         else {
6613             scan = loceol;
6614         }
6615         break;
6616     case EXACT:
6617         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6618
6619         c = (U8)*STRING(p);
6620
6621         /* Can use a simple loop if the pattern char to match on is invariant
6622          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6623          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6624          * true iff it doesn't matter if the argument is in UTF-8 or not */
6625         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6626             if (utf8_target && scan + max < loceol) {
6627                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6628                  * since here, to match at all, 1 char == 1 byte */
6629                 loceol = scan + max;
6630             }
6631             while (scan < loceol && UCHARAT(scan) == c) {
6632                 scan++;
6633             }
6634         }
6635         else if (UTF_PATTERN) {
6636             if (utf8_target) {
6637                 STRLEN scan_char_len;
6638
6639                 /* When both target and pattern are UTF-8, we have to do
6640                  * string EQ */
6641                 while (hardcount < max
6642                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6643                        && scan_char_len <= STR_LEN(p)
6644                        && memEQ(scan, STRING(p), scan_char_len))
6645                 {
6646                     scan += scan_char_len;
6647                     hardcount++;
6648                 }
6649             }
6650             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6651
6652                 /* Target isn't utf8; convert the character in the UTF-8
6653                  * pattern to non-UTF8, and do a simple loop */
6654                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6655                 while (scan < loceol && UCHARAT(scan) == c) {
6656                     scan++;
6657                 }
6658             } /* else pattern char is above Latin1, can't possibly match the
6659                  non-UTF-8 target */
6660         }
6661         else {
6662
6663             /* Here, the string must be utf8; pattern isn't, and <c> is
6664              * different in utf8 than not, so can't compare them directly.
6665              * Outside the loop, find the two utf8 bytes that represent c, and
6666              * then look for those in sequence in the utf8 string */
6667             U8 high = UTF8_TWO_BYTE_HI(c);
6668             U8 low = UTF8_TWO_BYTE_LO(c);
6669
6670             while (hardcount < max
6671                     && scan + 1 < loceol
6672                     && UCHARAT(scan) == high
6673                     && UCHARAT(scan + 1) == low)
6674             {
6675                 scan += 2;
6676                 hardcount++;
6677             }
6678         }
6679         break;
6680
6681     case EXACTFA:
6682         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6683         goto do_exactf;
6684
6685     case EXACTFL:
6686         PL_reg_flags |= RF_tainted;
6687         utf8_flags = FOLDEQ_UTF8_LOCALE;
6688         goto do_exactf;
6689
6690     case EXACTF:
6691             utf8_flags = 0;
6692             goto do_exactf;
6693
6694     case EXACTFU_SS:
6695     case EXACTFU_TRICKYFOLD:
6696     case EXACTFU:
6697         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6698
6699     do_exactf: {
6700         int c1, c2;
6701         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6702
6703         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6704
6705         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6706             if (c1 == CHRTEST_VOID) {
6707                 /* Use full Unicode fold matching */
6708                 char *tmpeol = PL_regeol;
6709                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6710                 while (hardcount < max
6711                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6712                                              STRING(p), NULL, pat_len,
6713                                              cBOOL(UTF_PATTERN), utf8_flags))
6714                 {
6715                     scan = tmpeol;
6716                     tmpeol = PL_regeol;
6717                     hardcount++;
6718                 }
6719             }
6720             else if (utf8_target) {
6721                 if (c1 == c2) {
6722                     while (scan < loceol
6723                            && hardcount < max
6724                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6725                     {
6726                         scan += UTF8SKIP(scan);
6727                         hardcount++;
6728                     }
6729                 }
6730                 else {
6731                     while (scan < loceol
6732                            && hardcount < max
6733                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6734                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6735                     {
6736                         scan += UTF8SKIP(scan);
6737                         hardcount++;
6738                     }
6739                 }
6740             }
6741             else if (c1 == c2) {
6742                 while (scan < loceol && UCHARAT(scan) == c1) {
6743                     scan++;
6744                 }
6745             }
6746             else {
6747                 while (scan < loceol &&
6748                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6749                 {
6750                     scan++;
6751                 }
6752             }
6753         }
6754         break;
6755     }
6756     case ANYOF:
6757         if (utf8_target) {
6758             STRLEN inclasslen;
6759             while (hardcount < max
6760                    && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6761                    && reginclass(prog, p, (U8*)scan, utf8_target))
6762             {
6763                 scan += inclasslen;
6764                 hardcount++;
6765             }
6766         } else {
6767             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6768                 scan++;
6769         }
6770         break;
6771     case ALNUMU:
6772         if (utf8_target) {
6773     utf8_wordchar:
6774             LOAD_UTF8_CHARCLASS_ALNUM();
6775             while (hardcount < max && scan < loceol &&
6776                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6777             {
6778                 scan += UTF8SKIP(scan);
6779                 hardcount++;
6780             }
6781         } else {
6782             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6783                 scan++;
6784             }
6785         }
6786         break;
6787     case ALNUM:
6788         if (utf8_target)
6789             goto utf8_wordchar;
6790         while (scan < loceol && isALNUM((U8) *scan)) {
6791             scan++;
6792         }
6793         break;
6794     case ALNUMA:
6795         if (utf8_target && scan + max < loceol) {
6796
6797             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6798              * since here, to match, 1 char == 1 byte */
6799             loceol = scan + max;
6800         }
6801         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6802             scan++;
6803         }
6804         break;
6805     case ALNUML:
6806         PL_reg_flags |= RF_tainted;
6807         if (utf8_target) {
6808             while (hardcount < max && scan < loceol &&
6809                    isALNUM_LC_utf8((U8*)scan)) {
6810                 scan += UTF8SKIP(scan);
6811                 hardcount++;
6812             }
6813         } else {
6814             while (scan < loceol && isALNUM_LC(*scan))
6815                 scan++;
6816         }
6817         break;
6818     case NALNUMU:
6819         if (utf8_target) {
6820
6821     utf8_Nwordchar:
6822
6823             LOAD_UTF8_CHARCLASS_ALNUM();
6824             while (hardcount < max && scan < loceol &&
6825                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6826             {
6827                 scan += UTF8SKIP(scan);
6828                 hardcount++;
6829             }
6830         } else {
6831             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6832                 scan++;
6833             }
6834         }
6835         break;
6836     case NALNUM:
6837         if (utf8_target)
6838             goto utf8_Nwordchar;
6839         while (scan < loceol && ! isALNUM((U8) *scan)) {
6840             scan++;
6841         }
6842         break;
6843
6844     case POSIXA:
6845         if (utf8_target && scan + max < loceol) {
6846
6847             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6848              * since here, to match, 1 char == 1 byte */
6849             loceol = scan + max;
6850         }
6851         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6852             scan++;
6853         }
6854         break;
6855     case NPOSIXA:
6856         if (utf8_target) {
6857             while (scan < loceol && hardcount < max
6858                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6859             {
6860                 scan += UTF8SKIP(scan);
6861                 hardcount++;
6862             }
6863         }
6864         else {
6865             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6866                 scan++;
6867             }
6868         }
6869         break;
6870     case NALNUMA:
6871         if (utf8_target) {
6872             while (scan < loceol && hardcount < max
6873                    && ! isWORDCHAR_A((U8) *scan))
6874             {
6875                 scan += UTF8SKIP(scan);
6876                 hardcount++;
6877             }
6878         }
6879         else {
6880             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6881                 scan++;
6882             }
6883         }
6884         break;
6885     case NALNUML:
6886         PL_reg_flags |= RF_tainted;
6887         if (utf8_target) {
6888             while (hardcount < max && scan < loceol &&
6889                    !isALNUM_LC_utf8((U8*)scan)) {
6890                 scan += UTF8SKIP(scan);
6891                 hardcount++;
6892             }
6893         } else {
6894             while (scan < loceol && !isALNUM_LC(*scan))
6895                 scan++;
6896         }
6897         break;
6898     case SPACEU:
6899         if (utf8_target) {
6900
6901     utf8_space:
6902
6903             LOAD_UTF8_CHARCLASS_SPACE();
6904             while (hardcount < max && scan < loceol &&
6905                    (*scan == ' ' ||
6906                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6907             {
6908                 scan += UTF8SKIP(scan);
6909                 hardcount++;
6910             }
6911             break;
6912         }
6913         else {
6914             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6915                 scan++;
6916             }
6917             break;
6918         }
6919     case SPACE:
6920         if (utf8_target)
6921             goto utf8_space;
6922
6923         while (scan < loceol && isSPACE((U8) *scan)) {
6924             scan++;
6925         }
6926         break;
6927     case SPACEA:
6928         if (utf8_target && scan + max < loceol) {
6929
6930             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6931              * since here, to match, 1 char == 1 byte */
6932             loceol = scan + max;
6933         }
6934         while (scan < loceol && isSPACE_A((U8) *scan)) {
6935             scan++;
6936         }
6937         break;
6938     case SPACEL:
6939         PL_reg_flags |= RF_tainted;
6940         if (utf8_target) {
6941             while (hardcount < max && scan < loceol &&
6942                    isSPACE_LC_utf8((U8*)scan)) {
6943                 scan += UTF8SKIP(scan);
6944                 hardcount++;
6945             }
6946         } else {
6947             while (scan < loceol && isSPACE_LC(*scan))
6948                 scan++;
6949         }
6950         break;
6951     case NSPACEU:
6952         if (utf8_target) {
6953
6954     utf8_Nspace:
6955
6956             LOAD_UTF8_CHARCLASS_SPACE();
6957             while (hardcount < max && scan < loceol &&
6958                    ! (*scan == ' ' ||
6959                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6960             {
6961                 scan += UTF8SKIP(scan);
6962                 hardcount++;
6963             }
6964             break;
6965         }
6966         else {
6967             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6968                 scan++;
6969             }
6970         }
6971         break;
6972     case NSPACE:
6973         if (utf8_target)
6974             goto utf8_Nspace;
6975
6976         while (scan < loceol && ! isSPACE((U8) *scan)) {
6977             scan++;
6978         }
6979         break;
6980     case NSPACEA:
6981         if (utf8_target) {
6982             while (hardcount < max && scan < loceol
6983                    && ! isSPACE_A((U8) *scan))
6984             {
6985                 scan += UTF8SKIP(scan);
6986                 hardcount++;
6987             }
6988         }
6989         else {
6990             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6991                 scan++;
6992             }
6993         }
6994         break;
6995     case NSPACEL:
6996         PL_reg_flags |= RF_tainted;
6997         if (utf8_target) {
6998             while (hardcount < max && scan < loceol &&
6999                    !isSPACE_LC_utf8((U8*)scan)) {
7000                 scan += UTF8SKIP(scan);
7001                 hardcount++;
7002             }
7003         } else {
7004             while (scan < loceol && !isSPACE_LC(*scan))
7005                 scan++;
7006         }
7007         break;
7008     case DIGIT:
7009         if (utf8_target) {
7010             LOAD_UTF8_CHARCLASS_DIGIT();
7011             while (hardcount < max && scan < loceol &&
7012                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7013                 scan += UTF8SKIP(scan);
7014                 hardcount++;
7015             }
7016         } else {
7017             while (scan < loceol && isDIGIT(*scan))
7018                 scan++;
7019         }
7020         break;
7021     case DIGITA:
7022         if (utf8_target && scan + max < loceol) {
7023
7024             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7025              * since here, to match, 1 char == 1 byte */
7026             loceol = scan + max;
7027         }
7028         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7029             scan++;
7030         }
7031         break;
7032     case DIGITL:
7033         PL_reg_flags |= RF_tainted;
7034         if (utf8_target) {
7035             while (hardcount < max && scan < loceol &&
7036                    isDIGIT_LC_utf8((U8*)scan)) {
7037                 scan += UTF8SKIP(scan);
7038                 hardcount++;
7039             }
7040         } else {
7041             while (scan < loceol && isDIGIT_LC(*scan))
7042                 scan++;
7043         }
7044         break;
7045     case NDIGIT:
7046         if (utf8_target) {
7047             LOAD_UTF8_CHARCLASS_DIGIT();
7048             while (hardcount < max && scan < loceol &&
7049                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7050                 scan += UTF8SKIP(scan);
7051                 hardcount++;
7052             }
7053         } else {
7054             while (scan < loceol && !isDIGIT(*scan))
7055                 scan++;
7056         }
7057         break;
7058     case NDIGITA:
7059         if (utf8_target) {
7060             while (hardcount < max && scan < loceol
7061                    && ! isDIGIT_A((U8) *scan)) {
7062                 scan += UTF8SKIP(scan);
7063                 hardcount++;
7064             }
7065         }
7066         else {
7067             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7068                 scan++;
7069             }
7070         }
7071         break;
7072     case NDIGITL:
7073         PL_reg_flags |= RF_tainted;
7074         if (utf8_target) {
7075             while (hardcount < max && scan < loceol &&
7076                    !isDIGIT_LC_utf8((U8*)scan)) {
7077                 scan += UTF8SKIP(scan);
7078                 hardcount++;
7079             }
7080         } else {
7081             while (scan < loceol && !isDIGIT_LC(*scan))
7082                 scan++;
7083         }
7084         break;
7085     case LNBREAK:
7086         if (utf8_target) {
7087             while (hardcount < max && scan < loceol &&
7088                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7089                 scan += c;
7090                 hardcount++;
7091             }
7092         } else {
7093             /* LNBREAK can match one or two latin chars, which is ok, but we
7094              * have to use hardcount in this situation, and throw away the
7095              * adjustment to <loceol> done before the switch statement */
7096             loceol = PL_regeol;
7097             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7098                 scan+=c;
7099                 hardcount++;
7100             }
7101         }
7102         break;
7103     case HORIZWS:
7104         if (utf8_target) {
7105             while (hardcount < max && scan < loceol &&
7106                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7107             {
7108                 scan += c;
7109                 hardcount++;
7110             }
7111         } else {
7112             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7113                 scan++;
7114         }
7115         break;
7116     case NHORIZWS:
7117         if (utf8_target) {
7118             while (hardcount < max && scan < loceol &&
7119                         !is_HORIZWS_utf8_safe(scan, loceol))
7120             {
7121                 scan += UTF8SKIP(scan);
7122                 hardcount++;
7123             }
7124         } else {
7125             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7126                 scan++;
7127
7128         }
7129         break;
7130     case VERTWS:
7131         if (utf8_target) {
7132             while (hardcount < max && scan < loceol &&
7133                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7134             {
7135                 scan += c;
7136                 hardcount++;
7137             }
7138         } else {
7139             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7140                 scan++;
7141
7142         }
7143         break;
7144     case NVERTWS:
7145         if (utf8_target) {
7146             while (hardcount < max && scan < loceol &&
7147                                 !is_VERTWS_utf8_safe(scan, loceol))
7148             {
7149                 scan += UTF8SKIP(scan);
7150                 hardcount++;
7151             }
7152         } else {
7153             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7154                 scan++;
7155
7156         }
7157         break;
7158
7159     case BOUND:
7160     case BOUNDA:
7161     case BOUNDL:
7162     case BOUNDU:
7163     case EOS:
7164     case GPOS:
7165     case KEEPS:
7166     case NBOUND:
7167     case NBOUNDA:
7168     case NBOUNDL:
7169     case NBOUNDU:
7170     case OPFAIL:
7171     case SBOL:
7172     case SEOL:
7173         /* These are all 0 width, so match right here or not at all. */
7174         break;
7175
7176     default:
7177         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7178         assert(0); /* NOTREACHED */
7179
7180     }
7181
7182     if (hardcount)
7183         c = hardcount;
7184     else
7185         c = scan - *startposp;
7186     *startposp = scan;
7187
7188     DEBUG_r({
7189         GET_RE_DEBUG_FLAGS_DECL;
7190         DEBUG_EXECUTE_r({
7191             SV * const prop = sv_newmortal();
7192             regprop(prog, prop, p);
7193             PerlIO_printf(Perl_debug_log,
7194                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7195                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7196         });
7197     });
7198
7199     return(c);
7200 }
7201
7202
7203 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7204 /*
7205 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7206 create a copy so that changes the caller makes won't change the shared one.
7207 If <altsvp> is non-null, will return NULL in it, for back-compat.
7208  */
7209 SV *
7210 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7211 {
7212     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7213
7214     if (altsvp) {
7215         *altsvp = NULL;
7216     }
7217
7218     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7219 }
7220 #endif
7221
7222 STATIC SV *
7223 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7224 {
7225     /* Returns the swash for the input 'node' in the regex 'prog'.
7226      * If <doinit> is true, will attempt to create the swash if not already
7227      *    done.
7228      * If <listsvp> is non-null, will return the swash initialization string in
7229      *    it.
7230      * Tied intimately to how regcomp.c sets up the data structure */
7231
7232     dVAR;
7233     SV *sw  = NULL;
7234     SV *si  = NULL;
7235     SV*  invlist = NULL;
7236
7237     RXi_GET_DECL(prog,progi);
7238     const struct reg_data * const data = prog ? progi->data : NULL;
7239
7240     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7241
7242     assert(ANYOF_NONBITMAP(node));
7243
7244     if (data && data->count) {
7245         const U32 n = ARG(node);
7246
7247         if (data->what[n] == 's') {
7248             SV * const rv = MUTABLE_SV(data->data[n]);
7249             AV * const av = MUTABLE_AV(SvRV(rv));
7250             SV **const ary = AvARRAY(av);
7251             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7252
7253             si = *ary;  /* ary[0] = the string to initialize the swash with */
7254
7255             /* Elements 2 and 3 are either both present or both absent. [2] is
7256              * any inversion list generated at compile time; [3] indicates if
7257              * that inversion list has any user-defined properties in it. */
7258             if (av_len(av) >= 2) {
7259                 invlist = ary[2];
7260                 if (SvUV(ary[3])) {
7261                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7262                 }
7263             }
7264             else {
7265                 invlist = NULL;
7266             }
7267
7268             /* Element [1] is reserved for the set-up swash.  If already there,
7269              * return it; if not, create it and store it there */
7270             if (SvROK(ary[1])) {
7271                 sw = ary[1];
7272             }
7273             else if (si && doinit) {
7274
7275                 sw = _core_swash_init("utf8", /* the utf8 package */
7276                                       "", /* nameless */
7277                                       si,
7278                                       1, /* binary */
7279                                       0, /* not from tr/// */
7280                                       invlist,
7281                                       &swash_init_flags);
7282                 (void)av_store(av, 1, sw);
7283             }
7284         }
7285     }
7286
7287     if (listsvp) {
7288         SV* matches_string = newSVpvn("", 0);
7289
7290         /* Use the swash, if any, which has to have incorporated into it all
7291          * possibilities */
7292         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7293             && (si && si != &PL_sv_undef))
7294         {
7295
7296             /* If no swash, use the input initialization string, if available */
7297             sv_catsv(matches_string, si);
7298         }
7299
7300         /* Add the inversion list to whatever we have.  This may have come from
7301          * the swash, or from an input parameter */
7302         if (invlist) {
7303             sv_catsv(matches_string, _invlist_contents(invlist));
7304         }
7305         *listsvp = matches_string;
7306     }
7307
7308     return sw;
7309 }
7310
7311 /*
7312  - reginclass - determine if a character falls into a character class
7313
7314   n is the ANYOF regnode
7315   p is the target string
7316   utf8_target tells whether p is in UTF-8.
7317
7318   Returns true if matched; false otherwise.
7319
7320   Note that this can be a synthetic start class, a combination of various
7321   nodes, so things you think might be mutually exclusive, such as locale,
7322   aren't.  It can match both locale and non-locale
7323
7324  */
7325
7326 STATIC bool
7327 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
7328 {
7329     dVAR;
7330     const char flags = ANYOF_FLAGS(n);
7331     bool match = FALSE;
7332     UV c = *p;
7333
7334     PERL_ARGS_ASSERT_REGINCLASS;
7335
7336     /* If c is not already the code point, get it.  Note that
7337      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7338     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7339         STRLEN c_len = 0;
7340         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7341                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7342                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7343                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7344                  * UTF8_ALLOW_FFFF */
7345         if (c_len == (STRLEN)-1)
7346             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7347     }
7348
7349     /* If this character is potentially in the bitmap, check it */
7350     if (c < 256) {
7351         if (ANYOF_BITMAP_TEST(n, c))
7352             match = TRUE;
7353         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7354                 && ! utf8_target
7355                 && ! isASCII(c))
7356         {
7357             match = TRUE;
7358         }
7359         else if (flags & ANYOF_LOCALE) {
7360             PL_reg_flags |= RF_tainted;
7361
7362             if ((flags & ANYOF_LOC_FOLD)
7363                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7364             {
7365                 match = TRUE;
7366             }
7367             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7368                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7369                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7370                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7371                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7372                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7373                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7374                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7375                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7376                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7377                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7378                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7379                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7380                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7381                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7382                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7383                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7384                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7385                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7386                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7387                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7388                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7389                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7390                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7391                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7392                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7393                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7394                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7395                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7396                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7397                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7398                      ) /* How's that for a conditional? */
7399             ) {
7400                 match = TRUE;
7401             }
7402         }
7403     }
7404
7405     /* If the bitmap didn't (or couldn't) match, and something outside the
7406      * bitmap could match, try that.  Locale nodes specify completely the
7407      * behavior of code points in the bit map (otherwise, a utf8 target would
7408      * cause them to be treated as Unicode and not locale), except in
7409      * the very unlikely event when this node is a synthetic start class, which
7410      * could be a combination of locale and non-locale nodes.  So allow locale
7411      * to match for the synthetic start class, which will give a false
7412      * positive that will be resolved when the match is done again as not part
7413      * of the synthetic start class */
7414     if (!match) {
7415         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7416             match = TRUE;       /* Everything above 255 matches */
7417         }
7418         else if (ANYOF_NONBITMAP(n)
7419                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7420                      || (utf8_target
7421                          && (c >=256
7422                              || (! (flags & ANYOF_LOCALE))
7423                              || (flags & ANYOF_IS_SYNTHETIC)))))
7424         {
7425             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7426             if (sw) {
7427                 U8 * utf8_p;
7428                 if (utf8_target) {
7429                     utf8_p = (U8 *) p;
7430                 } else { /* Convert to utf8 */
7431                     STRLEN len = 1;
7432                     utf8_p = bytes_to_utf8(p, &len);
7433                 }
7434
7435                 if (swash_fetch(sw, utf8_p, TRUE)) {
7436                     match = TRUE;
7437                 }
7438
7439                 /* If we allocated a string above, free it */
7440                 if (! utf8_target) Safefree(utf8_p);
7441             }
7442         }
7443
7444         if (UNICODE_IS_SUPER(c)
7445             && (flags & ANYOF_WARN_SUPER)
7446             && ckWARN_d(WARN_NON_UNICODE))
7447         {
7448             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7449                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7450         }
7451     }
7452
7453     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7454     return cBOOL(flags & ANYOF_INVERT) ^ match;
7455 }
7456
7457 STATIC U8 *
7458 S_reghop3(U8 *s, I32 off, const U8* lim)
7459 {
7460     /* return the position 'off' UTF-8 characters away from 's', forward if
7461      * 'off' >= 0, backwards if negative.  But don't go outside of position
7462      * 'lim', which better be < s  if off < 0 */
7463
7464     dVAR;
7465
7466     PERL_ARGS_ASSERT_REGHOP3;
7467
7468     if (off >= 0) {
7469         while (off-- && s < lim) {
7470             /* XXX could check well-formedness here */
7471             s += UTF8SKIP(s);
7472         }
7473     }
7474     else {
7475         while (off++ && s > lim) {
7476             s--;
7477             if (UTF8_IS_CONTINUED(*s)) {
7478                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7479                     s--;
7480             }
7481             /* XXX could check well-formedness here */
7482         }
7483     }
7484     return s;
7485 }
7486
7487 #ifdef XXX_dmq
7488 /* there are a bunch of places where we use two reghop3's that should
7489    be replaced with this routine. but since thats not done yet
7490    we ifdef it out - dmq
7491 */
7492 STATIC U8 *
7493 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7494 {
7495     dVAR;
7496
7497     PERL_ARGS_ASSERT_REGHOP4;
7498
7499     if (off >= 0) {
7500         while (off-- && s < rlim) {
7501             /* XXX could check well-formedness here */
7502             s += UTF8SKIP(s);
7503         }
7504     }
7505     else {
7506         while (off++ && s > llim) {
7507             s--;
7508             if (UTF8_IS_CONTINUED(*s)) {
7509                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7510                     s--;
7511             }
7512             /* XXX could check well-formedness here */
7513         }
7514     }
7515     return s;
7516 }
7517 #endif
7518
7519 STATIC U8 *
7520 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7521 {
7522     dVAR;
7523
7524     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7525
7526     if (off >= 0) {
7527         while (off-- && s < lim) {
7528             /* XXX could check well-formedness here */
7529             s += UTF8SKIP(s);
7530         }
7531         if (off >= 0)
7532             return NULL;
7533     }
7534     else {
7535         while (off++ && s > lim) {
7536             s--;
7537             if (UTF8_IS_CONTINUED(*s)) {
7538                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7539                     s--;
7540             }
7541             /* XXX could check well-formedness here */
7542         }
7543         if (off <= 0)
7544             return NULL;
7545     }
7546     return s;
7547 }
7548
7549 static void
7550 restore_pos(pTHX_ void *arg)
7551 {
7552     dVAR;
7553     regexp * const rex = (regexp *)arg;
7554     if (PL_reg_state.re_state_eval_setup_done) {
7555         if (PL_reg_oldsaved) {
7556             rex->subbeg = PL_reg_oldsaved;
7557             rex->sublen = PL_reg_oldsavedlen;
7558             rex->suboffset = PL_reg_oldsavedoffset;
7559             rex->subcoffset = PL_reg_oldsavedcoffset;
7560 #ifdef PERL_OLD_COPY_ON_WRITE
7561             rex->saved_copy = PL_nrs;
7562 #endif
7563             RXp_MATCH_COPIED_on(rex);
7564         }
7565         PL_reg_magic->mg_len = PL_reg_oldpos;
7566         PL_reg_state.re_state_eval_setup_done = FALSE;
7567         PL_curpm = PL_reg_oldcurpm;
7568     }
7569 }
7570
7571 STATIC void
7572 S_to_utf8_substr(pTHX_ register regexp *prog)
7573 {
7574     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7575      * on the converted value */
7576
7577     int i = 1;
7578
7579     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7580
7581     do {
7582         if (prog->substrs->data[i].substr
7583             && !prog->substrs->data[i].utf8_substr) {
7584             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7585             prog->substrs->data[i].utf8_substr = sv;
7586             sv_utf8_upgrade(sv);
7587             if (SvVALID(prog->substrs->data[i].substr)) {
7588                 if (SvTAIL(prog->substrs->data[i].substr)) {
7589                     /* Trim the trailing \n that fbm_compile added last
7590                        time.  */
7591                     SvCUR_set(sv, SvCUR(sv) - 1);
7592                     /* Whilst this makes the SV technically "invalid" (as its
7593                        buffer is no longer followed by "\0") when fbm_compile()
7594                        adds the "\n" back, a "\0" is restored.  */
7595                     fbm_compile(sv, FBMcf_TAIL);
7596                 } else
7597                     fbm_compile(sv, 0);
7598             }
7599             if (prog->substrs->data[i].substr == prog->check_substr)
7600                 prog->check_utf8 = sv;
7601         }
7602     } while (i--);
7603 }
7604
7605 STATIC bool
7606 S_to_byte_substr(pTHX_ register regexp *prog)
7607 {
7608     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7609      * on the converted value; returns FALSE if can't be converted. */
7610
7611     dVAR;
7612     int i = 1;
7613
7614     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7615
7616     do {
7617         if (prog->substrs->data[i].utf8_substr
7618             && !prog->substrs->data[i].substr) {
7619             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7620             if (! sv_utf8_downgrade(sv, TRUE)) {
7621                 return FALSE;
7622             }
7623             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7624                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7625                     /* Trim the trailing \n that fbm_compile added last
7626                         time.  */
7627                     SvCUR_set(sv, SvCUR(sv) - 1);
7628                     fbm_compile(sv, FBMcf_TAIL);
7629                 } else
7630                     fbm_compile(sv, 0);
7631             }
7632             prog->substrs->data[i].substr = sv;
7633             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7634                 prog->check_substr = sv;
7635         }
7636     } while (i--);
7637
7638     return TRUE;
7639 }
7640
7641 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7642  * pre-composed Hangul syllables, which are all in a contiguous block and
7643  * arranged there in such a way so as to facilitate alorithmic determination of
7644  * their characteristics.  As such, they don't need a swash, but can be
7645  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7646  * is a GCB=LV */
7647 #define SBASE 0xAC00    /* Start of block */
7648 #define SCount 11172    /* Length of block */
7649 #define TCount 28
7650
7651 #if 0   /* This routine is not currently used */
7652 PERL_STATIC_INLINE bool
7653 S_is_utf8_X_LV(pTHX_ const U8 *p)
7654 {
7655     /* Unlike most other similarly named routines here, this does not create a
7656      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7657
7658     dVAR;
7659
7660     UV cp = valid_utf8_to_uvchr(p, NULL);
7661
7662     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7663
7664     /* The earliest Unicode releases did not have these precomposed Hangul
7665      * syllables.  Set to point to undef in that case, so will return false on
7666      * every call */
7667     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7668         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7669         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7670             SvREFCNT_dec(PL_utf8_X_LV);
7671             PL_utf8_X_LV = &PL_sv_undef;
7672         }
7673     }
7674
7675     return (PL_utf8_X_LV != &PL_sv_undef
7676             && cp >= SBASE && cp < SBASE + SCount
7677             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7678 }
7679 #endif
7680
7681 PERL_STATIC_INLINE bool
7682 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7683 {
7684     /* Unlike most other similarly named routines here, this does not create a
7685      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7686
7687     dVAR;
7688
7689     UV cp = valid_utf8_to_uvchr(p, NULL);
7690
7691     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7692
7693     /* The earliest Unicode releases did not have these precomposed Hangul
7694      * syllables.  Set to point to undef in that case, so will return false on
7695      * every call */
7696     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7697         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7698         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7699             SvREFCNT_dec(PL_utf8_X_LVT);
7700             PL_utf8_X_LVT = &PL_sv_undef;
7701         }
7702     }
7703
7704     return (PL_utf8_X_LVT != &PL_sv_undef
7705             && cp >= SBASE && cp < SBASE + SCount
7706             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7707 }
7708
7709 /*
7710  * Local variables:
7711  * c-indentation-style: bsd
7712  * c-basic-offset: 4
7713  * indent-tabs-mode: nil
7714  * End:
7715  *
7716  * ex: set ts=8 sts=4 sw=4 et:
7717  */