regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
     /* Print the message above through DEBUG_EXECUTE_r, then jump to the label
      * whose name is passed as 'target'.  The caller must therefore have such a
      * label in the same function. */
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define RF_tainted      1       /* tainted information used? e.g. locale */
  97 #define RF_warned       2               /* warned about big count? */
  98
  99 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 100
     /* True when the RF_utf8 bit is set in PL_reg_flags. */
 101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 102
     /* Short local name for the long-named fold-closure macro it expands to. */
 103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 104
     /* Default STATIC to plain 'static' when nothing has defined it yet. */
 105 #ifndef STATIC
 106 #define STATIC  static
 107 #endif
 108
 109 /* Valid for non-utf8 strings: avoids the reginclass
 110  * call if there are no complications: i.e., if everything matchable is
 111  * straight forward in the bitmap.
      * When ANYOF_FLAGS(p) is non-zero the full reginclass() is called with a
      * final argument of 0; otherwise the byte *(c) is tested directly with
      * ANYOF_BITMAP_TEST. */
 112 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 113                                               : ANYOF_BITMAP_TEST(p,*(c)))
 114
 115 /*
 116  * Forwards.
 117  */
 118
 119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 121
 122 #define HOPc(pos,off) \
 123         (char *)(PL_reg_match_utf8 \
 124             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 125             : (U8*)(pos + off))
 126 #define HOPBACKc(pos, off) \
 127         (char*)(PL_reg_match_utf8\
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 129             : (pos - off >= PL_bostr)           \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136
 137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
     /* Any negative nextchr counts as end-of-string, not just NEXTCHR_EOS. */
 138 #define NEXTCHR_IS_EOS (nextchr < 0)
 139
     /* Reload the caller's 'nextchr' from the byte at 'locinput', or set it to
      * NEXTCHR_EOS once locinput is no longer below PL_regeol. */
 140 #define SET_nextchr \
 141     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 142
     /* Assign p to the caller's 'locinput', then refresh 'nextchr'.
      * NOTE(review): this expands to two statements with no STMT_START/STMT_END
      * wrapper, so used as the sole body of an unbraced if/else only the first
      * assignment would be conditional. */
 143 #define SET_locinput(p) \
 144     locinput = (p);  \
 145     SET_nextchr
 146
 147
 148 /* These are unrolled below in the CCC_TRY_XXX defines.
      *
      * LOAD_UTF8_CHARCLASS(class,str): if PL_utf8_<class> is not yet set, call
      * is_utf8_<class>() on str between ENTER/save_re_context() and LEAVE, and
      * assert both that the call returned true and that PL_utf8_<class> is now
      * set. */
 149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 150     if (!CAT2(PL_utf8_,class)) { \
 151         bool ok; \
 152         ENTER; save_re_context(); \
 153         ok=CAT2(is_utf8_,class)((const U8*)str); \
 154         PERL_UNUSED_VAR(ok); \
 155         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 156 /* Same idea, but calls is_utf8_<class>() on " " and asserts nothing about
      * the outcome. */
 157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 158     if (!CAT2(PL_utf8_,class)) { \
 159         bool throw_away; \
 160         PERL_UNUSED_VAR(throw_away); \
 161         ENTER; save_re_context(); \
 162         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 163         PERL_UNUSED_VAR(throw_away); \
 164         LEAVE; } } STMT_END
 165
     /* Convenience wrappers that supply a sample string for the class. */
 166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 167 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 168
     /* NOTE(review): both loads below use the asserting LOAD_UTF8_CHARCLASS,
      * so the "No asserts" remark inside the macro looks stale -- confirm.
      * The macro is a bare statement sequence (no STMT_START/STMT_END) and its
      * last line ends in a continuation backslash. */
 169 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 170         /* No asserts are done for some of these, in case called on a   */  \
 171         /* Unicode version in which they map to nothing */                  \
 172         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                  \
 173         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 174
 175 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 176
 177 /* The actual code for CCC_TRY, which uses several variables from the routine
 178  * it's callable from.  It is designed to be the bulk of a case statement.
 179  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 180  *      nextchr matches the class.
 181  * UTF8_TEST is the whole test string to use for utf8 targets
 182  * CLASS and STR are passed to LOAD_UTF8_CHARCLASS, which loads the swash for
 183  *      the class if it is not already present
 184  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 185  *      UTF8_TEST test.
 186  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 187  * utf8 and a variant, load the swash if necessary and test using the utf8
 188  * test.  Advance to the next character if test is ok, otherwise fail; If not
 189  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 190  * fails, or advance to the next character.
      * The expansion uses sayNO and ends with 'goto increment_locinput', and it
      * reads the variables nextchr and utf8_target, so all of those must exist
      * where it is expanded. */
 191
 192 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 193     if (NEXTCHR_IS_EOS) {                                                     \
 194         sayNO;                                                                \
 195     }                                                                         \
 196     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 197         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 198         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 199             sayNO;                                                            \
 200         }                                                                     \
 201     }                                                                         \
 202     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 203             sayNO;                                                            \
 204     }                                                                         \
 205     goto increment_locinput;
 206
 207 /* Handle the non-locale cases for a character class and its complement.  It
 208  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 209  * This is because that code fails when the test succeeds, so we want to have
 210  * the test fail so that the code succeeds.  The swash is stored in a
 211  * predictable PL_ place */
 212 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 213                            CLASS, STR)                                        \
 214     case NAME:                                                                \
 215         _CCC_TRY_CODE( !, FUNC,                                               \
 216                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 217                                             (U8*)locinput, TRUE)),            \
 218                           CLASS, STR)                                         \
 219     case NNAME:                                                               \
 220         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 221                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 222                                             (U8*)locinput, TRUE)),            \
 223                           CLASS, STR)
 224 /* Generate the case statements for both locale and non-locale character
 225  * classes in regmatch for classes that don't have special unicode semantics.
 226  * Locales don't use an immediate swash, but an intermediary special locale
 227  * function that is called on the pointer to the current place in the input
 228  * string.  That function will resolve to needing the same swash.  One might
 229  * think that because we don't know what the locale will match, we shouldn't
 230  * check with the swash loading function that it loaded properly; ie, that we
 231  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 232  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 233  * irrelevant here.
      * The locale cases (NAMEL, NNAMEL) set RF_tainted in PL_reg_flags before
      * testing.  The NAMEA case advances locinput by one byte and breaks out of
      * the switch; the NNAMEA case leaves via 'goto increment_locinput'. */
 234 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 235                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 236                 NAMEA, NNAMEA, FUNCA,                                         \
 237                 CLASS, STR)                                                   \
 238     case NAMEL:                                                               \
 239         PL_reg_flags |= RF_tainted;                                           \
 240         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 241     case NNAMEL:                                                              \
 242         PL_reg_flags |= RF_tainted;                                           \
 243         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 244                        CLASS, STR)                                            \
 245     case NAMEA:                                                               \
 246         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 247             sayNO;                                                            \
 248         }                                                                     \
 249         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 250         locinput++;                                        \
 251         break;                                                                \
 252     case NNAMEA:                                                              \
 253         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 254             sayNO;                                                            \
 255         }                                                                     \
 256         goto increment_locinput;                                              \
 257     /* Generate the non-locale cases */                                       \
 258     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 259
 260 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 261  * statements to handle separate Unicode semantics nodes */
 262 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 263                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 264                   NAMEU, NNAMEU, FUNCU,                                        \
 265                   NAMEA, NNAMEA, FUNCA,                                        \
 266                   CLASS, STR)                                                  \
 267     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 268             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 269             NAMEA, NNAMEA, FUNCA,                                              \
 270             CLASS, STR)                                                        \
 271     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 272
 273 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 274
 275 /* for use after a quantifier and before an EXACT-like node -- japhy */
 276 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 277  *
 278  * NOTE that *nothing* that affects backtracking should be in here, specifically
 279  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 280  * node that is in between two EXACT like nodes when ascertaining what the required
 281  * "follow" character is. This should probably be moved to regex compile time
 282  * although it may be done at run time because of the REF possibility - more
 283  * investigation required. -- demerphq
 284 */
 285 #define JUMPABLE(rn) (      \
 286     OP(rn) == OPEN ||       \
 287     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 288     OP(rn) == EVAL ||   \
 289     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 290     OP(rn) == PLUS || OP(rn) == MINMOD || \
 291     OP(rn) == KEEPS || \
 292     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 293 )
     /* JUMPABLE above reads 'cur_eval' from the scope where it is expanded: a
      * CLOSE node counts as jumpable only when there is no cur_eval or its
      * close_paren differs from ARG(rn). */
     /* True if the node's kind (per PL_regkind) is EXACT. */
 294 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 295
     /* True for nodes whose kind is EXACT or REF. */
 296 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 297
 298 #if 0
 299 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 300    we don't need this definition.  Note that this disabled branch does not
      define IS_TEXTFU, which the active branch below does. */
 301 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 302 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 304
 305 #else
 306 /* ... so we use this, as it's faster. */
 307 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 308 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 309 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 310 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 311
 312 #endif
 313
 314 /*
 315   Search for mandatory following text node; for lookahead, the text must
 316   follow but for lookbehind (rn->flags != 0) we skip to the next step.

       rn is advanced in place while JUMPABLE(rn) holds, so it must be a
       modifiable lvalue (and JUMPABLE's 'cur_eval' must be in scope):
         - SUSPEND and CURLY-kind nodes: two NEXTOPER() steps
         - PLUS: one NEXTOPER() step
         - IFMATCH: two NEXTOPER() steps when rn->flags == 0, otherwise
           advance by ARG(rn)
         - anything else: advance by NEXT_OFF(rn)
 317 */
 318 #define FIND_NEXT_IMPT(rn) STMT_START { \
 319     while (JUMPABLE(rn)) { \
 320         const OPCODE type = OP(rn); \
 321         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 322             rn = NEXTOPER(NEXTOPER(rn)); \
 323         else if (type == PLUS) \
 324             rn = NEXTOPER(rn); \
 325         else if (type == IFMATCH) \
 326             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 327         else rn += NEXT_OFF(rn); \
 328     } \
 329 } STMT_END
 330
 331 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 332  * These are for the pre-composed Hangul syllables, which are all in a
 333  * contiguous block and arranged there in such a way so as to facilitate
 334  * algorithmic determination of their characteristics.  As such, they don't need
 335  * a swash, but can be determined by simple arithmetic.  Almost all are
 336  * GCB=LVT, but every 28th one is a GCB=LV */
 337 #define SBASE 0xAC00    /* Start of block */
 338 #define SCount 11172    /* Length of block */
 339 #define TCount 28
 340
 341 static void restore_pos(pTHX_ void *arg);
 342
 343 #define REGCP_PAREN_ELEMS 3 /* per group in regcppush(): end, start, start_tmp */
 344 #define REGCP_OTHER_ELEMS 3 /* maxopenparen, lastparen, lastcloseparen */
 345 #define REGCP_FRAME_ELEMS 1 /* presumably the SAVEt_REGCONTEXT word pushed last */
 346 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 347  * are needed for the regexp context stack bookkeeping. */
 348
 349 STATIC CHECKPOINT
 350 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 351 {
 352     dVAR;
 353     const int retval = PL_savestack_ix;
 354     const int paren_elems_to_push =
 355                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 356     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 357     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 358     I32 p;
 359     GET_RE_DEBUG_FLAGS_DECL;
 360
 361     PERL_ARGS_ASSERT_REGCPPUSH;
 362
 363     if (paren_elems_to_push < 0)
 364         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 365                    paren_elems_to_push);
 366
 367     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 368         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 369                    " out of range (%lu-%ld)",
 370                    total_elems,
 371                    (unsigned long)maxopenparen,
 372                    (long)parenfloor);
 373
 374     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 375
 376     DEBUG_BUFFERS_r(
 377         if ((int)maxopenparen > (int)parenfloor)
 378             PerlIO_printf(Perl_debug_log,
 379                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 380                 PTR2UV(rex),
 381                 PTR2UV(rex->offs)
 382             );
 383     );
 384     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 385 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 386         SSPUSHINT(rex->offs[p].end);
 387         SSPUSHINT(rex->offs[p].start);
 388         SSPUSHINT(rex->offs[p].start_tmp);
 389         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 390             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 391             (UV)p,
 392             (IV)rex->offs[p].start,
 393             (IV)rex->offs[p].start_tmp,
 394             (IV)rex->offs[p].end
 395         ));
 396     }
 397 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 398     SSPUSHINT(maxopenparen);
 399     SSPUSHINT(rex->lastparen);
 400     SSPUSHINT(rex->lastcloseparen);
 401     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 402
 403     return retval;
 404 }
 405
 406 /* These are needed since we do not localize EVAL nodes:
      *
      * REGCP_SET(cp) optionally logs (under DEBUG_STATE_r) and then stores the
      * current PL_savestack_ix in cp.  The expansion has no trailing semicolon
      * and no STMT wrapper, so the caller supplies the ';'. */
 407 #define REGCP_SET(cp)                                           \
 408     DEBUG_STATE_r(                                              \
 409             PerlIO_printf(Perl_debug_log,                       \
 410                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 411                 (IV)PL_savestack_ix));                          \
 412     cp = PL_savestack_ix
 413
     /* REGCP_UNWIND(cp) optionally logs when cp differs from the current
      * PL_savestack_ix, then calls regcpblow(cp). */
 414 #define REGCP_UNWIND(cp)                                        \
 415     DEBUG_STATE_r(                                              \
 416         if (cp != PL_savestack_ix)                              \
 417             PerlIO_printf(Perl_debug_log,                       \
 418                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 419                 (IV)(cp), (IV)PL_savestack_ix));                \
 420     regcpblow(cp)
 421
     /* UNWIND_PAREN(lp, lcp) sets rex->offs[n].end to -1 for every n from
      * rex->lastparen down to lp+1, leaves rex->lastparen at the value n has
      * when the loop stops, and sets rex->lastcloseparen to lcp.  It uses the
      * caller's 'n' and 'rex'.
      * NOTE(review): this is a multi-statement macro with its own trailing
      * semicolon, no STMT_START/STMT_END wrapper and unparenthesized
      * arguments -- only safe as a full statement inside braces. */
 422 #define UNWIND_PAREN(lp, lcp)               \
 423     for (n = rex->lastparen; n > lp; n--)   \
 424         rex->offs[n].end = -1;              \
 425     rex->lastparen = n;                     \
 426     rex->lastcloseparen = lcp;
 427
 428
 429 STATIC void
 430 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 431 {
 432     dVAR;
 433     UV i;
 434     U32 paren;
 435     GET_RE_DEBUG_FLAGS_DECL;
 436
 437     PERL_ARGS_ASSERT_REGCPPOP;
 438
 439     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 440     i = SSPOPUV;
 441     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 442     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 443     rex->lastcloseparen = SSPOPINT;
 444     rex->lastparen = SSPOPINT;
 445     *maxopenparen_p = SSPOPINT;
 446
 447     i -= REGCP_OTHER_ELEMS;
 448     /* Now restore the parentheses context. */
 449     DEBUG_BUFFERS_r(
 450         if (i || rex->lastparen + 1 <= rex->nparens)
 451             PerlIO_printf(Perl_debug_log,
 452                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 453                 PTR2UV(rex),
 454                 PTR2UV(rex->offs)
 455             );
 456     );
 457     paren = *maxopenparen_p;
 458     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 459         I32 tmps;
 460         rex->offs[paren].start_tmp = SSPOPINT;
 461         rex->offs[paren].start = SSPOPINT;
 462         tmps = SSPOPINT;
 463         if (paren <= rex->lastparen)
 464             rex->offs[paren].end = tmps;
 465         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 466             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 467             (UV)paren,
 468             (IV)rex->offs[paren].start,
 469             (IV)rex->offs[paren].start_tmp,
 470             (IV)rex->offs[paren].end,
 471             (paren > rex->lastparen ? "(skipped)" : ""));
 472         );
 473         paren--;
 474     }
 475 #if 1
 476     /* It would seem that the similar code in regtry()
 477      * already takes care of this, and in fact it is in
 478      * a better location to since this code can #if 0-ed out
 479      * but the code in regtry() is needed or otherwise tests
 480      * requiring null fields (pat.t#187 and split.t#{13,14}
 481      * (as of patchlevel 7877)  will fail.  Then again,
 482      * this code seems to be necessary or otherwise
 483      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 484      * --jhi updated by dapm */
 485     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 486         if (i > *maxopenparen_p)
 487             rex->offs[i].start = -1;
 488         rex->offs[i].end = -1;
 489         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 490             "    \\%"UVuf": %s   ..-1 undeffing\n",
 491             (UV)i,
 492             (i > *maxopenparen_p) ? "-1" : "  "
 493         ));
 494     }
 495 #endif
 496 }
 497
 498 /* restore the parens and associated vars at savestack position ix,
 499  * but without popping the stack */
 500
 501 STATIC void
 502 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 503 {
 504     I32 tmpix = PL_savestack_ix;
 505     PL_savestack_ix = ix;
 506     regcppop(rex, maxopenparen_p);
 507     PL_savestack_ix = tmpix;
 508 }
 509
 510 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Plain alias for LEAVE_SCOPE(cp), used by REGCP_UNWIND.  Ignores regcppush()ed data. */
 511
 512 STATIC bool
 513 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 514 {
 515     /* Returns a boolean as to whether or not 'character' is a member of the
 516      * Posix character class given by 'classnum' that should be equivalent to a
 517      * value in the typedef '_char_class_number'.
 518      *
 519      * Ideally this could be replaced by a just an array of function pointers
 520      * to the C library functions that implement the macros this calls.
 521      * However, to compile, the precise function signatures are required, and
 522      * these may vary from platform to to platform.  To avoid having to figure
 523      * out what those all are on each platform, I (khw) am using this method,
 524      * which adds an extra layer of function call overhead.  But we don't
 525      * particularly care about performance with locales anyway. */
 526
 527     switch ((_char_class_number) classnum) {
 528         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 529         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 530         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 531         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 532         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 533         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 534         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 535         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 536         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 537         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 538         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 539         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 540         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 541         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 542         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 543         default:    /* VERTSPACE should never occur in locales */
 544             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 545     }
 546
 547     assert(0); /* NOTREACHED */
 548     return FALSE;
 549 }
 550
 551 /*
 552  * pregexec and friends
 553  */
 554
 555 #ifndef PERL_IN_XSUB_RE
 556 /*
 557  - pregexec - match a regexp against a string
 558  */
 559 I32
 560 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 561          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 562 /* stringarg: the point in the string at which to begin matching */
 563 /* strend:    pointer to null at end of string */
 564 /* strbeg:    real beginning of string */
 565 /* minend:    end of match must be >= minend bytes after stringarg. */
 566 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 567  *            itself is accessed via the pointers above */
 568 /* nosave:    For optimizations. */
 569 {
 570     PERL_ARGS_ASSERT_PREGEXEC;
 571
 572     return
 573         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 574                       nosave ? 0 : REXEC_COPY_STR);
 575 }
 576 #endif
 577
 578 /*
 579  * Need to implement the following flags for reg_anch:
 580  *
 581  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 582  * USE_INTUIT_ML
 583  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 584  * INTUIT_AUTORITATIVE_ML
 585  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 586  * INTUIT_ONCE_ML
 587  *
 588  * Another flag for this function: SECOND_TIME (so that float substrs
 589  * with giant delta may be not rechecked).
 590  */
 591
 592 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 593
 594 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 595    Otherwise, only SvCUR(sv) is used to get strbeg. */
 596
 597 /* XXXX We assume that strpos is strbeg unless sv. */
 598
 599 /* XXXX Some places assume that there is a fixed substring.
 600         An update may be needed if optimizer marks as "INTUITable"
 601         RExen without fixed substrings.  Similarly, it is assumed that
 602         lengths of all the strings are no more than minlen, thus they
 603         cannot come from lookahead.
 604         (Or minlen should take into account lookahead.)
 605   NOTE: Some of this comment is not correct. minlen does now take account
 606   of lookahead/behind. Further research is required. -- demerphq
 607
 608 */
 609
 610 /* A failure to find a constant substring means that there is no need to make
 611    an expensive call to the REx engine, thus we celebrate a failure.  Similarly,
 612    finding a substring too deep into the string means that fewer calls to
 613    regtry() should be needed.
 614
 615    REx compiler's optimizer found 4 possible hints:
 616         a) Anchored substring;
 617         b) Fixed substring;
 618         c) Whether we are anchored (beginning-of-line or \G);
 619         d) First node (of those at offset 0) which may distinguish positions;
 620    We use a)b)d) and multiline-part of c), and try to find a position in the
 621    string which does not contradict any of them.
 622  */
 623
 624 /* Most of the decisions we make here should have been made at compile time.
 625    The nodes of the REx which we used for the search should have been
 626    deleted from the finite automaton. */
 627
 628 char *
 629 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 630                      char *strend, const U32 flags, re_scream_pos_data *data)
 631 {
 632     dVAR;
 633     struct regexp *const prog = ReANY(rx);
 634     I32 start_shift = 0;
 635     /* Should be nonnegative! */
 636     I32 end_shift   = 0;
 637     char *s;
 638     SV *check;
 639     char *strbeg;
 640     char *t;
 641     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 642     I32 ml_anch;
 643     char *other_last = NULL;    /* other substr checked before this */
 644     char *check_at = NULL;              /* check substr found at this pos */
 645     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 646     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 647     RXi_GET_DECL(prog,progi);
 648 #ifdef DEBUGGING
 649     const char * const i_strpos = strpos;
 650 #endif
 651     GET_RE_DEBUG_FLAGS_DECL;
 652
 653     PERL_ARGS_ASSERT_RE_INTUIT_START;
 654     PERL_UNUSED_ARG(flags);
 655     PERL_UNUSED_ARG(data);
 656
 657     RX_MATCH_UTF8_set(rx,utf8_target);
 658
 659     if (RX_UTF8(rx)) {
 660         PL_reg_flags |= RF_utf8;
 661     }
 662     DEBUG_EXECUTE_r(
 663         debug_start_match(rx, utf8_target, strpos, strend,
 664             sv ? "Guessing start of match in sv for"
 665                : "Guessing start of match in string for");
 666               );
 667
 668     /* CHR_DIST() would be more correct here but it makes things slow. */
 669     if (prog->minlen > strend - strpos) {
 670         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 671                               "String too short... [re_intuit_start]\n"));
 672         goto fail;
 673     }
 674
 675     /* XXX we need to pass strbeg as a separate arg: the following is
 676      * guesswork and can be wrong... */
 677     if (sv && SvPOK(sv)) {
 678         char * p   = SvPVX(sv);
 679         STRLEN cur = SvCUR(sv);
 680         if (p <= strpos && strpos < p + cur) {
 681             strbeg = p;
 682             assert(p <= strend && strend <= p + cur);
 683         }
 684         else
 685             strbeg = strend - cur;
 686     }
 687     else
 688         strbeg = strpos;
 689
 690     PL_regeol = strend;
 691     if (utf8_target) {
 692         if (!prog->check_utf8 && prog->check_substr)
 693             to_utf8_substr(prog);
 694         check = prog->check_utf8;
 695     } else {
 696         if (!prog->check_substr && prog->check_utf8) {
 697             if (! to_byte_substr(prog)) {
 698                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 699             }
 700         }
 701         check = prog->check_substr;
 702     }
 703     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 704         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 705                      || ( (prog->extflags & RXf_ANCH_BOL)
 706                           && !multiline ) );    /* Check after \n? */
 707
 708         if (!ml_anch) {
 709           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 710                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 711                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 712                && sv && !SvROK(sv)
 713                && (strpos != strbeg)) {
 714               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 715               goto fail;
 716           }
 717           if (prog->check_offset_min == prog->check_offset_max
 718               && !(prog->extflags & RXf_CANY_SEEN)
 719               && ! multiline)   /* /m can cause \n's to match that aren't
 720                                    accounted for in the string max length.
 721                                    See [perl #115242] */
 722           {
 723             /* Substring at constant offset from beg-of-str... */
 724             I32 slen;
 725
 726             s = HOP3c(strpos, prog->check_offset_min, strend);
 727
 728             if (SvTAIL(check)) {
 729                 slen = SvCUR(check);    /* >= 1 */
 730
 731                 if ( strend - s > slen || strend - s < slen - 1
 732                      || (strend - s == slen && strend[-1] != '\n')) {
 733                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 734                     goto fail_finish;
 735                 }
 736                 /* Now should match s[0..slen-2] */
 737                 slen--;
 738                 if (slen && (*SvPVX_const(check) != *s
 739                              || (slen > 1
 740                                  && memNE(SvPVX_const(check), s, slen)))) {
 741                   report_neq:
 742                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 743                     goto fail_finish;
 744                 }
 745             }
 746             else if (*SvPVX_const(check) != *s
 747                      || ((slen = SvCUR(check)) > 1
 748                          && memNE(SvPVX_const(check), s, slen)))
 749                 goto report_neq;
 750             check_at = s;
 751             goto success_at_start;
 752           }
 753         }
 754         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 755         s = strpos;
 756         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 757         end_shift = prog->check_end_shift;
 758
 759         if (!ml_anch) {
 760             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 761                                          - (SvTAIL(check) != 0);
 762             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 763
 764             if (end_shift < eshift)
 765                 end_shift = eshift;
 766         }
 767     }
 768     else {                              /* Can match at random position */
 769         ml_anch = 0;
 770         s = strpos;
 771         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 772         end_shift = prog->check_end_shift;
 773
 774         /* end shift should be non negative here */
 775     }
 776
 777 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 778     if (end_shift < 0)
 779         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 780                    (IV)end_shift, RX_PRECOMP(prog));
 781 #endif
 782
 783   restart:
 784     /* Find a possible match in the region s..strend by looking for
 785        the "check" substring in the region corrected by start/end_shift. */
 786
 787     {
 788         I32 srch_start_shift = start_shift;
 789         I32 srch_end_shift = end_shift;
 790         U8* start_point;
 791         U8* end_point;
 792         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 793             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 794             srch_start_shift = strbeg - s;
 795         }
 796     DEBUG_OPTIMISE_MORE_r({
 797         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 798             (IV)prog->check_offset_min,
 799             (IV)srch_start_shift,
 800             (IV)srch_end_shift,
 801             (IV)prog->check_end_shift);
 802     });
 803
 804         if (prog->extflags & RXf_CANY_SEEN) {
 805             start_point= (U8*)(s + srch_start_shift);
 806             end_point= (U8*)(strend - srch_end_shift);
 807         } else {
 808             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 809             end_point= HOP3(strend, -srch_end_shift, strbeg);
 810         }
 811         DEBUG_OPTIMISE_MORE_r({
 812             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 813                 (int)(end_point - start_point),
 814                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 815                 start_point);
 816         });
 817
 818         s = fbm_instr( start_point, end_point,
 819                       check, multiline ? FBMrf_MULTILINE : 0);
 820     }
 821     /* Update the count-of-usability, remove useless subpatterns,
 822         unshift s.  */
 823
 824     DEBUG_EXECUTE_r({
 825         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 826             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 827         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 828                           (s ? "Found" : "Did not find"),
 829             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 830                 ? "anchored" : "floating"),
 831             quoted,
 832             RE_SV_TAIL(check),
 833             (s ? " at offset " : "...\n") );
 834     });
 835
 836     if (!s)
 837         goto fail_finish;
 838     /* Finish the diagnostic message */
 839     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 840
 841     /* XXX dmq: first branch is for positive lookbehind...
 842        Our check string is offset from the beginning of the pattern.
 843        So we need to do any stclass tests offset forward from that
 844        point. I think. :-(
 845      */
 846
 847
 848
 849     check_at=s;
 850
 851
 852     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 853        Start with the other substr.
 854        XXXX no SCREAM optimization yet - and a very coarse implementation
 855        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 856                 *always* match.  Probably should be marked during compile...
 857        Probably it is right to do no SCREAM here...
 858      */
 859
 860     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 861                 : (prog->float_substr && prog->anchored_substr))
 862     {
 863         /* Take into account the "other" substring. */
 864         /* XXXX May be hopelessly wrong for UTF... */
 865         if (!other_last)
 866             other_last = strpos;
 867         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 868           do_other_anchored:
 869             {
 870                 char * const last = HOP3c(s, -start_shift, strbeg);
 871                 char *last1, *last2;
 872                 char * const saved_s = s;
 873                 SV* must;
 874
 875                 t = s - prog->check_offset_max;
 876                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 877                     && (!utf8_target
 878                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 879                             && t > strpos)))
 880                     NOOP;
 881                 else
 882                     t = strpos;
 883                 t = HOP3c(t, prog->anchored_offset, strend);
 884                 if (t < other_last)     /* These positions already checked */
 885                     t = other_last;
 886                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 887                 if (last < last1)
 888                     last1 = last;
 889                 /* XXXX It is not documented what units *_offsets are in.
 890                    We assume bytes, but this is clearly wrong.
 891                    Meaning this code needs to be carefully reviewed for errors.
 892                    dmq.
 893                   */
 894
 895                 /* On end-of-str: see comment below. */
 896                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 897                 if (must == &PL_sv_undef) {
 898                     s = (char*)NULL;
 899                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 900                 }
 901                 else
 902                     s = fbm_instr(
 903                         (unsigned char*)t,
 904                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 905                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 906                         must,
 907                         multiline ? FBMrf_MULTILINE : 0
 908                     );
 909                 DEBUG_EXECUTE_r({
 910                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 911                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 912                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 913                         (s ? "Found" : "Contradicts"),
 914                         quoted, RE_SV_TAIL(must));
 915                 });
 916
 917
 918                 if (!s) {
 919                     if (last1 >= last2) {
 920                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 921                                                 ", giving up...\n"));
 922                         goto fail_finish;
 923                     }
 924                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 925                         ", trying floating at offset %ld...\n",
 926                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 927                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 928                     s = HOP3c(last, 1, strend);
 929                     goto restart;
 930                 }
 931                 else {
 932                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 933                           (long)(s - i_strpos)));
 934                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 935                     other_last = HOP3c(s, 1, strend);
 936                     s = saved_s;
 937                     if (t == strpos)
 938                         goto try_at_start;
 939                     goto try_at_offset;
 940                 }
 941             }
 942         }
 943         else {          /* Take into account the floating substring. */
 944             char *last, *last1;
 945             char * const saved_s = s;
 946             SV* must;
 947
 948             t = HOP3c(s, -start_shift, strbeg);
 949             last1 = last =
 950                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 951             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 952                 last = HOP3c(t, prog->float_max_offset, strend);
 953             s = HOP3c(t, prog->float_min_offset, strend);
 954             if (s < other_last)
 955                 s = other_last;
 956  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 957             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 958             /* fbm_instr() takes into account exact value of end-of-str
 959                if the check is SvTAIL(ed).  Since false positives are OK,
 960                and end-of-str is not later than strend we are OK. */
 961             if (must == &PL_sv_undef) {
 962                 s = (char*)NULL;
 963                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 964             }
 965             else
 966                 s = fbm_instr((unsigned char*)s,
 967                               (unsigned char*)last + SvCUR(must)
 968                                   - (SvTAIL(must)!=0),
 969                               must, multiline ? FBMrf_MULTILINE : 0);
 970             DEBUG_EXECUTE_r({
 971                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 972                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 973                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 974                     (s ? "Found" : "Contradicts"),
 975                     quoted, RE_SV_TAIL(must));
 976             });
 977             if (!s) {
 978                 if (last1 == last) {
 979                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 980                                             ", giving up...\n"));
 981                     goto fail_finish;
 982                 }
 983                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 984                     ", trying anchored starting at offset %ld...\n",
 985                     (long)(saved_s + 1 - i_strpos)));
 986                 other_last = last;
 987                 s = HOP3c(t, 1, strend);
 988                 goto restart;
 989             }
 990             else {
 991                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 992                       (long)(s - i_strpos)));
 993                 other_last = s; /* Fix this later. --Hugo */
 994                 s = saved_s;
 995                 if (t == strpos)
 996                     goto try_at_start;
 997                 goto try_at_offset;
 998             }
 999         }
1000     }
1001
1002
1003     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1004
1005     DEBUG_OPTIMISE_MORE_r(
1006         PerlIO_printf(Perl_debug_log,
1007             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1008             (IV)prog->check_offset_min,
1009             (IV)prog->check_offset_max,
1010             (IV)(s-strpos),
1011             (IV)(t-strpos),
1012             (IV)(t-s),
1013             (IV)(strend-strpos)
1014         )
1015     );
1016
1017     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
1018         && (!utf8_target
1019             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1020                  && t > strpos)))
1021     {
1022         /* Fixed substring is found far enough so that the match
1023            cannot start at strpos. */
1024       try_at_offset:
1025         if (ml_anch && t[-1] != '\n') {
1026             /* Eventually fbm_*() should handle this, but often
1027                anchored_offset is not 0, so this check will not be wasted. */
1028             /* XXXX In the code below we prefer to look for "^" even in
1029                presence of anchored substrings.  And we search even
1030                beyond the found float position.  These pessimizations
1031                are historical artefacts only.  */
1032           find_anchor:
1033             while (t < strend - prog->minlen) {
1034                 if (*t == '\n') {
1035                     if (t < check_at - prog->check_offset_min) {
1036                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1037                             /* Since we moved from the found position,
1038                                we definitely contradict the found anchored
1039                                substr.  Due to the above check we do not
1040                                contradict "check" substr.
1041                                Thus we can arrive here only if check substr
1042                                is float.  Redo checking for "other"=="fixed".
1043                              */
1044                             strpos = t + 1;
1045                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1046                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1047                             goto do_other_anchored;
1048                         }
1049                         /* We don't contradict the found floating substring. */
1050                         /* XXXX Why not check for STCLASS? */
1051                         s = t + 1;
1052                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1053                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1054                         goto set_useful;
1055                     }
1056                     /* Position contradicts check-string */
1057                     /* XXXX probably better to look for check-string
1058                        than for "\n", so one should lower the limit for t? */
1059                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1060                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1061                     other_last = strpos = s = t + 1;
1062                     goto restart;
1063                 }
1064                 t++;
1065             }
1066             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1067                         PL_colors[0], PL_colors[1]));
1068             goto fail_finish;
1069         }
1070         else {
1071             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1072                         PL_colors[0], PL_colors[1]));
1073         }
1074         s = t;
1075       set_useful:
1076         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1077     }
1078     else {
1079         /* The found string does not prohibit matching at strpos,
1080            - no optimization of calling REx engine can be performed,
1081            unless it was an MBOL and we are not after MBOL,
1082            or a future STCLASS check will fail this. */
1083       try_at_start:
1084         /* Even in this situation we may use MBOL flag if strpos is offset
1085            wrt the start of the string. */
1086         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1087             && (strpos != strbeg) && strpos[-1] != '\n'
1088             /* May be due to an implicit anchor of m{.*foo}  */
1089             && !(prog->intflags & PREGf_IMPLICIT))
1090         {
1091             t = strpos;
1092             goto find_anchor;
1093         }
1094         DEBUG_EXECUTE_r( if (ml_anch)
1095             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1096                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1097         );
1098       success_at_start:
1099         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1100             && (utf8_target ? (
1101                 prog->check_utf8                /* Could be deleted already */
1102                 && --BmUSEFUL(prog->check_utf8) < 0
1103                 && (prog->check_utf8 == prog->float_utf8)
1104             ) : (
1105                 prog->check_substr              /* Could be deleted already */
1106                 && --BmUSEFUL(prog->check_substr) < 0
1107                 && (prog->check_substr == prog->float_substr)
1108             )))
1109         {
1110             /* If flags & SOMETHING - do not do it many times on the same match */
1111             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1112             /* XXX Does the destruction order has to change with utf8_target? */
1113             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1114             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1115             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1116             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1117             check = NULL;                       /* abort */
1118             s = strpos;
1119             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1120                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1121             if (prog->intflags & PREGf_IMPLICIT)
1122                 prog->extflags &= ~RXf_ANCH_MBOL;
1123             /* XXXX This is a remnant of the old implementation.  It
1124                     looks wasteful, since now INTUIT can use many
1125                     other heuristics. */
1126             prog->extflags &= ~RXf_USE_INTUIT;
1127             /* XXXX What other flags might need to be cleared in this branch? */
1128         }
1129         else
1130             s = strpos;
1131     }
1132
1133     /* Last resort... */
1134     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1135     /* trie stclasses are too expensive to use here, we are better off to
1136        leave it to regmatch itself */
1137     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1138         /* minlen == 0 is possible if regstclass is \b or \B,
1139            and the fixed substr is ''$.
1140            Since minlen is already taken into account, s+1 is before strend;
1141            accidentally, minlen >= 1 guaranties no false positives at s + 1
1142            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1143            regstclass does not come from lookahead...  */
1144         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1145            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1146         const U8* const str = (U8*)STRING(progi->regstclass);
1147         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1148                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1149                     : 1);
1150         char * endpos;
1151         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1152             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1153         else if (prog->float_substr || prog->float_utf8)
1154             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1155         else
1156             endpos= strend;
1157
1158         if (checked_upto < s)
1159            checked_upto = s;
1160         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1161                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1162
1163         t = s;
1164         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1165         if (s) {
1166             checked_upto = s;
1167         } else {
1168 #ifdef DEBUGGING
1169             const char *what = NULL;
1170 #endif
1171             if (endpos == strend) {
1172                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1173                                 "Could not match STCLASS...\n") );
1174                 goto fail;
1175             }
1176             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1177                                    "This position contradicts STCLASS...\n") );
1178             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1179                 goto fail;
1180             checked_upto = HOPBACKc(endpos, start_shift);
1181             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1182                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1183             /* Contradict one of substrings */
1184             if (prog->anchored_substr || prog->anchored_utf8) {
1185                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1186                     DEBUG_EXECUTE_r( what = "anchored" );
1187                   hop_and_restart:
1188                     s = HOP3c(t, 1, strend);
1189                     if (s + start_shift + end_shift > strend) {
1190                         /* XXXX Should be taken into account earlier? */
1191                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1192                                                "Could not match STCLASS...\n") );
1193                         goto fail;
1194                     }
1195                     if (!check)
1196                         goto giveup;
1197                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1198                                 "Looking for %s substr starting at offset %ld...\n",
1199                                  what, (long)(s + start_shift - i_strpos)) );
1200                     goto restart;
1201                 }
1202                 /* Have both, check_string is floating */
1203                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1204                     goto retry_floating_check;
1205                 /* Recheck anchored substring, but not floating... */
1206                 s = check_at;
1207                 if (!check)
1208                     goto giveup;
1209                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1210                           "Looking for anchored substr starting at offset %ld...\n",
1211                           (long)(other_last - i_strpos)) );
1212                 goto do_other_anchored;
1213             }
1214             /* Another way we could have checked stclass at the
1215                current position only: */
1216             if (ml_anch) {
1217                 s = t = t + 1;
1218                 if (!check)
1219                     goto giveup;
1220                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1221                           "Looking for /%s^%s/m starting at offset %ld...\n",
1222                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1223                 goto try_at_offset;
1224             }
1225             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1226                 goto fail;
1227             /* Check is floating substring. */
1228           retry_floating_check:
1229             t = check_at - start_shift;
1230             DEBUG_EXECUTE_r( what = "floating" );
1231             goto hop_and_restart;
1232         }
1233         if (t != s) {
1234             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1235                         "By STCLASS: moving %ld --> %ld\n",
1236                                   (long)(t - i_strpos), (long)(s - i_strpos))
1237                    );
1238         }
1239         else {
1240             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1241                                   "Does not contradict STCLASS...\n");
1242                    );
1243         }
1244     }
1245   giveup:
1246     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1247                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1248                           PL_colors[5], (long)(s - i_strpos)) );
1249     return s;
1250
1251   fail_finish:                          /* Substring not found */
1252     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1253         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1254   fail:
1255     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1256                           PL_colors[4], PL_colors[5]));
1257     return NULL;
1258 }
1259
/* DECL_TRIE_TYPE(scan): expands to the declaration of a const local named
 * trie_type, of an anonymous enum type, so it must be used where a
 * declaration is allowed.  The value is chosen from (scan)->flags == EXACT
 * and from a variable named utf8_target, which must be in scope at the
 * point of expansion:
 *     EXACT,     utf8_target  -> trie_utf8
 *     EXACT,     !utf8_target -> trie_plain
 *     otherwise, utf8_target  -> trie_utf8_fold
 *     otherwise, !utf8_target -> trie_latin_utf8_fold
 * The argument is parenthesised so that any pointer-valued expression
 * (e.g. p + 1 or *pp) is accepted, not only a plain identifier. */
1260 #define DECL_TRIE_TYPE(scan) \
1261     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1262                     trie_type = (((scan)->flags == EXACT) \
1263                               ? (utf8_target ? trie_utf8 : trie_plain) \
1264                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1265
1266 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1267 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1268     STRLEN skiplen;                                                                 \
1269     switch (trie_type) {                                                            \
1270     case trie_utf8_fold:                                                            \
1271         if ( foldlen>0 ) {                                                          \
1272             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1273             foldlen -= len;                                                         \
1274             uscan += len;                                                           \
1275             len=0;                                                                  \
1276         } else {                                                                    \
1277             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1278             len = UTF8SKIP(uc);                                                     \
1279             skiplen = UNISKIP( uvc );                                               \
1280             foldlen -= skiplen;                                                     \
1281             uscan = foldbuf + skiplen;                                              \
1282         }                                                                           \
1283         break;                                                                      \
1284     case trie_latin_utf8_fold:                                                      \
1285         if ( foldlen>0 ) {                                                          \
1286             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1287             foldlen -= len;                                                         \
1288             uscan += len;                                                           \
1289             len=0;                                                                  \
1290         } else {                                                                    \
1291             len = 1;                                                                \
1292             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1293             skiplen = UNISKIP( uvc );                                               \
1294             foldlen -= skiplen;                                                     \
1295             uscan = foldbuf + skiplen;                                              \
1296         }                                                                           \
1297         break;                                                                      \
1298     case trie_utf8:                                                                 \
1299         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1300         break;                                                                      \
1301     case trie_plain:                                                                \
1302         uvc = (UV)*uc;                                                              \
1303         len = 1;                                                                    \
1304     }                                                                               \
1305     if (uvc < 256) {                                                                \
1306         charid = trie->charmap[ uvc ];                                              \
1307     }                                                                               \
1308     else {                                                                          \
1309         charid = 0;                                                                 \
1310         if (widecharmap) {                                                          \
1311             SV** const svpp = hv_fetch(widecharmap,                                 \
1312                         (char*)&uvc, sizeof(UV), 0);                                \
1313             if (svpp)                                                               \
1314                 charid = (U16)SvIV(*svpp);                                          \
1315         }                                                                           \
1316     }                                                                               \
1317 } STMT_END
1318
/* REXEC_FBC_EXACTISH_SCAN(CoNd): byte-at-a-time search for a position where
 * an EXACTF-style node can start.  Expands in the caller's scope and relies
 * on its variables 's' (current position, incremented here), 'e' (last
 * position worth trying, inclusive), 'ln', 'pat_string', 'folder' and
 * 'reginfo', plus the label 'got_it'.
 *
 * At each position from s through e the macro jumps to got_it when all of
 * the following hold, evaluated in this order with short-circuiting:
 *   - CoNd is true (the callers in this file pass a test of the byte at
 *     's' against the pattern's first character and its fold);
 *   - ln is 1, or folder(s, pat_string, ln) returns true;
 *   - reginfo is NULL (dry run), or regtry(reginfo, &s) returns true.
 * If no position qualifies, the loop ends once s has moved past e. */
1319 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1320 STMT_START {                                              \
1321     while (s <= e) {                                      \
1322         if ( (CoNd)                                       \
1323              && (ln == 1 || folder(s, pat_string, ln))    \
1324              && (!reginfo || regtry(reginfo, &s)) )       \
1325             goto got_it;                                  \
1326         s++;                                              \
1327     }                                                     \
1328 } STMT_END
1329
/* REXEC_FBC_UTF8_SCAN(CoDe): loop skeleton for UTF-8 targets.  While the
 * caller's 's' is below 'strend', the text passed as CoDe is pasted in and
 * executed, then 's' is advanced by UTF8SKIP(s).  CoDe is inserted
 * verbatim with nothing added after it, so it has to be one or more
 * complete statements; it may leave the loop with a goto. */
1330 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1331 STMT_START {                                          \
1332     while (s < strend) {                              \
1333         CoDe                                          \
1334         s += UTF8SKIP(s);                             \
1335     }                                                 \
1336 } STMT_END
1337
/* REXEC_FBC_SCAN(CoDe): loop skeleton for non-UTF-8 targets.  While the
 * caller's 's' is below 'strend', the text passed as CoDe is pasted in and
 * executed, then 's' is incremented by one byte.  CoDe is inserted
 * verbatim with nothing added after it, so it has to be one or more
 * complete statements; it may leave the loop with a goto. */
1338 #define REXEC_FBC_SCAN(CoDe)                          \
1339 STMT_START {                                          \
1340     while (s < strend) {                              \
1341         CoDe                                          \
1342         s++;                                          \
1343     }                                                 \
1344 } STMT_END
1345
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): walk a UTF-8 target (stepping with
 * REXEC_FBC_UTF8_SCAN) looking for a position where CoNd, a test written
 * in terms of the caller's 's', is true.
 *
 * The caller's 'tmp' serves as a "worth trying here" flag:
 *   - CoNd true and tmp non-zero: a match is attempted (it counts as
 *     successful when reginfo is NULL, otherwise regtry() decides) and
 *     control jumps to got_it on success;
 *   - CoNd true but no jump happened (tmp was zero, or the attempt
 *     failed): tmp is set to 'doevery', so the next position is only
 *     attempted if doevery is non-zero;
 *   - CoNd false: tmp is reset to 1.
 * Note that the inner 'else' pairs with 'if (tmp && ...)', not with
 * 'if (CoNd)'. */
1346 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1347 REXEC_FBC_UTF8_SCAN(                                  \
1348     if (CoNd) {                                       \
1349         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1350             goto got_it;                              \
1351         else                                          \
1352             tmp = doevery;                            \
1353     }                                                 \
1354     else                                              \
1355         tmp = 1;                                      \
1356 )
1357
/* REXEC_FBC_CLASS_SCAN(CoNd): walk a non-UTF-8 target one byte at a time
 * (stepping with REXEC_FBC_SCAN) looking for a position where CoNd, a test
 * written in terms of the caller's 's', is true.
 *
 * The caller's 'tmp' serves as a "worth trying here" flag:
 *   - CoNd true and tmp non-zero: a match is attempted (it counts as
 *     successful when reginfo is NULL, otherwise regtry() decides) and
 *     control jumps to got_it on success;
 *   - CoNd true but no jump happened (tmp was zero, or the attempt
 *     failed): tmp is set to 'doevery', so the next position is only
 *     attempted if doevery is non-zero;
 *   - CoNd false: tmp is reset to 1.
 * Note that the inner 'else' pairs with 'if (tmp && ...)', not with
 * 'if (CoNd)'. */
1358 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1359 REXEC_FBC_SCAN(                                       \
1360     if (CoNd) {                                       \
1361         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1362             goto got_it;                              \
1363         else                                          \
1364             tmp = doevery;                            \
1365     }                                                 \
1366     else                                              \
1367         tmp = 1;                                      \
1368 )
1369
/* REXEC_FBC_TRYIT: attempt a match at the caller's 's' and jump to got_it
 * on success.  When reginfo is NULL regtry() is not called and the jump is
 * taken unconditionally.  The expansion is a bare 'if' with neither braces
 * nor a trailing semicolon: the user writes the ';', and an 'else' placed
 * directly after a use would attach to this 'if'. */
1370 #define REXEC_FBC_TRYIT               \
1371 if ((!reginfo || regtry(reginfo, &s))) \
1372     goto got_it
1373
/* REXEC_FBC_CSCAN(CoNdUtF8, CoNd): class scan that chooses its flavour
 * from the caller's 'utf8_target'.  When it is set, CoNdUtF8 is handed to
 * REXEC_FBC_UTF8_CLASS_SCAN; otherwise CoNd is handed to
 * REXEC_FBC_CLASS_SCAN.  The expansion is a plain if/else and is not
 * wrapped in STMT_START/STMT_END. */
1374 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1375     if (utf8_target) {                                             \
1376         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1377     }                                                          \
1378     else {                                                     \
1379         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1380     }
1381
/* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd, CoNdUtF8, CoNd): class scan that
 * chooses its flavour from the caller's 'utf8_target', with a set-up step
 * for the UTF-8 flavour.  When utf8_target is set, UtFpReLoAd is executed
 * once and then CoNdUtF8 is scanned with REXEC_FBC_UTF8_CLASS_SCAN;
 * otherwise only CoNd is scanned with REXEC_FBC_CLASS_SCAN and UtFpReLoAd
 * is never executed.  Callers in this file pass a
 * LOAD_UTF8_CHARCLASS_*() invocation as UtFpReLoAd.  The expansion is a
 * plain if/else and is not wrapped in STMT_START/STMT_END. */
1382 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1383     if (utf8_target) {                                             \
1384         UtFpReLoAd;                                            \
1385         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1386     }                                                          \
1387     else {                                                     \
1388         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1389     }
1390
/* REXEC_FBC_CSCAN_TAINT(CoNdUtF8, CoNd): sets the RF_tainted bit in
 * PL_reg_flags and then performs a class scan chosen by the caller's
 * 'utf8_target' (CoNdUtF8 via REXEC_FBC_UTF8_CLASS_SCAN, or CoNd via
 * REXEC_FBC_CLASS_SCAN).  In this file it is used for the locale node
 * types (ALNUML, NALNUML, SPACEL, NSPACEL, DIGITL, NDIGITL).
 * The expansion is two separate statements (the flag assignment, then the
 * if/else), so it must not be used as the sole body of an unbraced
 * if/else or loop. */
1391 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1392     PL_reg_flags |= RF_tainted;                                \
1393     if (utf8_target) {                                             \
1394         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1395     }                                                          \
1396     else {                                                     \
1397         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1398     }
1399
/* DUMP_EXEC_POS(li, s, doutf8): shorthand for dump_exec_pos() that supplies
 * PL_regeol, PL_bostr and PL_reg_starttry as the third, fourth and fifth
 * arguments; li, s and doutf8 are forwarded unchanged as the first, second
 * and last. */
1400 #define DUMP_EXEC_POS(li,s,doutf8) \
1401     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1402
1403
/* UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL): boundary scan over a
 * UTF-8 target that uses only a byte-level test (unlike UTF8_LOAD it does
 * not invoke LOAD_UTF8_CHARCLASS_ALNUM()).
 *
 * 'tmp' is seeded with TEST_NON_UTF8 applied to the byte just before 's',
 * or to '\n' when 's' is at PL_bostr.  Then, stepping with
 * REXEC_FBC_UTF8_SCAN, TEST_NON_UTF8 is applied to the byte at 's':
 *   - if the result differs from tmp, tmp is flipped and IF_SUCCESS runs;
 *   - otherwise IF_FAIL runs.
 * NOTE(review): 'tmp == !TEST(...)' only means "differs" if TEST_NON_UTF8
 * yields exactly 0 or 1 -- confirm for every test that is passed in.
 * The last line of the definition ends in a backslash, so the line that
 * follows it is absorbed into this definition. */
1404 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1405         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1406         tmp = TEST_NON_UTF8(tmp);                                              \
1407         REXEC_FBC_UTF8_SCAN(                                                   \
1408             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1409                 tmp = !tmp;                                                    \
1410                 IF_SUCCESS;                                                    \
1411             }                                                                  \
1412             else {                                                             \
1413                 IF_FAIL;                                                       \
1414             }                                                                  \
1415         );                                                                     \
1416
/* UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL): boundary scan
 * over a UTF-8 target.
 *
 * 'tmp' first receives the previous character: '\n' when 's' is at
 * PL_bostr, otherwise the value utf8n_to_uvchr() decodes at the position
 * that reghop3() returns for one step back from 's'.  It is then
 * overwritten with TeSt1_UtF8, a complete expression supplied by the
 * caller (the callers in this file write it in terms of 'tmp').
 * LOAD_UTF8_CHARCLASS_ALNUM() is invoked once before the loop, whichever
 * tests were passed in.
 *
 * Stepping with REXEC_FBC_UTF8_SCAN, TeSt2_UtF8 (written by the callers in
 * terms of 's') is evaluated at each position:
 *   - if the result differs from tmp, tmp is flipped and IF_SUCCESS runs;
 *   - otherwise IF_FAIL runs.
 * NOTE(review): 'tmp == !(...)' only means "differs" if both tests yield
 * exactly 0 or 1 -- confirm for every test that is passed in.
 * The last line of the definition ends in a backslash, so the line that
 * follows it is absorbed into this definition. */
1417 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1418         if (s == PL_bostr) {                                                   \
1419             tmp = '\n';                                                        \
1420         }                                                                      \
1421         else {                                                                 \
1422             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1423             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1424         }                                                                      \
1425         tmp = TeSt1_UtF8;                                                      \
1426         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1427         REXEC_FBC_UTF8_SCAN(                                                   \
1428             if (tmp == ! (TeSt2_UtF8)) { \
1429                 tmp = !tmp;                                                    \
1430                 IF_SUCCESS;                                                    \
1431             }                                                                  \
1432             else {                                                             \
1433                 IF_FAIL;                                                       \
1434             }                                                                  \
1435         );                                                                     \
1436
1437 /* The only difference between the BOUND and NBOUND cases is that
1438  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1439  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1440  * with the other one being empty */
/* FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): boundary search that
 * runs REXEC_FBC_TRYIT wherever the tested property changes between the
 * previous character and the current one.  TEST_NON_UTF8 is a macro name
 * applied to single bytes on the non-UTF-8 path; TEST1_UTF8 and TEST2_UTF8
 * are complete expressions forwarded to UTF8_LOAD for the UTF-8 path.
 * PLACEHOLDER fills the unused branch; it is defined outside this excerpt
 * (presumably an empty expansion -- confirm). */
1441 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1442     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1443
/* FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): boundary search
 * that runs REXEC_FBC_TRYIT wherever the tested property changes, using
 * UTF8_NOLOAD for the UTF-8 path so that TEST_NON_UTF8 (a macro name
 * applied to single bytes) is the only test on both paths.  TEST1_UTF8 and
 * TEST2_UTF8 are accepted for signature parity but are not used in the
 * expansion.  PLACEHOLDER is defined outside this excerpt (presumably an
 * empty expansion -- confirm). */
1444 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1445     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1446
/* FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): non-boundary search
 * that runs REXEC_FBC_TRYIT wherever the tested property does NOT change
 * between the previous character and the current one (REXEC_FBC_TRYIT is
 * passed in the IF_FAIL slot).  TEST_NON_UTF8 is a macro name applied to
 * single bytes on the non-UTF-8 path; TEST1_UTF8 and TEST2_UTF8 are
 * complete expressions forwarded to UTF8_LOAD for the UTF-8 path.
 * PLACEHOLDER is defined outside this excerpt (presumably an empty
 * expansion -- confirm). */
1447 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1448     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1449
/* FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): non-boundary
 * search that runs REXEC_FBC_TRYIT wherever the tested property does NOT
 * change (REXEC_FBC_TRYIT is passed in the IF_FAIL slot), using
 * UTF8_NOLOAD for the UTF-8 path so that TEST_NON_UTF8 (a macro name
 * applied to single bytes) is the only test on both paths.  TEST1_UTF8 and
 * TEST2_UTF8 are accepted for signature parity but are not used in the
 * expansion.  PLACEHOLDER is defined outside this excerpt (presumably an
 * empty expansion -- confirm). */
1450 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1451     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1452
1453
1454 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1455  * be passed in completely with the variable name being tested, which isn't
1456  * such a clean interface, but this is easier to read than it was before.  We
1457  * are looking for the boundary (or non-boundary) between a word and non-word
1458  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1459  * must be different.  Find the "wordness" of the character just prior to this
1460  * one, and compare it with the wordness of this one.  If they differ, we have
1461  * a boundary.  At the beginning of the string, pretend that the previous
1462  * character was a new-line */
/* Parameters: UTF8_CODE is a complete statement sequence (in this file a
 * UTF8_LOAD(...) or UTF8_NOLOAD(...) expansion) that runs when the
 * caller's 'utf8_target' is set; TEST_NON_UTF8 is a macro name applied to
 * single bytes on the other path; IF_SUCCESS / IF_FAIL are the statements
 * run when the wordness does / does not change at 's'.
 * After either scan, one more match attempt is made at the position where
 * the scan stopped, but only when prog->minlen is 0 and 'tmp' is
 * non-zero; as elsewhere, the attempt counts as successful when reginfo
 * is NULL.
 * The expansion is two top-level statements and is not wrapped in
 * STMT_START/STMT_END. */
1463 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1464     if (utf8_target) {                                                         \
1465                 UTF8_CODE \
1466     }                                                                          \
1467     else {  /* Not utf8 */                                                     \
1468         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1469         tmp = TEST_NON_UTF8(tmp);                                              \
1470         REXEC_FBC_SCAN(                                                        \
1471             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1472                 tmp = !tmp;                                                    \
1473                 IF_SUCCESS;                                                    \
1474             }                                                                  \
1475             else {                                                             \
1476                 IF_FAIL;                                                       \
1477             }                                                                  \
1478         );                                                                     \
1479     }                                                                          \
1480     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1481         goto got_it;
1482
1483 /* We know what class REx starts with.  Try to find this position... */
1484 /* if reginfo is NULL, it's a dry run */
1485 /* annoyingly all the vars in this routine have different names from their counterparts
1486    in regmatch. /grrr */
1487
1488 STATIC char *
1489 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1490     const char *strend, regmatch_info *reginfo)
1491 {
1492     dVAR;
1493     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1494     char *pat_string;   /* The pattern's exactish string */
1495     char *pat_end;          /* ptr to end char of pat_string */
1496     re_fold_t folder;   /* Function for computing non-utf8 folds */
1497     const U8 *fold_array;   /* array for folding ords < 256 */
1498     STRLEN ln;
1499     STRLEN lnc;
1500     STRLEN uskip;
1501     U8 c1;
1502     U8 c2;
1503     char *e;
1504     I32 tmp = 1;        /* Scratch variable? */
1505     const bool utf8_target = PL_reg_match_utf8;
1506     UV utf8_fold_flags = 0;
1507     RXi_GET_DECL(prog,progi);
1508
1509     PERL_ARGS_ASSERT_FIND_BYCLASS;
1510
1511     /* We know what class it must start with. */
1512     switch (OP(c)) {
1513     case ANYOF:
1514         if (utf8_target) {
1515             REXEC_FBC_UTF8_CLASS_SCAN(
1516                       reginclass(prog, c, (U8*)s, utf8_target));
1517         }
1518         else {
1519             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1520         }
1521         break;
1522     case CANY:
1523         REXEC_FBC_SCAN(
1524             if (tmp && (!reginfo || regtry(reginfo, &s)))
1525                 goto got_it;
1526             else
1527                 tmp = doevery;
1528         );
1529         break;
1530
1531     case EXACTFA:
1532         if (UTF_PATTERN || utf8_target) {
1533             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1534             goto do_exactf_utf8;
1535         }
1536         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1537         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1538         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1539
1540     case EXACTF:
1541         if (utf8_target) {
1542
1543             /* regcomp.c already folded this if pattern is in UTF-8 */
1544             utf8_fold_flags = 0;
1545             goto do_exactf_utf8;
1546         }
1547         fold_array = PL_fold;
1548         folder = foldEQ;
1549         goto do_exactf_non_utf8;
1550
1551     case EXACTFL:
1552         if (UTF_PATTERN || utf8_target) {
1553             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1554             goto do_exactf_utf8;
1555         }
1556         fold_array = PL_fold_locale;
1557         folder = foldEQ_locale;
1558         goto do_exactf_non_utf8;
1559
1560     case EXACTFU_SS:
1561         if (UTF_PATTERN) {
1562             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1563         }
1564         goto do_exactf_utf8;
1565
1566     case EXACTFU_TRICKYFOLD:
1567     case EXACTFU:
1568         if (UTF_PATTERN || utf8_target) {
1569             utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1570             goto do_exactf_utf8;
1571         }
1572
1573         /* Any 'ss' in the pattern should have been replaced by regcomp,
1574          * so we don't have to worry here about this single special case
1575          * in the Latin1 range */
1576         fold_array = PL_fold_latin1;
1577         folder = foldEQ_latin1;
1578
1579         /* FALL THROUGH */
1580
1581     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1582                            are no glitches with fold-length differences
1583                            between the target string and pattern */
1584
1585         /* The idea in the non-utf8 EXACTF* cases is to first find the
1586          * first character of the EXACTF* node and then, if necessary,
1587          * case-insensitively compare the full text of the node.  c1 is the
1588          * first character.  c2 is its fold.  This logic will not work for
1589          * Unicode semantics and the german sharp ss, which hence should
1590          * not be compiled into a node that gets here. */
1591         pat_string = STRING(c);
1592         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1593
1594         /* We know that we have to match at least 'ln' bytes (which is the
1595          * same as characters, since not utf8).  If we have to match 3
1596          * characters, and there are only 2 available, we know without
1597          * trying that it will fail; so don't start a match past the
1598          * required minimum number from the far end */
1599         e = HOP3c(strend, -((I32)ln), s);
1600
1601         if (!reginfo && e < s) {
1602             e = s;                      /* Due to minlen logic of intuit() */
1603         }
1604
1605         c1 = *pat_string;
1606         c2 = fold_array[c1];
1607         if (c1 == c2) { /* If char and fold are the same */
1608             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1609         }
1610         else {
1611             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1612         }
1613         break;
1614
1615     do_exactf_utf8:
1616     {
1617         unsigned expansion;
1618
1619         /* If one of the operands is in utf8, we can't use the simpler folding
1620          * above, due to the fact that many different characters can have the
1621          * same fold, or portion of a fold, or different-length fold */
1622         pat_string = STRING(c);
1623         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1624         pat_end = pat_string + ln;
1625         lnc = (UTF_PATTERN)     /* length to match in characters */
1626                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1627                 : ln;
1628
1629         /* We have 'lnc' characters to match in the pattern, but because of
1630          * multi-character folding, each character in the target can match
1631          * up to 3 characters (Unicode guarantees it will never exceed
1632          * this) if it is utf8-encoded; and up to 2 if not (based on the
1633          * fact that the Latin 1 folds are already determined, and the
1634          * only multi-char fold in that range is the sharp-s folding to
1635          * 'ss').  Thus, a pattern character can match as little as 1/3 of a
1636          * string character.  Adjust lnc accordingly, rounding up, so that
1637          * if we need to match at least 4+1/3 chars, that really is 5. */
1638         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1639         lnc = (lnc + expansion - 1) / expansion;
1640
1641         /* As in the non-UTF8 case, if we have to match 3 characters, and
1642          * only 2 are left, it's guaranteed to fail, so don't start a
1643          * match that would require us to go beyond the end of the string
1644          */
1645         e = HOP3c(strend, -((I32)lnc), s);
1646
1647         if (!reginfo && e < s) {
1648             e = s;                      /* Due to minlen logic of intuit() */
1649         }
1650
1651         /* XXX Note that we could recalculate e to stop the loop earlier,
1652          * as the worst case expansion above will rarely be met, and as we
1653          * go along we would usually find that e moves further to the left.
1654          * This would happen only after we reached the point in the loop
1655          * where if there were no expansion we should fail.  Unclear if
1656          * worth the expense */
1657
1658         while (s <= e) {
1659             char *my_strend= (char *)strend;
1660             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1661                   pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1662                 && (!reginfo || regtry(reginfo, &s)) )
1663             {
1664                 goto got_it;
1665             }
1666             s += (utf8_target) ? UTF8SKIP(s) : 1;
1667         }
1668         break;
1669     }
1670     case BOUNDL:
1671         PL_reg_flags |= RF_tainted;
1672         FBC_BOUND(isALNUM_LC,
1673                   isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1674                   isALNUM_LC_utf8((U8*)s));
1675         break;
1676     case NBOUNDL:
1677         PL_reg_flags |= RF_tainted;
1678         FBC_NBOUND(isALNUM_LC,
1679                    isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1680                    isALNUM_LC_utf8((U8*)s));
1681         break;
1682     case BOUND:
1683         FBC_BOUND(isWORDCHAR,
1684                   isALNUM_uni(tmp),
1685                   cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1686         break;
1687     case BOUNDA:
1688         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1689                          isWORDCHAR_A(tmp),
1690                          isWORDCHAR_A((U8*)s));
1691         break;
1692     case NBOUND:
1693         FBC_NBOUND(isWORDCHAR,
1694                    isALNUM_uni(tmp),
1695                    cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1696         break;
1697     case NBOUNDA:
1698         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1699                           isWORDCHAR_A(tmp),
1700                           isWORDCHAR_A((U8*)s));
1701         break;
1702     case BOUNDU:
1703         FBC_BOUND(isWORDCHAR_L1,
1704                   isALNUM_uni(tmp),
1705                   cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1706         break;
1707     case NBOUNDU:
1708         FBC_NBOUND(isWORDCHAR_L1,
1709                    isALNUM_uni(tmp),
1710                    cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1711         break;
1712     case ALNUML:
1713         REXEC_FBC_CSCAN_TAINT(
1714             isALNUM_LC_utf8((U8*)s),
1715             isALNUM_LC(*s)
1716         );
1717         break;
1718     case ALNUMU:
1719         REXEC_FBC_CSCAN_PRELOAD(
1720             LOAD_UTF8_CHARCLASS_ALNUM(),
1721             swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1722             isWORDCHAR_L1((U8) *s)
1723         );
1724         break;
1725     case ALNUM:
1726         REXEC_FBC_CSCAN_PRELOAD(
1727             LOAD_UTF8_CHARCLASS_ALNUM(),
1728             swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1729             isWORDCHAR((U8) *s)
1730         );
1731         break;
1732     case ALNUMA:
1733         /* Don't need to worry about utf8, as it can match only a single
1734          * byte invariant character */
1735         REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1736         break;
1737     case NALNUMU:
1738         REXEC_FBC_CSCAN_PRELOAD(
1739             LOAD_UTF8_CHARCLASS_ALNUM(),
1740             !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1741             ! isWORDCHAR_L1((U8) *s)
1742         );
1743         break;
1744     case NALNUM:
1745         REXEC_FBC_CSCAN_PRELOAD(
1746             LOAD_UTF8_CHARCLASS_ALNUM(),
1747             !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1748             ! isALNUM(*s)
1749         );
1750         break;
1751     case NALNUMA:
1752         REXEC_FBC_CSCAN(
1753             !isWORDCHAR_A(*s),
1754             !isWORDCHAR_A(*s)
1755         );
1756         break;
1757     case NALNUML:
1758         REXEC_FBC_CSCAN_TAINT(
1759             !isALNUM_LC_utf8((U8*)s),
1760             !isALNUM_LC(*s)
1761         );
1762         break;
1763     case SPACEU:
1764         REXEC_FBC_CSCAN(
1765             is_XPERLSPACE_utf8(s),
1766             isSPACE_L1((U8) *s)
1767         );
1768         break;
1769     case SPACE:
1770         REXEC_FBC_CSCAN(
1771             is_XPERLSPACE_utf8(s),
1772             isSPACE((U8) *s)
1773         );
1774         break;
1775     case SPACEA:
1776         /* Don't need to worry about utf8, as it can match only a single
1777          * byte invariant character */
1778         REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1779         break;
1780     case SPACEL:
1781         REXEC_FBC_CSCAN_TAINT(
1782             isSPACE_LC_utf8((U8*)s),
1783             isSPACE_LC(*s)
1784         );
1785         break;
1786     case NSPACEU:
1787         REXEC_FBC_CSCAN(
1788             ! is_XPERLSPACE_utf8(s),
1789             ! isSPACE_L1((U8) *s)
1790         );
1791         break;
1792     case NSPACE:
1793         REXEC_FBC_CSCAN(
1794             ! is_XPERLSPACE_utf8(s),
1795             ! isSPACE((U8) *s)
1796         );
1797         break;
1798     case NSPACEA:
1799         REXEC_FBC_CSCAN(
1800             !isSPACE_A(*s),
1801             !isSPACE_A(*s)
1802         );
1803         break;
1804     case NSPACEL:
1805         REXEC_FBC_CSCAN_TAINT(
1806             !isSPACE_LC_utf8((U8*)s),
1807             !isSPACE_LC(*s)
1808         );
1809         break;
1810     case DIGIT:
1811         REXEC_FBC_CSCAN_PRELOAD(
1812             LOAD_UTF8_CHARCLASS_DIGIT(),
1813             swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1814             isDIGIT(*s)
1815         );
1816         break;
1817     case DIGITA:
1818         /* Don't need to worry about utf8, as it can match only a single
1819          * byte invariant character */
1820         REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1821         break;
1822     case DIGITL:
1823         REXEC_FBC_CSCAN_TAINT(
1824             isDIGIT_LC_utf8((U8*)s),
1825             isDIGIT_LC(*s)
1826         );
1827         break;
1828     case NDIGIT:
1829         REXEC_FBC_CSCAN_PRELOAD(
1830             LOAD_UTF8_CHARCLASS_DIGIT(),
1831             !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1832             !isDIGIT(*s)
1833         );
1834         break;
1835     case NDIGITA:
1836         REXEC_FBC_CSCAN(
1837             !isDIGIT_A(*s),
1838             !isDIGIT_A(*s)
1839         );
1840         break;
1841     case NDIGITL:
1842         REXEC_FBC_CSCAN_TAINT(
1843             !isDIGIT_LC_utf8((U8*)s),
1844             !isDIGIT_LC(*s)
1845         );
1846         break;
1847     case LNBREAK:
1848         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1849                         is_LNBREAK_latin1_safe(s, strend)
1850         );
1851         break;
1852     case VERTWS:
1853         REXEC_FBC_CSCAN(
1854             is_VERTWS_utf8_safe(s, strend),
1855             is_VERTWS_latin1_safe(s, strend)
1856         );
1857         break;
1858     case NVERTWS:
1859         REXEC_FBC_CSCAN(
1860             !is_VERTWS_utf8_safe(s, strend),
1861             !is_VERTWS_latin1_safe(s, strend)
1862         );
1863         break;
1864     case HORIZWS:
1865         REXEC_FBC_CSCAN(
1866             is_HORIZWS_utf8_safe(s, strend),
1867             is_HORIZWS_latin1_safe(s, strend)
1868         );
1869         break;
1870     case NHORIZWS:
1871         REXEC_FBC_CSCAN(
1872             !is_HORIZWS_utf8_safe(s, strend),
1873             !is_HORIZWS_latin1_safe(s, strend)
1874         );
1875         break;
1876     case POSIXA:
1877         /* Don't need to worry about utf8, as it can match only a single
1878         * byte invariant character.  The flag in this node type is the
1879         * class number to pass to _generic_isCC() to build a mask for
1880         * searching in PL_charclass[] */
1881         REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1882         break;
1883     case NPOSIXA:
1884         REXEC_FBC_CSCAN(
1885             !_generic_isCC_A(*s, FLAGS(c)),
1886             !_generic_isCC_A(*s, FLAGS(c))
1887         );
1888         break;
1889
1890     case AHOCORASICKC:
1891     case AHOCORASICK:
1892         {
1893             DECL_TRIE_TYPE(c);
1894             /* what trie are we using right now */
1895             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1896             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1897             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1898
1899             const char *last_start = strend - trie->minlen;
1900 #ifdef DEBUGGING
1901             const char *real_start = s;
1902 #endif
1903             STRLEN maxlen = trie->maxlen;
1904             SV *sv_points;
1905             U8 **points; /* map of where we were in the input string
1906                             when reading a given char. For ASCII this
1907                             is unnecessary overhead as the relationship
1908                             is always 1:1, but for Unicode, especially
1909                             case folded Unicode this is not true. */
1910             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1911             U8 *bitmap=NULL;
1912
1913
1914             GET_RE_DEBUG_FLAGS_DECL;
1915
1916             /* We can't just allocate points here. We need to wrap it in
1917              * an SV so it gets freed properly if there is a croak while
1918              * running the match */
1919             ENTER;
1920             SAVETMPS;
1921             sv_points=newSV(maxlen * sizeof(U8 *));
1922             SvCUR_set(sv_points,
1923                 maxlen * sizeof(U8 *));
1924             SvPOK_on(sv_points);
1925             sv_2mortal(sv_points);
1926             points=(U8**)SvPV_nolen(sv_points );
1927             if ( trie_type != trie_utf8_fold
1928                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1929             {
1930                 if (trie->bitmap)
1931                     bitmap=(U8*)trie->bitmap;
1932                 else
1933                     bitmap=(U8*)ANYOF_BITMAP(c);
1934             }
1935             /* this is the Aho-Corasick algorithm modified a touch
1936                to include special handling for long "unknown char" sequences.
1937                The basic idea being that we use AC as long as we are dealing
1938                with a possible matching char, when we encounter an unknown char
1939                (and we have not encountered an accepting state) we scan forward
1940                until we find a legal starting char.
1941                AC matching is basically that of trie matching, except that when
1942                we encounter a failing transition, we fall back to the current
1943                states "fail state", and try the current char again, a process
1944                we repeat until we reach the root state, state 1, or a legal
1945                transition. If we fail on the root state then we can either
1946                terminate if we have reached an accepting state previously, or
1947                restart the entire process from the beginning if we have not.
1948
1949              */
1950             while (s <= last_start) {
1951                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1952                 U8 *uc = (U8*)s;
1953                 U16 charid = 0;
1954                 U32 base = 1;
1955                 U32 state = 1;
1956                 UV uvc = 0;
1957                 STRLEN len = 0;
1958                 STRLEN foldlen = 0;
1959                 U8 *uscan = (U8*)NULL;
1960                 U8 *leftmost = NULL;
1961 #ifdef DEBUGGING
1962                 U32 accepted_word= 0;
1963 #endif
1964                 U32 pointpos = 0;
1965
1966                 while ( state && uc <= (U8*)strend ) {
1967                     int failed=0;
1968                     U32 word = aho->states[ state ].wordnum;
1969
1970                     if( state==1 ) {
1971                         if ( bitmap ) {
1972                             DEBUG_TRIE_EXECUTE_r(
1973                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1974                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1975                                         (char *)uc, utf8_target );
1976                                     PerlIO_printf( Perl_debug_log,
1977                                         " Scanning for legal start char...\n");
1978                                 }
1979                             );
1980                             if (utf8_target) {
1981                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1982                                     uc += UTF8SKIP(uc);
1983                                 }
1984                             } else {
1985                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1986                                     uc++;
1987                                 }
1988                             }
1989                             s= (char *)uc;
1990                         }
1991                         if (uc >(U8*)last_start) break;
1992                     }
1993
1994                     if ( word ) {
1995                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1996                         if (!leftmost || lpos < leftmost) {
1997                             DEBUG_r(accepted_word=word);
1998                             leftmost= lpos;
1999                         }
2000                         if (base==0) break;
2001
2002                     }
2003                     points[pointpos++ % maxlen]= uc;
2004                     if (foldlen || uc < (U8*)strend) {
2005                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2006                                          widecharmap, uc,
2007                                          uscan, len, uvc, charid, foldlen,
2008                                          foldbuf, uniflags);
2009                         DEBUG_TRIE_EXECUTE_r({
2010                             dump_exec_pos( (char *)uc, c, strend,
2011                                         real_start, s, utf8_target);
2012                             PerlIO_printf(Perl_debug_log,
2013                                 " Charid:%3u CP:%4"UVxf" ",
2014                                  charid, uvc);
2015                         });
2016                     }
2017                     else {
2018                         len = 0;
2019                         charid = 0;
2020                     }
2021
2022
2023                     do {
2024 #ifdef DEBUGGING
2025                         word = aho->states[ state ].wordnum;
2026 #endif
2027                         base = aho->states[ state ].trans.base;
2028
2029                         DEBUG_TRIE_EXECUTE_r({
2030                             if (failed)
2031                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2032                                     s,   utf8_target );
2033                             PerlIO_printf( Perl_debug_log,
2034                                 "%sState: %4"UVxf", word=%"UVxf,
2035                                 failed ? " Fail transition to " : "",
2036                                 (UV)state, (UV)word);
2037                         });
2038                         if ( base ) {
2039                             U32 tmp;
2040                             I32 offset;
2041                             if (charid &&
2042                                  ( ((offset = base + charid
2043                                     - 1 - trie->uniquecharcount)) >= 0)
2044                                  && ((U32)offset < trie->lasttrans)
2045                                  && trie->trans[offset].check == state
2046                                  && (tmp=trie->trans[offset].next))
2047                             {
2048                                 DEBUG_TRIE_EXECUTE_r(
2049                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2050                                 state = tmp;
2051                                 break;
2052                             }
2053                             else {
2054                                 DEBUG_TRIE_EXECUTE_r(
2055                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2056                                 failed = 1;
2057                                 state = aho->fail[state];
2058                             }
2059                         }
2060                         else {
2061                             /* we must be accepting here */
2062                             DEBUG_TRIE_EXECUTE_r(
2063                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2064                             failed = 1;
2065                             break;
2066                         }
2067                     } while(state);
2068                     uc += len;
2069                     if (failed) {
2070                         if (leftmost)
2071                             break;
2072                         if (!state) state = 1;
2073                     }
2074                 }
2075                 if ( aho->states[ state ].wordnum ) {
2076                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2077                     if (!leftmost || lpos < leftmost) {
2078                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2079                         leftmost = lpos;
2080                     }
2081                 }
2082                 if (leftmost) {
2083                     s = (char*)leftmost;
2084                     DEBUG_TRIE_EXECUTE_r({
2085                         PerlIO_printf(
2086                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2087                             (UV)accepted_word, (IV)(s - real_start)
2088                         );
2089                     });
2090                     if (!reginfo || regtry(reginfo, &s)) {
2091                         FREETMPS;
2092                         LEAVE;
2093                         goto got_it;
2094                     }
2095                     s = HOPc(s,1);
2096                     DEBUG_TRIE_EXECUTE_r({
2097                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2098                     });
2099                 } else {
2100                     DEBUG_TRIE_EXECUTE_r(
2101                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2102                     break;
2103                 }
2104             }
2105             FREETMPS;
2106             LEAVE;
2107         }
2108         break;
2109     default:
2110         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2111         break;
2112     }
2113     return 0;
2114   got_it:
2115     return s;
2116 }
2117
2118
2119 /*
2120  - regexec_flags - match a regexp against a string
2121  */
2122 I32
2123 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2124               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2125 /* stringarg: the point in the string at which to begin matching */
2126 /* strend:    pointer to null at end of string */
2127 /* strbeg:    real beginning of string */
2128 /* minend:    end of match must be >= minend bytes after stringarg. */
2129 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2130  *            itself is accessed via the pointers above */
2131 /* data:      May be used for some additional optimizations.
2132               Currently it's only used (converted via PTR2UV) for transmitting
2133               the ganch offset when doing a /g match. This will change */
2134 /* nosave:    For optimizations. */
2135
2136 {
2137     dVAR;
2138     struct regexp *const prog = ReANY(rx);
2139     char *s;
2140     regnode *c;
2141     char *startpos = stringarg;
2142     I32 minlen;         /* must match at least this many chars */
2143     I32 dontbother = 0; /* how many characters not to try at end */
2144     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2145     I32 scream_pos = -1;                /* Internal iterator of scream. */
2146     char *scream_olds = NULL;
2147     const bool utf8_target = cBOOL(DO_UTF8(sv));
2148     I32 multiline;
2149     RXi_GET_DECL(prog,progi);
2150     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2151     regexp_paren_pair *swap = NULL;
2152     GET_RE_DEBUG_FLAGS_DECL;
2153
2154     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2155     PERL_UNUSED_ARG(data);
2156
2157     /* Be paranoid... */
2158     if (prog == NULL || startpos == NULL) {
2159         Perl_croak(aTHX_ "NULL regexp parameter");
2160         return 0;
2161     }
2162
2163     multiline = prog->extflags & RXf_PMf_MULTILINE;
2164     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2165
2166     RX_MATCH_UTF8_set(rx, utf8_target);
2167     DEBUG_EXECUTE_r(
2168         debug_start_match(rx, utf8_target, startpos, strend,
2169         "Matching");
2170     );
2171
2172     minlen = prog->minlen;
2173
2174     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2175         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2176                               "String too short [regexec_flags]...\n"));
2177         goto phooey;
2178     }
2179
2180
2181     /* Check validity of program. */
2182     if (UCHARAT(progi->program) != REG_MAGIC) {
2183         Perl_croak(aTHX_ "corrupted regexp program");
2184     }
2185
2186     PL_reg_flags = 0;
2187     PL_reg_state.re_state_eval_setup_done = FALSE;
2188     PL_reg_maxiter = 0;
2189
2190     if (RX_UTF8(rx))
2191         PL_reg_flags |= RF_utf8;
2192
2193     /* Mark beginning of line for ^ and lookbehind. */
2194     reginfo.bol = startpos; /* XXX not used ??? */
2195     PL_bostr  = strbeg;
2196     reginfo.sv = sv;
2197
2198     /* Mark end of line for $ (and such) */
2199     PL_regeol = strend;
2200
2201     /* see how far we have to get to not match where we matched before */
2202     reginfo.till = startpos+minend;
2203
2204     /* If there is a "must appear" string, look for it. */
2205     s = startpos;
2206
2207     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2208         MAGIC *mg;
2209         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2210             reginfo.ganch = startpos + prog->gofs;
2211             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2212               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2213         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2214                   && SvMAGIC(sv)
2215                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2216                   && mg->mg_len >= 0) {
2217             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2218             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2219                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2220
2221             if (prog->extflags & RXf_ANCH_GPOS) {
2222                 if (s > reginfo.ganch)
2223                     goto phooey;
2224                 s = reginfo.ganch - prog->gofs;
2225                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2226                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2227                 if (s < strbeg)
2228                     goto phooey;
2229             }
2230         }
2231         else if (data) {
2232             reginfo.ganch = strbeg + PTR2UV(data);
2233             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2234                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2235
2236         } else {                                /* pos() not defined */
2237             reginfo.ganch = strbeg;
2238             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2239                  "GPOS: reginfo.ganch = strbeg\n"));
2240         }
2241     }
2242     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2243         /* We have to be careful. If the previous successful match
2244            was from this regex we don't want a subsequent partially
2245            successful match to clobber the old results.
2246            So when we detect this possibility we add a swap buffer
2247            to the re, and switch the buffer each match. If we fail
2248            we switch it back, otherwise we leave it swapped.
2249         */
2250         swap = prog->offs;
2251         /* do we need a save destructor here for eval dies? */
2252         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2253         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2254             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2255             PTR2UV(prog),
2256             PTR2UV(swap),
2257             PTR2UV(prog->offs)
2258         ));
2259     }
2260     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2261         re_scream_pos_data d;
2262
2263         d.scream_olds = &scream_olds;
2264         d.scream_pos = &scream_pos;
2265         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2266         if (!s) {
2267             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2268             goto phooey;        /* not present */
2269         }
2270     }
2271
2272
2273
2274     /* Simplest case:  anchored match need be tried only once. */
2275     /*  [unless only anchor is BOL and multiline is set] */
2276     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2277         if (s == startpos && regtry(&reginfo, &startpos))
2278             goto got_it;
2279         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2280                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2281         {
2282             char *end;
2283
2284             if (minlen)
2285                 dontbother = minlen - 1;
2286             end = HOP3c(strend, -dontbother, strbeg) - 1;
2287             /* for multiline we only have to try after newlines */
2288             if (prog->check_substr || prog->check_utf8) {
2289                 /* because of the goto we can not easily reuse the macros for bifurcating the
2290                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2291                 if (utf8_target) {
2292                     if (s == startpos)
2293                         goto after_try_utf8;
2294                     while (1) {
2295                         if (regtry(&reginfo, &s)) {
2296                             goto got_it;
2297                         }
2298                       after_try_utf8:
2299                         if (s > end) {
2300                             goto phooey;
2301                         }
2302                         if (prog->extflags & RXf_USE_INTUIT) {
2303                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2304                             if (!s) {
2305                                 goto phooey;
2306                             }
2307                         }
2308                         else {
2309                             s += UTF8SKIP(s);
2310                         }
2311                     }
2312                 } /* end search for check string in unicode */
2313                 else {
2314                     if (s == startpos) {
2315                         goto after_try_latin;
2316                     }
2317                     while (1) {
2318                         if (regtry(&reginfo, &s)) {
2319                             goto got_it;
2320                         }
2321                       after_try_latin:
2322                         if (s > end) {
2323                             goto phooey;
2324                         }
2325                         if (prog->extflags & RXf_USE_INTUIT) {
2326                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2327                             if (!s) {
2328                                 goto phooey;
2329                             }
2330                         }
2331                         else {
2332                             s++;
2333                         }
2334                     }
2335                 } /* end search for check string in latin*/
2336             } /* end search for check string */
2337             else { /* search for newline */
2338                 if (s > startpos) {
2339                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2340                     s--;
2341                 }
2342                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2343                 while (s <= end) { /* note it could be possible to match at the end of the string */
2344                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2345                         if (regtry(&reginfo, &s))
2346                             goto got_it;
2347                     }
2348                 }
2349             } /* end search for newline */
2350         } /* end anchored/multiline check string search */
2351         goto phooey;
2352     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2353     {
2354         /* the warning about reginfo.ganch being used without initialization
2355            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2356            and we only enter this block when the same bit is set. */
2357         char *tmp_s = reginfo.ganch - prog->gofs;
2358
2359         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2360             goto got_it;
2361         goto phooey;
2362     }
2363
2364     /* Messy cases:  unanchored match. */
2365     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2366         /* we have /x+whatever/ */
2367         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2368         char ch;
2369 #ifdef DEBUGGING
2370         int did_match = 0;
2371 #endif
2372         if (utf8_target) {
2373             if (! prog->anchored_utf8) {
2374                 to_utf8_substr(prog);
2375             }
2376             ch = SvPVX_const(prog->anchored_utf8)[0];
2377             REXEC_FBC_SCAN(
2378                 if (*s == ch) {
2379                     DEBUG_EXECUTE_r( did_match = 1 );
2380                     if (regtry(&reginfo, &s)) goto got_it;
2381                     s += UTF8SKIP(s);
2382                     while (s < strend && *s == ch)
2383                         s += UTF8SKIP(s);
2384                 }
2385             );
2386
2387         }
2388         else {
2389             if (! prog->anchored_substr) {
2390                 if (! to_byte_substr(prog)) {
2391                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2392                 }
2393             }
2394             ch = SvPVX_const(prog->anchored_substr)[0];
2395             REXEC_FBC_SCAN(
2396                 if (*s == ch) {
2397                     DEBUG_EXECUTE_r( did_match = 1 );
2398                     if (regtry(&reginfo, &s)) goto got_it;
2399                     s++;
2400                     while (s < strend && *s == ch)
2401                         s++;
2402                 }
2403             );
2404         }
2405         DEBUG_EXECUTE_r(if (!did_match)
2406                 PerlIO_printf(Perl_debug_log,
2407                                   "Did not find anchored character...\n")
2408                );
2409     }
2410     else if (prog->anchored_substr != NULL
2411               || prog->anchored_utf8 != NULL
2412               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2413                   && prog->float_max_offset < strend - s)) {
2414         SV *must;
2415         I32 back_max;
2416         I32 back_min;
2417         char *last;
2418         char *last1;            /* Last position checked before */
2419 #ifdef DEBUGGING
2420         int did_match = 0;
2421 #endif
2422         if (prog->anchored_substr || prog->anchored_utf8) {
2423             if (utf8_target) {
2424                 if (! prog->anchored_utf8) {
2425                     to_utf8_substr(prog);
2426                 }
2427                 must = prog->anchored_utf8;
2428             }
2429             else {
2430                 if (! prog->anchored_substr) {
2431                     if (! to_byte_substr(prog)) {
2432                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2433                     }
2434                 }
2435                 must = prog->anchored_substr;
2436             }
2437             back_max = back_min = prog->anchored_offset;
2438         } else {
2439             if (utf8_target) {
2440                 if (! prog->float_utf8) {
2441                     to_utf8_substr(prog);
2442                 }
2443                 must = prog->float_utf8;
2444             }
2445             else {
2446                 if (! prog->float_substr) {
2447                     if (! to_byte_substr(prog)) {
2448                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2449                     }
2450                 }
2451                 must = prog->float_substr;
2452             }
2453             back_max = prog->float_max_offset;
2454             back_min = prog->float_min_offset;
2455         }
2456
2457         if (back_min<0) {
2458             last = strend;
2459         } else {
2460             last = HOP3c(strend,        /* Cannot start after this */
2461                   -(I32)(CHR_SVLEN(must)
2462                          - (SvTAIL(must) != 0) + back_min), strbeg);
2463         }
2464         if (s > PL_bostr)
2465             last1 = HOPc(s, -1);
2466         else
2467             last1 = s - 1;      /* bogus */
2468
2469         /* XXXX check_substr already used to find "s", can optimize if
2470            check_substr==must. */
2471         scream_pos = -1;
2472         dontbother = end_shift;
2473         strend = HOPc(strend, -dontbother);
2474         while ( (s <= last) &&
2475                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2476                                   (unsigned char*)strend, must,
2477                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2478             DEBUG_EXECUTE_r( did_match = 1 );
2479             if (HOPc(s, -back_max) > last1) {
2480                 last1 = HOPc(s, -back_min);
2481                 s = HOPc(s, -back_max);
2482             }
2483             else {
2484                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2485
2486                 last1 = HOPc(s, -back_min);
2487                 s = t;
2488             }
2489             if (utf8_target) {
2490                 while (s <= last1) {
2491                     if (regtry(&reginfo, &s))
2492                         goto got_it;
2493                     if (s >= last1) {
2494                         s++; /* to break out of outer loop */
2495                         break;
2496                     }
2497                     s += UTF8SKIP(s);
2498                 }
2499             }
2500             else {
2501                 while (s <= last1) {
2502                     if (regtry(&reginfo, &s))
2503                         goto got_it;
2504                     s++;
2505                 }
2506             }
2507         }
2508         DEBUG_EXECUTE_r(if (!did_match) {
2509             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2510                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2511             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2512                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2513                                ? "anchored" : "floating"),
2514                 quoted, RE_SV_TAIL(must));
2515         });
2516         goto phooey;
2517     }
2518     else if ( (c = progi->regstclass) ) {
2519         if (minlen) {
2520             const OPCODE op = OP(progi->regstclass);
2521             /* don't bother with what can't match */
2522             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2523                 strend = HOPc(strend, -(minlen - 1));
2524         }
2525         DEBUG_EXECUTE_r({
2526             SV * const prop = sv_newmortal();
2527             regprop(prog, prop, c);
2528             {
2529                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2530                     s,strend-s,60);
2531                 PerlIO_printf(Perl_debug_log,
2532                     "Matching stclass %.*s against %s (%d bytes)\n",
2533                     (int)SvCUR(prop), SvPVX_const(prop),
2534                      quoted, (int)(strend - s));
2535             }
2536         });
2537         if (find_byclass(prog, c, s, strend, &reginfo))
2538             goto got_it;
2539         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2540     }
2541     else {
2542         dontbother = 0;
2543         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2544             /* Trim the end. */
2545             char *last= NULL;
2546             SV* float_real;
2547             STRLEN len;
2548             const char *little;
2549
2550             if (utf8_target) {
2551                 if (! prog->float_utf8) {
2552                     to_utf8_substr(prog);
2553                 }
2554                 float_real = prog->float_utf8;
2555             }
2556             else {
2557                 if (! prog->float_substr) {
2558                     if (! to_byte_substr(prog)) {
2559                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2560                     }
2561                 }
2562                 float_real = prog->float_substr;
2563             }
2564
2565             little = SvPV_const(float_real, len);
2566             if (SvTAIL(float_real)) {
2567                     /* This means that float_real contains an artificial \n on
2568                      * the end due to the presence of something like this:
2569                      * /foo$/ where we can match both "foo" and "foo\n" at the
2570                      * end of the string.  So we have to compare the end of the
2571                      * string first against the float_real without the \n and
2572                      * then against the full float_real with the \n.  We
2573                      * have to watch out for cases where the string might be
2574                      * smaller than the float_real or the float_real without
2575                      * the \n. */
2576                     char *checkpos= strend - len;
2577                     DEBUG_OPTIMISE_r(
2578                         PerlIO_printf(Perl_debug_log,
2579                             "%sChecking for float_real.%s\n",
2580                             PL_colors[4], PL_colors[5]));
2581                     if (checkpos + 1 < strbeg) {
2582                         /* can't match, even if we remove the trailing \n
2583                          * string is too short to match */
2584                         DEBUG_EXECUTE_r(
2585                             PerlIO_printf(Perl_debug_log,
2586                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2587                                 PL_colors[4], PL_colors[5]));
2588                         goto phooey;
2589                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2590                         /* can match, the end of the string matches without the
2591                          * "\n" */
2592                         last = checkpos + 1;
2593                     } else if (checkpos < strbeg) {
2594                         /* can't match, string is too short when the "\n" is
2595                          * included */
2596                         DEBUG_EXECUTE_r(
2597                             PerlIO_printf(Perl_debug_log,
2598                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2599                                 PL_colors[4], PL_colors[5]));
2600                         goto phooey;
2601                     } else if (!multiline) {
2602                         /* non multiline match, so compare with the "\n" at the
2603                          * end of the string */
2604                         if (memEQ(checkpos, little, len)) {
2605                             last= checkpos;
2606                         } else {
2607                             DEBUG_EXECUTE_r(
2608                                 PerlIO_printf(Perl_debug_log,
2609                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2610                                     PL_colors[4], PL_colors[5]));
2611                             goto phooey;
2612                         }
2613                     } else {
2614                         /* multiline match, so we have to search for a place
2615                          * where the full string is located */
2616                         goto find_last;
2617                     }
2618             } else {
2619                   find_last:
2620                     if (len)
2621                         last = rninstr(s, strend, little, little + len);
2622                     else
2623                         last = strend;  /* matching "$" */
2624             }
2625             if (!last) {
2626                 /* at one point this block contained a comment which was
2627                  * probably incorrect, which said that this was a "should not
2628                  * happen" case.  Even if it was true when it was written I am
2629                  * pretty sure it is not anymore, so I have removed the comment
2630                  * and replaced it with this one. Yves */
2631                 DEBUG_EXECUTE_r(
2632                     PerlIO_printf(Perl_debug_log,
2633                         "String does not contain required substring, cannot match.\n"
2634                     ));
2635                 goto phooey;
2636             }
2637             dontbother = strend - last + prog->float_min_offset;
2638         }
2639         if (minlen && (dontbother < minlen))
2640             dontbother = minlen - 1;
2641         strend -= dontbother;              /* this one's always in bytes! */
2642         /* We don't know much -- general case. */
2643         if (utf8_target) {
2644             for (;;) {
2645                 if (regtry(&reginfo, &s))
2646                     goto got_it;
2647                 if (s >= strend)
2648                     break;
2649                 s += UTF8SKIP(s);
2650             };
2651         }
2652         else {
2653             do {
2654                 if (regtry(&reginfo, &s))
2655                     goto got_it;
2656             } while (s++ < strend);
2657         }
2658     }
2659
2660     /* Failure. */
2661     goto phooey;
2662
2663 got_it:
2664     DEBUG_BUFFERS_r(
2665         if (swap)
2666             PerlIO_printf(Perl_debug_log,
2667                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2668                 PTR2UV(prog),
2669                 PTR2UV(swap)
2670             );
2671     );
2672     Safefree(swap);
2673     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2674
2675     if (PL_reg_state.re_state_eval_setup_done)
2676         restore_pos(aTHX_ prog);
2677     if (RXp_PAREN_NAMES(prog))
2678         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2679
2680     /* make sure $`, $&, $', and $digit will work later */
2681     if ( !(flags & REXEC_NOT_FIRST) ) {
2682         if (flags & REXEC_COPY_STR) {
2683 #ifdef PERL_ANY_COW
2684             if (SvCANCOW(sv)) {
2685                 if (DEBUG_C_TEST) {
2686                     PerlIO_printf(Perl_debug_log,
2687                                   "Copy on write: regexp capture, type %d\n",
2688                                   (int) SvTYPE(sv));
2689                 }
2690                 RX_MATCH_COPY_FREE(rx);
2691                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2692                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2693                 assert (SvPOKp(prog->saved_copy));
2694                 prog->sublen  = PL_regeol - strbeg;
2695                 prog->suboffset = 0;
2696                 prog->subcoffset = 0;
2697             } else
2698 #endif
2699             {
2700                 I32 min = 0;
2701                 I32 max = PL_regeol - strbeg;
2702                 I32 sublen;
2703
2704                 if (    (flags & REXEC_COPY_SKIP_POST)
2705                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2706                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2707                 ) { /* don't copy $' part of string */
2708                     U32 n = 0;
2709                     max = -1;
2710                     /* calculate the right-most part of the string covered
2711                      * by a capture. Due to look-ahead, this may be to
2712                      * the right of $&, so we have to scan all captures */
2713                     while (n <= prog->lastparen) {
2714                         if (prog->offs[n].end > max)
2715                             max = prog->offs[n].end;
2716                         n++;
2717                     }
2718                     if (max == -1)
2719                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2720                                 ? prog->offs[0].start
2721                                 : 0;
2722                     assert(max >= 0 && max <= PL_regeol - strbeg);
2723                 }
2724
2725                 if (    (flags & REXEC_COPY_SKIP_PRE)
2726                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2727                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2728                 ) { /* don't copy $` part of string */
2729                     U32 n = 0;
2730                     min = max;
2731                     /* calculate the left-most part of the string covered
2732                      * by a capture. Due to look-behind, this may be to
2733                      * the left of $&, so we have to scan all captures */
2734                     while (min && n <= prog->lastparen) {
2735                         if (   prog->offs[n].start != -1
2736                             && prog->offs[n].start < min)
2737                         {
2738                             min = prog->offs[n].start;
2739                         }
2740                         n++;
2741                     }
2742                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2743                         && min >  prog->offs[0].end
2744                     )
2745                         min = prog->offs[0].end;
2746
2747                 }
2748
2749                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2750                 sublen = max - min;
2751
2752                 if (RX_MATCH_COPIED(rx)) {
2753                     if (sublen > prog->sublen)
2754                         prog->subbeg =
2755                                 (char*)saferealloc(prog->subbeg, sublen+1);
2756                 }
2757                 else
2758                     prog->subbeg = (char*)safemalloc(sublen+1);
2759                 Copy(strbeg + min, prog->subbeg, sublen, char);
2760                 prog->subbeg[sublen] = '\0';
2761                 prog->suboffset = min;
2762                 prog->sublen = sublen;
2763                 RX_MATCH_COPIED_on(rx);
2764             }
2765             prog->subcoffset = prog->suboffset;
2766             if (prog->suboffset && utf8_target) {
2767                 /* Convert byte offset to chars.
2768                  * XXX ideally should only compute this if @-/@+
2769                  * has been seen, a la PL_sawampersand ??? */
2770
2771                 /* If there's a direct correspondence between the
2772                  * string which we're matching and the original SV,
2773                  * then we can use the utf8 len cache associated with
2774                  * the SV. In particular, it means that under //g,
2775                  * sv_pos_b2u() will use the previously cached
2776                  * position to speed up working out the new length of
2777                  * subcoffset, rather than counting from the start of
2778                  * the string each time. This stops
2779                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2780                  * from going quadratic */
2781                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2782                     sv_pos_b2u(sv, &(prog->subcoffset));
2783                 else
2784                     prog->subcoffset = utf8_length((U8*)strbeg,
2785                                         (U8*)(strbeg+prog->suboffset));
2786             }
2787         }
2788         else {
2789             RX_MATCH_COPY_FREE(rx);
2790             prog->subbeg = strbeg;
2791             prog->suboffset = 0;
2792             prog->subcoffset = 0;
2793             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2794         }
2795     }
2796
2797     return 1;
2798
2799 phooey:
2800     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2801                           PL_colors[4], PL_colors[5]));
2802     if (PL_reg_state.re_state_eval_setup_done)
2803         restore_pos(aTHX_ prog);
2804     if (swap) {
2805         /* we failed :-( roll it back */
2806         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2807             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2808             PTR2UV(prog),
2809             PTR2UV(prog->offs),
2810             PTR2UV(swap)
2811         ));
2812         Safefree(prog->offs);
2813         prog->offs = swap;
2814     }
2815     return 0;
2816 }
2817
2818
/* Make PL_reg_curpm refer to the regex Re2, handling ref counting.
 * Nothing is done unless PL_reg_state.re_state_eval_setup_done is set.
 * Do inc before dec, in case old and new rex are the same.
 *
 * Caveats visible from the expansion:
 *  - Re2 is expanded twice, so it should be free of side effects.
 *  - The macro expands to a bare 'if (...) { ... }' rather than a
 *    STMT_START/STMT_END statement, so using it as the body of an unbraced
 *    'if' that is followed by an 'else' would bind that 'else' to the
 *    macro's own 'if'. */
#define SET_reg_curpm(Re2) \
    if (PL_reg_state.re_state_eval_setup_done) {    \
        (void)ReREFCNT_inc(Re2);                    \
        ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
        PM_SETRE((PL_reg_curpm), (Re2));            \
    }
2827
2828
2829 /*
2830  - regtry - try match at specific point
2831  */
2832 STATIC I32                      /* 0 failure, 1 success */
2833 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2834 {
2835     dVAR;
2836     CHECKPOINT lastcp;
2837     REGEXP *const rx = reginfo->prog;
2838     regexp *const prog = ReANY(rx);
2839     I32 result;
2840     RXi_GET_DECL(prog,progi);
2841     GET_RE_DEBUG_FLAGS_DECL;
2842
2843     PERL_ARGS_ASSERT_REGTRY;
2844
2845     reginfo->cutpoint=NULL;
2846
2847     if ((prog->extflags & RXf_EVAL_SEEN)
2848         && !PL_reg_state.re_state_eval_setup_done)
2849     {
2850         MAGIC *mg;
2851
2852         PL_reg_state.re_state_eval_setup_done = TRUE;
2853         if (reginfo->sv) {
2854             /* Make $_ available to executed code. */
2855             if (reginfo->sv != DEFSV) {
2856                 SAVE_DEFSV;
2857                 DEFSV_set(reginfo->sv);
2858             }
2859
2860             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2861                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2862                 /* prepare for quick setting of pos */
2863 #ifdef PERL_OLD_COPY_ON_WRITE
2864                 if (SvIsCOW(reginfo->sv))
2865                     sv_force_normal_flags(reginfo->sv, 0);
2866 #endif
2867                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2868                                  &PL_vtbl_mglob, NULL, 0);
2869                 mg->mg_len = -1;
2870             }
2871             PL_reg_magic    = mg;
2872             PL_reg_oldpos   = mg->mg_len;
2873             SAVEDESTRUCTOR_X(restore_pos, prog);
2874         }
2875         if (!PL_reg_curpm) {
2876             Newxz(PL_reg_curpm, 1, PMOP);
2877 #ifdef USE_ITHREADS
2878             {
2879                 SV* const repointer = &PL_sv_undef;
2880                 /* this regexp is also owned by the new PL_reg_curpm, which
2881                    will try to free it.  */
2882                 av_push(PL_regex_padav, repointer);
2883                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2884                 PL_regex_pad = AvARRAY(PL_regex_padav);
2885             }
2886 #endif
2887         }
2888         SET_reg_curpm(rx);
2889         PL_reg_oldcurpm = PL_curpm;
2890         PL_curpm = PL_reg_curpm;
2891         if (RXp_MATCH_COPIED(prog)) {
2892             /*  Here is a serious problem: we cannot rewrite subbeg,
2893                 since it may be needed if this match fails.  Thus
2894                 $` inside (?{}) could fail... */
2895             PL_reg_oldsaved = prog->subbeg;
2896             PL_reg_oldsavedlen = prog->sublen;
2897             PL_reg_oldsavedoffset = prog->suboffset;
2898             PL_reg_oldsavedcoffset = prog->suboffset;
2899 #ifdef PERL_ANY_COW
2900             PL_nrs = prog->saved_copy;
2901 #endif
2902             RXp_MATCH_COPIED_off(prog);
2903         }
2904         else
2905             PL_reg_oldsaved = NULL;
2906         prog->subbeg = PL_bostr;
2907         prog->suboffset = 0;
2908         prog->subcoffset = 0;
2909         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2910     }
2911 #ifdef DEBUGGING
2912     PL_reg_starttry = *startposp;
2913 #endif
2914     prog->offs[0].start = *startposp - PL_bostr;
2915     prog->lastparen = 0;
2916     prog->lastcloseparen = 0;
2917
2918     /* XXXX What this code is doing here?!!!  There should be no need
2919        to do this again and again, prog->lastparen should take care of
2920        this!  --ilya*/
2921
2922     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2923      * Actually, the code in regcppop() (which Ilya may be meaning by
2924      * prog->lastparen), is not needed at all by the test suite
2925      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2926      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2927      * Meanwhile, this code *is* needed for the
2928      * above-mentioned test suite tests to succeed.  The common theme
2929      * on those tests seems to be returning null fields from matches.
2930      * --jhi updated by dapm */
2931 #if 1
2932     if (prog->nparens) {
2933         regexp_paren_pair *pp = prog->offs;
2934         I32 i;
2935         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2936             ++pp;
2937             pp->start = -1;
2938             pp->end = -1;
2939         }
2940     }
2941 #endif
2942     REGCP_SET(lastcp);
2943     result = regmatch(reginfo, *startposp, progi->program + 1);
2944     if (result != -1) {
2945         prog->offs[0].end = result;
2946         return 1;
2947     }
2948     if (reginfo->cutpoint)
2949         *startposp= reginfo->cutpoint;
2950     REGCP_UNWIND(lastcp);
2951     return 0;
2952 }
2953
2954
/* Shorthand for jumping to the 'yes', 'no' or 'no_silent' label; those
 * labels must exist in any function that uses these macros. */
#define sayYES goto yes
#define sayNO goto no
#define sayNO_SILENT goto no_silent

/* If ST.cache_mask is non-zero, OR it into the PL_reg_poscache entry at
 * ST.cache_offset; then fail via sayNO.
 *
 * We don't use STMT_START/END here because it leads to
 * "unreachable code" warnings, which are bogus, but distracting.
 * As a result this expands to two separate statements, so it must not be
 * used as the body of an unbraced if/else/loop. */
#define CACHEsayNO \
    if (ST.cache_mask) \
       PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
    sayNO
2965
/* this is used to determine how far from the left messages like
   'failed...' are printed. It should be set such that messages
   are inline with the regop output that created them.
*/
#define REPORT_CODE_OFF 32


/* Special negative values for the c1/c2 "next char" test variables. */
#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
#define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
/* Initial values given to c1 and c2 respectively in
 * S_setup_EXACTISH_ST_c1_c2(); presumably kept distinct so that the two
 * placeholders never compare equal to each other -- TODO confirm */
#define CHRTEST_NOT_A_CP_1 -999
#define CHRTEST_NOT_A_CP_2 -998

/* Address of the first / last state slot in the regmatch slab s */
#define SLAB_FIRST(s) (&(s)->states[0])
#define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2980
2981 /* grab a new slab and return the first slot in it */
2982
2983 STATIC regmatch_state *
2984 S_push_slab(pTHX)
2985 {
2986 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2987     dMY_CXT;
2988 #endif
2989     regmatch_slab *s = PL_regmatch_slab->next;
2990     if (!s) {
2991         Newx(s, 1, regmatch_slab);
2992         s->prev = PL_regmatch_slab;
2993         s->next = NULL;
2994         PL_regmatch_slab->next = s;
2995     }
2996     PL_regmatch_slab = s;
2997     return SLAB_FIRST(s);
2998 }
2999
3000
/* push a new state then goto it
 *
 *   state  resume value stored in st->resume_state
 *   node   assigned to 'scan'
 *   input  assigned to 'pushinput'
 *
 * Relies on the locals 'pushinput', 'scan' and 'st' and on a 'push_state'
 * label being present in the calling function.  Expands to several
 * statements (already terminated by ';'), so it must not be used as the
 * body of an unbraced if/else/loop. */

#define PUSH_STATE_GOTO(state, node, input) \
    pushinput = input; \
    scan = node; \
    st->resume_state = state; \
    goto push_state;

/* push a new state with success backtracking, then goto it
 *
 * Same arguments and caveats as PUSH_STATE_GOTO, but jumps to the
 * 'push_yes_state' label instead. */

#define PUSH_YES_STATE_GOTO(state, node, input) \
    pushinput = input; \
    scan = node; \
    st->resume_state = state; \
    goto push_yes_state;
3016
3017
3018
3019
3020 /*
3021
3022 regmatch() - main matching routine
3023
3024 This is basically one big switch statement in a loop. We execute an op,
set 'next' to point to the next op, and continue. If we come to a point which
3026 we may need to backtrack to on failure such as (A|B|C), we push a
3027 backtrack state onto the backtrack stack. On failure, we pop the top
3028 state, and re-enter the loop at the state indicated. If there are no more
3029 states to pop, we return failure.
3030
3031 Sometimes we also need to backtrack on success; for example /A+/, where
3032 after successfully matching one A, we need to go back and try to
3033 match another one; similarly for lookahead assertions: if the assertion
3034 completes successfully, we backtrack to the state just before the assertion
3035 and then carry on.  In these cases, the pushed state is marked as
3036 'backtrack on success too'. This marking is in fact done by a chain of
3037 pointers, each pointing to the previous 'yes' state. On success, we pop to
3038 the nearest yes state, discarding any intermediate failure-only states.
3039 Sometimes a yes state is pushed just to force some cleanup code to be
3040 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3041 it to free the inner regex.
3042
3043 Note that failure backtracking rewinds the cursor position, while
3044 success backtracking leaves it alone.
3045
3046 A pattern is complete when the END op is executed, while a subpattern
3047 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3048 ops trigger the "pop to last yes state if any, otherwise return true"
3049 behaviour.
3050
3051 A common convention in this function is to use A and B to refer to the two
3052 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3053 the subpattern to be matched possibly multiple times, while B is the entire
3054 rest of the pattern. Variable and state names reflect this convention.
3055
3056 The states in the main switch are the union of ops and failure/success of
substates associated with that op.  For example, IFMATCH is the op
3058 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3059 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3060 successfully matched A and IFMATCH_A_fail is a state saying that we have
3061 just failed to match A. Resume states always come in pairs. The backtrack
3062 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3063 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3064 on success or failure.
3065
3066 The struct that holds a backtracking state is actually a big union, with
3067 one variant for each major type of op. The variable st points to the
3068 top-most backtrack struct. To make the code clearer, within each
3069 block of code we #define ST to alias the relevant union.
3070
3071 Here's a concrete example of a (vastly oversimplified) IFMATCH
3072 implementation:
3073
3074     switch (state) {
3075     ....
3076
3077 #define ST st->u.ifmatch
3078
3079     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3080         ST.foo = ...; // some state we wish to save
3081         ...
3082         // push a yes backtrack state with a resume value of
3083         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3084         // first node of A:
3085         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3086         // NOTREACHED
3087
3088     case IFMATCH_A: // we have successfully executed A; now continue with B
3089         next = B;
3090         bar = ST.foo; // do something with the preserved value
3091         break;
3092
3093     case IFMATCH_A_fail: // A failed, so the assertion failed
3094         ...;   // do some housekeeping, then ...
3095         sayNO; // propagate the failure
3096
3097 #undef ST
3098
3099     ...
3100     }
3101
3102 For any old-timers reading this who are familiar with the old recursive
3103 approach, the code above is equivalent to:
3104
3105     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3106     {
3107         int foo = ...
3108         ...
3109         if (regmatch(A)) {
3110             next = B;
3111             bar = foo;
3112             break;
3113         }
3114         ...;   // do some housekeeping, then ...
3115         sayNO; // propagate the failure
3116     }
3117
3118 The topmost backtrack state, pointed to by st, is usually free. If you
3119 want to claim it, populate any ST.foo fields in it with values you wish to
3120 save, then do one of
3121
3122         PUSH_STATE_GOTO(resume_state, node, newinput);
3123         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3124
3125 which sets that backtrack state's resume value to 'resume_state', pushes a
3126 new free entry to the top of the backtrack stack, then goes to 'node'.
3127 On backtracking, the free slot is popped, and the saved state becomes the
3128 new free state. An ST.foo field in this new top state can be temporarily
3129 accessed to retrieve values, but once the main loop is re-entered, it
3130 becomes available for reuse.
3131
3132 Note that the depth of the backtrack stack constantly increases during the
3133 left-to-right execution of the pattern, rather than going up and down with
3134 the pattern nesting. For example the stack is at its maximum at Z at the
3135 end of the pattern, rather than at X in the following:
3136
3137     /(((X)+)+)+....(Y)+....Z/
3138
3139 The only exceptions to this are lookahead/behind assertions and the cut,
3140 (?>A), which pop all the backtrack states associated with A before
3141 continuing.
3142
3143 Backtrack state structs are allocated in slabs of about 4K in size.
3144 PL_regmatch_state and st always point to the currently active state,
3145 and PL_regmatch_slab points to the slab currently containing
3146 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3147 allocated, and is never freed until interpreter destruction. When the slab
3148 is full, a new one is allocated and chained to the end. At exit from
3149 regmatch(), slabs allocated since entry are freed.
3150
3151 */
3152
3153
/* Under DEBUG_STATE_r, print one line describing the backtrack state 'st':
 * the current position (via DUMP_EXEC_POS), an indent of depth*2 columns,
 * the string literal pp, the PL_reg_name[] entry for st->resume_state, and
 * a bracketed "Y" and/or "M" tag when st is the current yes_state and/or
 * mark_state.
 *
 * pp must be a string literal, since it is pasted into the format string.
 * Relies on the locals locinput, scan, utf8_target, depth, st, yes_state
 * and mark_state being in scope.  The expansion already ends in ';'. */
#define DEBUG_STATE_pp(pp)                                  \
    DEBUG_STATE_r({                                         \
        DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
        PerlIO_printf(Perl_debug_log,                       \
            "    %*s"pp" %s%s%s%s%s\n",                     \
            depth*2, "",                                    \
            PL_reg_name[st->resume_state],                     \
            ((st==yes_state||st==mark_state) ? "[" : ""),   \
            ((st==yes_state) ? "Y" : ""),                   \
            ((st==mark_state) ? "M" : ""),                  \
            ((st==yes_state||st==mark_state) ? "]" : "")    \
        );                                                  \
    });
3167
3168
/* Offset of node x from 'prog' as an int, or -1 if x is NULL.  Uses whatever
 * 'prog' is in scope at the point of use (presumably the start of the
 * compiled program's regnode array -- confirm at each call site). */
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3170
3171 #ifdef DEBUGGING
3172
3173 STATIC void
3174 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3175     const char *start, const char *end, const char *blurb)
3176 {
3177     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3178
3179     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3180
3181     if (!PL_colorset)
3182             reginitcolors();
3183     {
3184         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3185             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3186
3187         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3188             start, end - start, 60);
3189
3190         PerlIO_printf(Perl_debug_log,
3191             "%s%s REx%s %s against %s\n",
3192                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3193
3194         if (utf8_target||utf8_pat)
3195             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3196                 utf8_pat ? "pattern" : "",
3197                 utf8_pat && utf8_target ? " and " : "",
3198                 utf8_target ? "string" : ""
3199             );
3200     }
3201 }
3202
/* Debugging aid: print, to Perl_debug_log, the byte offset of locinput
 * within the target string followed by a short excerpt of the string
 * around that position, ending with a '|' separator (no newline).
 *
 *   locinput          current position in the target string
 *   scan              current regnode; only its op type is examined
 *   loc_regeol        end of the target string
 *   loc_bostr         beginning of the target string
 *   loc_reg_starttry  position at which the current attempt started
 *   utf8_target       whether the target string is UTF-8
 *
 * NOTE(review): the two UTF8_IS_CONTINUATION loops carry no explicit bound
 * against loc_bostr or against l dropping below zero; they appear to rely
 * on the target being well-formed UTF-8 -- confirm.
 */
STATIC void
S_dump_exec_pos(pTHX_ const char *locinput,
                      const regnode *scan,
                      const char *loc_regeol,
                      const char *loc_bostr,
                      const char *loc_reg_starttry,
                      const bool utf8_target)
{
    /* Colouring is on if any of these colour strings is non-empty */
    const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
    const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
    /* l: bytes available after locinput, capped at taill */
    int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
    /* The part of the string before starttry has one color
       (pref0_len chars), between starttry and current
       position another one (pref_len - pref0_len chars),
       after the current position the third one.
       We assume that pref0_len <= pref_len, otherwise we
       decrease pref0_len.  */
    /* pref_len: bytes before locinput, capped so that l + pref_len does
     * not exceed 5 + taill */
    int pref_len = (locinput - loc_bostr) > (5 + taill) - l
        ? (5 + taill) - l : locinput - loc_bostr;
    int pref0_len;

    PERL_ARGS_ASSERT_DUMP_EXEC_POS;

    /* Grow the prefix backwards while its first byte is a UTF-8
     * continuation byte, so that it starts on a character boundary */
    while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
        pref_len++;
    /* Portion of the prefix lying before the start of this attempt; may be
     * out of range here and is clamped further down */
    pref0_len = pref_len  - (locinput - loc_reg_starttry);
    /* If the prefix left the budget unused and more of the string follows,
     * let l take up the slack */
    if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
        l = ( loc_regeol - locinput > (5 + taill) - pref_len
              ? (5 + taill) - pref_len : loc_regeol - locinput);
    /* Shrink l while the byte at locinput + l is a continuation byte */
    while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
        l--;
    /* Clamp pref0_len into 0..pref_len */
    if (pref0_len < 0)
        pref0_len = 0;
    if (pref0_len > pref_len)
        pref0_len = pref_len;
    {
        /* The excerpt is treated as UTF-8 unless the current op is CANY */
        const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;

        /* s0: prefix before the start of the attempt (colours 4, 5) */
        RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
            (locinput - pref_len),pref0_len, 60, 4, 5);

        /* s1: from the start of the attempt up to locinput (colours 2, 3) */
        RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
                    (locinput - pref_len + pref0_len),
                    pref_len - pref0_len, 60, 2, 3);

        /* s2: from locinput to the end of the string (colours 0, 1) */
        RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
                    locinput, loc_regeol - locinput, 10, 0, 1);

        const STRLEN tlen=len0+len1+len2;
        /* Without colours a literal "> <" marks the current position; the
         * trailing %*s pads the three segments out to 19 columns */
        PerlIO_printf(Perl_debug_log,
                    "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
                    (IV)(locinput - loc_bostr),
                    len0, s0,
                    len1, s1,
                    (docolor ? "" : "> <"),
                    len2, s2,
                    (int)(tlen > 19 ? 0 :  19 - tlen),
                    "");
    }
}
3263
3264 #endif
3265
3266 /* reg_check_named_buff_matched()
3267  * Checks to see if a named buffer has matched. The data array of
3268  * buffer numbers corresponding to the buffer is expected to reside
3269  * in the regexp->data->data array in the slot stored in the ARG() of
3270  * node involved. Note that this routine doesn't actually care about the
3271  * name, that information is not preserved from compilation to execution.
3272  * Returns the index of the leftmost defined buffer with the given name
3273  * or 0 if non of the buffers matched.
3274  */
3275 STATIC I32
3276 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3277 {
3278     I32 n;
3279     RXi_GET_DECL(rex,rexi);
3280     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3281     I32 *nums=(I32*)SvPVX(sv_dat);
3282
3283     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3284
3285     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3286         if ((I32)rex->lastparen >= nums[n] &&
3287             rex->offs[nums[n]].end != -1)
3288         {
3289             return nums[n];
3290         }
3291     }
3292     return 0;
3293 }
3294
3295
3296 /* free all slabs above current one  - called during LEAVE_SCOPE */
3297
3298 STATIC void
3299 S_clear_backtrack_stack(pTHX_ void *p)
3300 {
3301     regmatch_slab *s = PL_regmatch_slab->next;
3302     PERL_UNUSED_ARG(p);
3303
3304     if (!s)
3305         return;
3306     PL_regmatch_slab->next = NULL;
3307     while (s) {
3308         regmatch_slab * const osl = s;
3309         s = s->next;
3310         Safefree(osl);
3311     }
3312 }
3313 static bool
3314 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3315 {
3316     /* This function determines if there are one or two characters that match
3317      * the first character of the passed-in EXACTish node <text_node>, and if
3318      * so, returns them in the passed-in pointers.
3319      *
3320      * If it determines that no possible character in the target string can
3321      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3322      * the first character in <text_node> requires UTF-8 to represent, and the
3323      * target string isn't in UTF-8.)
3324      *
3325      * If there are more than two characters that could match the beginning of
3326      * <text_node>, or if more context is required to determine a match or not,
3327      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3328      *
     * The motivation behind this function is to allow the caller to set up
3330      * tight loops for matching.  If <text_node> is of type EXACT, there is
3331      * only one possible character that can match its first character, and so
3332      * the situation is quite simple.  But things get much more complicated if
3333      * folding is involved.  It may be that the first character of an EXACTFish
3334      * node doesn't participate in any possible fold, e.g., punctuation, so it
3335      * can be matched only by itself.  The vast majority of characters that are
3336      * in folds match just two things, their lower and upper-case equivalents.
3337      * But not all are like that; some have multiple possible matches, or match
3338      * sequences of more than one character.  This function sorts all that out.
3339      *
3340      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3341      * loop of trying to match A*, we know we can't exit where the thing
3342      * following it isn't a B.  And something can't be a B unless it is the
3343      * beginning of B.  By putting a quick test for that beginning in a tight
3344      * loop, we can rule out things that can't possibly be B without having to
3345      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3346      * character, we can make a tight loop matching A*, using the outputs of
3347      * this function.
3348      *
3349      * If the target string to match isn't in UTF-8, and there aren't
3350      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3351      * the one or two possible octets (which are characters in this situation)
3352      * that can match.  In all cases, if there is only one character that can
3353      * match, *<c1p> and *<c2p> will be identical.
3354      *
3355      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3356      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3357      * can match the beginning of <text_node>.  They should be declared with at
3358      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3359      * undefined what these contain.)  If one or both of the buffers are
3360      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3361      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3362      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3363      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3364      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3365
3366     const bool utf8_target = PL_reg_match_utf8;
3367
3368     UV c1 = CHRTEST_NOT_A_CP_1;
3369     UV c2 = CHRTEST_NOT_A_CP_2;
3370     bool use_chrtest_void = FALSE;
3371
3372     /* Used when we have both utf8 input and utf8 output, to avoid converting
3373      * to/from code points */
3374     bool utf8_has_been_setup = FALSE;
3375
3376     dVAR;
3377
3378     U8 *pat = (U8*)STRING(text_node);
3379
3380     if (OP(text_node) == EXACT) {
3381
3382         /* In an exact node, only one thing can be matched, that first
3383          * character.  If both the pat and the target are UTF-8, we can just
3384          * copy the input to the output, avoiding finding the code point of
3385          * that character */
3386         if (! UTF_PATTERN) {
3387             c2 = c1 = *pat;
3388         }
3389         else if (utf8_target) {
3390             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3391             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3392             utf8_has_been_setup = TRUE;
3393         }
3394         else {
3395             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3396         }
3397     }
3398     else /* an EXACTFish node */
3399          if ((UTF_PATTERN
3400                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3401                                                     pat + STR_LEN(text_node)))
3402              || (! UTF_PATTERN
3403                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3404                                                     pat + STR_LEN(text_node))))
3405     {
3406         /* Multi-character folds require more context to sort out.  Also
3407          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3408          * handled outside this routine */
3409         use_chrtest_void = TRUE;
3410     }
3411     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3412         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3413         if (c1 > 256) {
3414             /* Load the folds hash, if not already done */
3415             SV** listp;
3416             if (! PL_utf8_foldclosures) {
3417                 if (! PL_utf8_tofold) {
3418                     U8 dummy[UTF8_MAXBYTES+1];
3419
3420                     /* Force loading this by folding an above-Latin1 char */
3421                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3422                     assert(PL_utf8_tofold); /* Verify that worked */
3423                 }
3424                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3425             }
3426
3427             /* The fold closures data structure is a hash with the keys being
3428              * the UTF-8 of every character that is folded to, like 'k', and
3429              * the values each an array of all code points that fold to its
3430              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3431              * not included */
3432             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3433                                      (char *) pat,
3434                                      UTF8SKIP(pat),
3435                                      FALSE))))
3436             {
3437                 /* Not found in the hash, therefore there are no folds
3438                  * containing it, so there is only a single character that
3439                  * could match */
3440                 c2 = c1;
3441             }
3442             else {  /* Does participate in folds */
3443                 AV* list = (AV*) *listp;
3444                 if (av_len(list) != 1) {
3445
3446                     /* If there aren't exactly two folds to this, it is outside
3447                      * the scope of this function */
3448                     use_chrtest_void = TRUE;
3449                 }
3450                 else {  /* There are two.  Get them */
3451                     SV** c_p = av_fetch(list, 0, FALSE);
3452                     if (c_p == NULL) {
3453                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3454                     }
3455                     c1 = SvUV(*c_p);
3456
3457                     c_p = av_fetch(list, 1, FALSE);
3458                     if (c_p == NULL) {
3459                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3460                     }
3461                     c2 = SvUV(*c_p);
3462
3463                     /* Folds that cross the 255/256 boundary are forbidden if
3464                      * EXACTFL, or EXACTFA and one is ASCII.  Since the
3465                      * pattern character is above 256, and its only other match
3466                      * is below 256, the only legal match will be to itself.
3467                      * We have thrown away the original, so have to compute
3468                      * which is the one above 255 */
3469                     if ((c1 < 256) != (c2 < 256)) {
3470                         if (OP(text_node) == EXACTFL
3471                             || (OP(text_node) == EXACTFA
3472                                 && (isASCII(c1) || isASCII(c2))))
3473                         {
3474                             if (c1 < 256) {
3475                                 c1 = c2;
3476                             }
3477                             else {
3478                                 c2 = c1;
3479                             }
3480                         }
3481                     }
3482                 }
3483             }
3484         }
3485         else /* Here, c1 is < 255 */
3486              if (utf8_target
3487                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3488                  && OP(text_node) != EXACTFL
3489                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3490         {
3491             /* Here, there could be something above Latin1 in the target which
3492              * folds to this character in the pattern.  All such cases except
3493              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3494              * involved in their folds, so are outside the scope of this
3495              * function */
3496             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3497                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3498             }
3499             else {
3500                 use_chrtest_void = TRUE;
3501             }
3502         }
3503         else { /* Here nothing above Latin1 can fold to the pattern character */
3504             switch (OP(text_node)) {
3505
3506                 case EXACTFL:   /* /l rules */
3507                     c2 = PL_fold_locale[c1];
3508                     break;
3509
3510                 case EXACTF:
3511                     if (! utf8_target) {    /* /d rules */
3512                         c2 = PL_fold[c1];
3513                         break;
3514                     }
3515                     /* FALLTHROUGH */
3516                     /* /u rules for all these.  This happens to work for
3517                      * EXACTFA as nothing in Latin1 folds to ASCII */
3518                 case EXACTFA:
3519                 case EXACTFU_TRICKYFOLD:
3520                 case EXACTFU_SS:
3521                 case EXACTFU:
3522                     c2 = PL_fold_latin1[c1];
3523                     break;
3524
3525                 default:
3526                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3527                     assert(0); /* NOTREACHED */
3528             }
3529         }
3530     }
3531
3532     /* Here have figured things out.  Set up the returns */
3533     if (use_chrtest_void) {
3534         *c2p = *c1p = CHRTEST_VOID;
3535     }
3536     else if (utf8_target) {
3537         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3538             uvchr_to_utf8(c1_utf8, c1);
3539             uvchr_to_utf8(c2_utf8, c2);
3540         }
3541
3542         /* Invariants are stored in both the utf8 and byte outputs; Use
3543          * negative numbers otherwise for the byte ones.  Make sure that the
3544          * byte ones are the same iff the utf8 ones are the same */
3545         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3546         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3547                 ? *c2_utf8
3548                 : (c1 == c2)
3549                   ? CHRTEST_NOT_A_CP_1
3550                   : CHRTEST_NOT_A_CP_2;
3551     }
3552     else if (c1 > 255) {
3553        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3554                            can represent */
3555            return FALSE;
3556        }
3557
3558        *c1p = *c2p = c2;    /* c2 is the only representable value */
3559     }
3560     else {  /* c1 is representable; see about c2 */
3561        *c1p = c1;
3562        *c2p = (c2 < 256) ? c2 : c1;
3563     }
3564
3565     return TRUE;
3566 }
3567
3568 /* returns -1 on failure, $+[0] on success */
3569 STATIC I32
3570 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3571 {
3572 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3573     dMY_CXT;
3574 #endif
3575     dVAR;
3576     const bool utf8_target = PL_reg_match_utf8;
3577     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3578     REGEXP *rex_sv = reginfo->prog;
3579     regexp *rex = ReANY(rex_sv);
3580     RXi_GET_DECL(rex,rexi);
3581     I32 oldsave;
3582     /* the current state. This is a cached copy of PL_regmatch_state */
3583     regmatch_state *st;
3584     /* cache heavily used fields of st in registers */
3585     regnode *scan;
3586     regnode *next;
3587     U32 n = 0;  /* general value; init to avoid compiler warning */
3588     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3589     char *locinput = startpos;
3590     char *pushinput; /* where to continue after a PUSH */
3591     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3592
3593     bool result = 0;        /* return value of S_regmatch */
3594     int depth = 0;          /* depth of backtrack stack */
3595     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3596     const U32 max_nochange_depth =
3597         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3598         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3599     regmatch_state *yes_state = NULL; /* state to pop to on success of
3600                                                             subpattern */
3601     /* mark_state piggy backs on the yes_state logic so that when we unwind
3602        the stack on success we can update the mark_state as we go */
3603     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3604     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3605     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3606     U32 state_num;
3607     bool no_final = 0;      /* prevent failure from backtracking? */
3608     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3609     char *startpoint = locinput;
3610     SV *popmark = NULL;     /* are we looking for a mark? */
3611     SV *sv_commit = NULL;   /* last mark name seen in failure */
3612     SV *sv_yes_mark = NULL; /* last mark name we have seen
3613                                during a successful match */
3614     U32 lastopen = 0;       /* last open we saw */
3615     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3616     SV* const oreplsv = GvSV(PL_replgv);
3617     /* these three flags are set by various ops to signal information to
3618      * the very next op. They have a useful lifetime of exactly one loop
3619      * iteration, and are not preserved or restored by state pushes/pops
3620      */
3621     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3622     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3623     int logical = 0;        /* the following EVAL is:
3624                                 0: (?{...})
3625                                 1: (?(?{...})X|Y)
3626                                 2: (??{...})
3627                                or the following IFMATCH/UNLESSM is:
3628                                 false: plain (?=foo)
3629                                 true:  used as a condition: (?(?=foo))
3630                             */
3631     PAD* last_pad = NULL;
3632     dMULTICALL;
3633     I32 gimme = G_SCALAR;
3634     CV *caller_cv = NULL;       /* who called us */
3635     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3636     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3637     U32 maxopenparen = 0;       /* max '(' index seen so far */
3638
3639 #ifdef DEBUGGING
3640     GET_RE_DEBUG_FLAGS_DECL;
3641 #endif
3642
3643     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3644     multicall_oldcatch = 0;
3645     multicall_cv = NULL;
3646     cx = NULL;
3647     PERL_UNUSED_VAR(multicall_cop);
3648     PERL_UNUSED_VAR(newsp);
3649
3650
3651     PERL_ARGS_ASSERT_REGMATCH;
3652
3653     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3654             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3655     }));
3656     /* on first ever call to regmatch, allocate first slab */
3657     if (!PL_regmatch_slab) {
3658         Newx(PL_regmatch_slab, 1, regmatch_slab);
3659         PL_regmatch_slab->prev = NULL;
3660         PL_regmatch_slab->next = NULL;
3661         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3662     }
3663
3664     oldsave = PL_savestack_ix;
3665     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3666     SAVEVPTR(PL_regmatch_slab);
3667     SAVEVPTR(PL_regmatch_state);
3668
3669     /* grab next free state slot */
3670     st = ++PL_regmatch_state;
3671     if (st >  SLAB_LAST(PL_regmatch_slab))
3672         st = PL_regmatch_state = S_push_slab(aTHX);
3673
3674     /* Note that nextchr is a byte even in UTF */
3675     SET_nextchr;
3676     scan = prog;
3677     while (scan != NULL) {
3678
3679         DEBUG_EXECUTE_r( {
3680             SV * const prop = sv_newmortal();
3681             regnode *rnext=regnext(scan);
3682             DUMP_EXEC_POS( locinput, scan, utf8_target );
3683             regprop(rex, prop, scan);
3684
3685             PerlIO_printf(Perl_debug_log,
3686                     "%3"IVdf":%*s%s(%"IVdf")\n",
3687                     (IV)(scan - rexi->program), depth*2, "",
3688                     SvPVX_const(prop),
3689                     (PL_regkind[OP(scan)] == END || !rnext) ?
3690                         0 : (IV)(rnext - rexi->program));
3691         });
3692
3693         next = scan + NEXT_OFF(scan);
3694         if (next == scan)
3695             next = NULL;
3696         state_num = OP(scan);
3697
3698       reenter_switch:
3699
3700         SET_nextchr;
3701         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3702
3703         switch (state_num) {
3704         case BOL: /*  /^../  */
3705             if (locinput == PL_bostr)
3706             {
3707                 /* reginfo->till = reginfo->bol; */
3708                 break;
3709             }
3710             sayNO;
3711
3712         case MBOL: /*  /^../m  */
3713             if (locinput == PL_bostr ||
3714                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3715             {
3716                 break;
3717             }
3718             sayNO;
3719
3720         case SBOL: /*  /^../s  */
3721             if (locinput == PL_bostr)
3722                 break;
3723             sayNO;
3724
3725         case GPOS: /*  \G  */
3726             if (locinput == reginfo->ganch)
3727                 break;
3728             sayNO;
3729
3730         case KEEPS: /*   \K  */
3731             /* update the startpoint */
3732             st->u.keeper.val = rex->offs[0].start;
3733             rex->offs[0].start = locinput - PL_bostr;
3734             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3735             assert(0); /*NOTREACHED*/
3736         case KEEPS_next_fail:
3737             /* rollback the start point change */
3738             rex->offs[0].start = st->u.keeper.val;
3739             sayNO_SILENT;
3740             assert(0); /*NOTREACHED*/
3741
3742         case EOL: /* /..$/  */
3743                 goto seol;
3744
3745         case MEOL: /* /..$/m  */
3746             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3747                 sayNO;
3748             break;
3749
3750         case SEOL: /* /..$/s  */
3751           seol:
3752             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3753                 sayNO;
3754             if (PL_regeol - locinput > 1)
3755                 sayNO;
3756             break;
3757
3758         case EOS: /*  \z  */
3759             if (!NEXTCHR_IS_EOS)
3760                 sayNO;
3761             break;
3762
3763         case SANY: /*  /./s  */
3764             if (NEXTCHR_IS_EOS)
3765                 sayNO;
3766             goto increment_locinput;
3767
3768         case CANY: /*  \C  */
3769             if (NEXTCHR_IS_EOS)
3770                 sayNO;
3771             locinput++;
3772             break;
3773
3774         case REG_ANY: /*  /./  */
3775             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3776                 sayNO;
3777             goto increment_locinput;
3778
3779
3780 #undef  ST
3781 #define ST st->u.trie
3782         case TRIEC: /* (ab|cd) with known charclass */
3783             /* In this case the charclass data is available inline so
3784                we can fail fast without a lot of extra overhead.
3785              */
3786             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3787                 DEBUG_EXECUTE_r(
3788                     PerlIO_printf(Perl_debug_log,
3789                               "%*s  %sfailed to match trie start class...%s\n",
3790                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3791                 );
3792                 sayNO_SILENT;
3793                 assert(0); /* NOTREACHED */
3794             }
3795             /* FALL THROUGH */
3796         case TRIE:  /* (ab|cd)  */
3797             /* the basic plan of execution of the trie is:
3798              * At the beginning, run through all the states, and
3799              * find the longest-matching word. Also remember the position
3800              * of the shortest matching word. For example, this pattern:
3801              *    1  2 3 4    5
3802              *    ab|a|x|abcd|abc
3803              * when matched against the string "abcde", will generate
3804              * accept states for all words except 3, with the longest
3805              * matching word being 4, and the shortest being 2 (with
3806              * the position being after char 1 of the string).
3807              *
3808              * Then for each matching word, in word order (i.e. 1,2,4,5),
3809              * we run the remainder of the pattern; on each try setting
3810              * the current position to the character following the word,
3811              * returning to try the next word on failure.
3812              *
3813              * We avoid having to build a list of words at runtime by
3814              * using a compile-time structure, wordinfo[].prev, which
3815              * gives, for each word, the previous accepting word (if any).
3816              * In the case above it would contain the mappings 1->2, 2->0,
3817              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3818              * the longest word (4 above), a list of all words, by
3819              * following the list of prev pointers; this gives us the
3820              * unordered list 4,5,1,2. Then given the current word we have
3821              * just tried, we can go through the list and find the
3822              * next-biggest word to try (so if we just failed on word 2,
3823              * the next in the list is 4).
3824              *
3825              * Since at runtime we don't record the matching position in
3826              * the string for each word, we have to work that out for
3827              * each word we're about to process. The wordinfo table holds
3828              * the character length of each word; given that we recorded
3829              * at the start: the position of the shortest word and its
3830              * length in chars, we just need to move the pointer the
3831              * difference between the two char lengths. Depending on
3832              * Unicode status and folding, that's cheap or expensive.
3833              *
3834              * This algorithm is optimised for the case where there are only
3835              * a small number of accept states, i.e. 0,1, or maybe 2.
3836              * With lots of accept states, and having to try all of them,
3837              * it becomes quadratic in the number of accept states to find
3838              * all the next words.
3839              */
3840
3841             {
3842                 /* what type of TRIE am I? (utf8 makes this contextual) */
3843                 DECL_TRIE_TYPE(scan);
3844
3845                 /* what trie are we using right now */
3846                 reg_trie_data * const trie
3847                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3848                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3849                 U32 state = trie->startstate;
3850
3851                 if (   trie->bitmap
3852                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3853                 {
3854                     if (trie->states[ state ].wordnum) {
3855                          DEBUG_EXECUTE_r(
3856                             PerlIO_printf(Perl_debug_log,
3857                                           "%*s  %smatched empty string...%s\n",
3858                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3859                         );
3860                         if (!trie->jump)
3861                             break;
3862                     } else {
3863                         DEBUG_EXECUTE_r(
3864                             PerlIO_printf(Perl_debug_log,
3865                                           "%*s  %sfailed to match trie start class...%s\n",
3866                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3867                         );
3868                         sayNO_SILENT;
3869                    }
3870                 }
3871
3872             {
3873                 U8 *uc = ( U8* )locinput;
3874
3875                 STRLEN len = 0;
3876                 STRLEN foldlen = 0;
3877                 U8 *uscan = (U8*)NULL;
3878                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3879                 U32 charcount = 0; /* how many input chars we have matched */
3880                 U32 accepted = 0; /* have we seen any accepting states? */
3881
3882                 ST.jump = trie->jump;
3883                 ST.me = scan;
3884                 ST.firstpos = NULL;
3885                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3886                 ST.nextword = 0;
3887
3888                 /* fully traverse the TRIE; note the position of the
3889                    shortest accept state and the wordnum of the longest
3890                    accept state */
3891
3892                 while ( state && uc <= (U8*)PL_regeol ) {
3893                     U32 base = trie->states[ state ].trans.base;
3894                     UV uvc = 0;
3895                     U16 charid = 0;
3896                     U16 wordnum;
3897                     wordnum = trie->states[ state ].wordnum;
3898
3899                     if (wordnum) { /* it's an accept state */
3900                         if (!accepted) {
3901                             accepted = 1;
3902                             /* record first match position */
3903                             if (ST.longfold) {
3904                                 ST.firstpos = (U8*)locinput;
3905                                 ST.firstchars = 0;
3906                             }
3907                             else {
3908                                 ST.firstpos = uc;
3909                                 ST.firstchars = charcount;
3910                             }
3911                         }
3912                         if (!ST.nextword || wordnum < ST.nextword)
3913                             ST.nextword = wordnum;
3914                         ST.topword = wordnum;
3915                     }
3916
3917                     DEBUG_TRIE_EXECUTE_r({
3918                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3919                                 PerlIO_printf( Perl_debug_log,
3920                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3921                                     2+depth * 2, "", PL_colors[4],
3922                                     (UV)state, (accepted ? 'Y' : 'N'));
3923                     });
3924
3925                     /* read a char and goto next state */
3926                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3927                         I32 offset;
3928                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3929                                              uscan, len, uvc, charid, foldlen,
3930                                              foldbuf, uniflags);
3931                         charcount++;
3932                         if (foldlen>0)
3933                             ST.longfold = TRUE;
3934                         if (charid &&
3935                              ( ((offset =
3936                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3937
3938                              && ((U32)offset < trie->lasttrans)
3939                              && trie->trans[offset].check == state)
3940                         {
3941                             state = trie->trans[offset].next;
3942                         }
3943                         else {
3944                             state = 0;
3945                         }
3946                         uc += len;
3947
3948                     }
3949                     else {
3950                         state = 0;
3951                     }
3952                     DEBUG_TRIE_EXECUTE_r(
3953                         PerlIO_printf( Perl_debug_log,
3954                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3955                             charid, uvc, (UV)state, PL_colors[5] );
3956                     );
3957                 }
3958                 if (!accepted)
3959                    sayNO;
3960
3961                 /* calculate total number of accept states */
3962                 {
3963                     U16 w = ST.topword;
3964                     accepted = 0;
3965                     while (w) {
3966                         w = trie->wordinfo[w].prev;
3967                         accepted++;
3968                     }
3969                     ST.accepted = accepted;
3970                 }
3971
3972                 DEBUG_EXECUTE_r(
3973                     PerlIO_printf( Perl_debug_log,
3974                         "%*s  %sgot %"IVdf" possible matches%s\n",
3975                         REPORT_CODE_OFF + depth * 2, "",
3976                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3977                 );
3978                 goto trie_first_try; /* jump into the fail handler */
3979             }}
3980             assert(0); /* NOTREACHED */
3981
3982         case TRIE_next_fail: /* we failed - try next alternative */
3983         {
3984             U8 *uc;
3985             if ( ST.jump) {
3986                 REGCP_UNWIND(ST.cp);
3987                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3988             }
3989             if (!--ST.accepted) {
3990                 DEBUG_EXECUTE_r({
3991                     PerlIO_printf( Perl_debug_log,
3992                         "%*s  %sTRIE failed...%s\n",
3993                         REPORT_CODE_OFF+depth*2, "",
3994                         PL_colors[4],
3995                         PL_colors[5] );
3996                 });
3997                 sayNO_SILENT;
3998             }
3999             {
4000                 /* Find next-highest word to process.  Note that this code
4001                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4002                 U16 min = 0;
4003                 U16 word;
4004                 U16 const nextword = ST.nextword;
4005                 reg_trie_wordinfo * const wordinfo
4006                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4007                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4008                     if (word > nextword && (!min || word < min))
4009                         min = word;
4010                 }
4011                 ST.nextword = min;
4012             }
4013
4014           trie_first_try:
4015             if (do_cutgroup) {
4016                 do_cutgroup = 0;
4017                 no_final = 0;
4018             }
4019
4020             if ( ST.jump) {
4021                 ST.lastparen = rex->lastparen;
4022                 ST.lastcloseparen = rex->lastcloseparen;
4023                 REGCP_SET(ST.cp);
4024             }
4025
4026             /* find start char of end of current word */
4027             {
4028                 U32 chars; /* how many chars to skip */
4029                 reg_trie_data * const trie
4030                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4031
4032                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4033                             >=  ST.firstchars);
4034                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4035                             - ST.firstchars;
4036                 uc = ST.firstpos;
4037
4038                 if (ST.longfold) {
4039                     /* the hard option - fold each char in turn and find
4040                      * its folded length (which may be different) */
4041                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4042                     STRLEN foldlen;
4043                     STRLEN len;
4044                     UV uvc;
4045                     U8 *uscan;
4046
4047                     while (chars) {
4048                         if (utf8_target) {
4049                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4050                                                     uniflags);
4051                             uc += len;
4052                         }
4053                         else {
4054                             uvc = *uc;
4055                             uc++;
4056                         }
4057                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4058                         uscan = foldbuf;
4059                         while (foldlen) {
4060                             if (!--chars)
4061                                 break;
4062                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4063                                             uniflags);
4064                             uscan += len;
4065                             foldlen -= len;
4066                         }
4067                     }
4068                 }
4069                 else {
4070                     if (utf8_target)
4071                         while (chars--)
4072                             uc += UTF8SKIP(uc);
4073                     else
4074                         uc += chars;
4075                 }
4076             }
4077
4078             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4079                             ? ST.jump[ST.nextword]
4080                             : NEXT_OFF(ST.me));
4081
4082             DEBUG_EXECUTE_r({
4083                 PerlIO_printf( Perl_debug_log,
4084                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4085                     REPORT_CODE_OFF+depth*2, "",
4086                     PL_colors[4],
4087                     ST.nextword,
4088                     PL_colors[5]
4089                     );
4090             });
4091
4092             if (ST.accepted > 1 || has_cutgroup) {
4093                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4094                 assert(0); /* NOTREACHED */
4095             }
4096             /* only one choice left - just continue */
4097             DEBUG_EXECUTE_r({
4098                 AV *const trie_words
4099                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4100                 SV ** const tmp = av_fetch( trie_words,
4101                     ST.nextword-1, 0 );
4102                 SV *sv= tmp ? sv_newmortal() : NULL;
4103
4104                 PerlIO_printf( Perl_debug_log,
4105                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4106                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4107                     ST.nextword,
4108                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4109                             PL_colors[0], PL_colors[1],
4110                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4111                         )
4112                     : "not compiled under -Dr",
4113                     PL_colors[5] );
4114             });
4115
4116             locinput = (char*)uc;
4117             continue; /* execute rest of RE */
4118             assert(0); /* NOTREACHED */
4119         }
4120 #undef  ST
4121
4122         case EXACT: {            /*  /abc/        */
4123             char *s = STRING(scan);
4124             ln = STR_LEN(scan);
4125             if (utf8_target != UTF_PATTERN) {
4126                 /* The target and the pattern have differing utf8ness. */
4127                 char *l = locinput;
4128                 const char * const e = s + ln;
4129
4130                 if (utf8_target) {
4131                     /* The target is utf8, the pattern is not utf8.
4132                      * Above-Latin1 code points can't match the pattern;
4133                      * invariants match exactly, and the other Latin1 ones need
4134                      * to be downgraded to a single byte in order to do the
4135                      * comparison.  (If we could be confident that the target
4136                      * is not malformed, this could be refactored to have fewer
4137                      * tests by just assuming that if the first bytes match, it
4138                      * is an invariant, but there are tests in the test suite
4139                      * dealing with (??{...}) which violate this) */
4140                     while (s < e) {
4141                         if (l >= PL_regeol)
4142                              sayNO;
4143                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4144                             sayNO;
4145                         }
4146                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4147                             if (*l != *s) {
4148                                 sayNO;
4149                             }
4150                             l++;
4151                         }
4152                         else {
4153                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4154                                 sayNO;
4155                             }
4156                             l += 2;
4157                         }
4158                         s++;
4159                     }
4160                 }
4161                 else {
4162                     /* The target is not utf8, the pattern is utf8. */
4163                     while (s < e) {
4164                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4165                         {
4166                             sayNO;
4167                         }
4168                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4169                             if (*s != *l) {
4170                                 sayNO;
4171                             }
4172                             s++;
4173                         }
4174                         else {
4175                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4176                                 sayNO;
4177                             }
4178                             s += 2;
4179                         }
4180                         l++;
4181                     }
4182                 }
4183                 locinput = l;
4184                 break;
4185             }
4186             /* The target and the pattern have the same utf8ness. */
4187             /* Inline the first character, for speed. */
4188             if (UCHARAT(s) != nextchr)
4189                 sayNO;
4190             if (PL_regeol - locinput < ln)
4191                 sayNO;
4192             if (ln > 1 && memNE(s, locinput, ln))
4193                 sayNO;
4194             locinput += ln;
4195             break;
4196             }
4197
4198         case EXACTFL: {          /*  /abc/il      */
4199             re_fold_t folder;
4200             const U8 * fold_array;
4201             const char * s;
4202             U32 fold_utf8_flags;
4203
4204             PL_reg_flags |= RF_tainted;
4205             folder = foldEQ_locale;
4206             fold_array = PL_fold_locale;
4207             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4208             goto do_exactf;
4209
4210         case EXACTFU_SS:         /*  /\x{df}/iu   */
4211         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4212         case EXACTFU:            /*  /abc/iu      */
4213             folder = foldEQ_latin1;
4214             fold_array = PL_fold_latin1;
4215             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4216             goto do_exactf;
4217
4218         case EXACTFA:            /*  /abc/iaa     */
4219             folder = foldEQ_latin1;
4220             fold_array = PL_fold_latin1;
4221             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4222             goto do_exactf;
4223
4224         case EXACTF:             /*  /abc/i       */
4225             folder = foldEQ;
4226             fold_array = PL_fold;
4227             fold_utf8_flags = 0;
4228
4229           do_exactf:
4230             s = STRING(scan);
4231             ln = STR_LEN(scan);
4232
4233             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4234               /* Either the target or the pattern is utf8, or this node has the
4235                * issue where the fold lengths may differ (EXACTFU_SS). */
4236                 const char * const l = locinput;
4237                 char *e = PL_regeol;
4238
4239                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4240                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4241                 {
4242                     sayNO;
4243                 }
4244                 locinput = e;
4245                 break;
4246             }
4247
4248             /* Neither the target nor the pattern are utf8 */
4249             if (UCHARAT(s) != nextchr
4250                 && !NEXTCHR_IS_EOS
4251                 && UCHARAT(s) != fold_array[nextchr])
4252             {
4253                 sayNO;
4254             }
4255             if (PL_regeol - locinput < ln)
4256                 sayNO;
4257             if (ln > 1 && ! folder(s, locinput, ln))
4258                 sayNO;
4259             locinput += ln;
4260             break;
4261         }
4262
4263         /* XXX Could improve efficiency by separating these all out using a
4264          * macro or in-line function.  At that point regcomp.c would no longer
4265          * have to set the FLAGS fields of these */
4266         case BOUNDL:  /*  /\b/l  */
4267         case NBOUNDL: /*  /\B/l  */
4268             PL_reg_flags |= RF_tainted;
4269             /* FALL THROUGH */
4270         case BOUND:   /*  /\b/   */
4271         case BOUNDU:  /*  /\b/u  */
4272         case BOUNDA:  /*  /\b/a  */
4273         case NBOUND:  /*  /\B/   */
4274         case NBOUNDU: /*  /\B/u  */
4275         case NBOUNDA: /*  /\B/a  */
4276             /* was last char in word? */
4277             if (utf8_target
4278                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4279                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4280             {
4281                 if (locinput == PL_bostr)
4282                     ln = '\n';
4283                 else {
4284                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4285
4286                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4287                 }
4288                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4289                     ln = isALNUM_uni(ln);
4290                     if (NEXTCHR_IS_EOS)
4291                         n = 0;
4292                     else {
4293                         LOAD_UTF8_CHARCLASS_ALNUM();
4294                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4295                                                                 utf8_target);
4296                     }
4297                 }
4298                 else {
4299                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4300                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4301                 }
4302             }
4303             else {
4304
4305                 /* Here the string isn't utf8, or is utf8 and only ascii
4306                  * characters are to match \w.  In the latter case looking at
4307                  * the byte just prior to the current one may be just the final
4308                  * byte of a multi-byte character.  This is ok.  There are two
4309                  * cases:
4310                  * 1) it is a single byte character, and then the test is doing
4311                  *      just what it's supposed to.
4312                  * 2) it is a multi-byte character, in which case the final
4313                  *      byte is never mistakable for ASCII, and so the test
4314                  *      will say it is not a word character, which is the
4315                  *      correct answer. */
4316                 ln = (locinput != PL_bostr) ?
4317                     UCHARAT(locinput - 1) : '\n';
4318                 switch (FLAGS(scan)) {
4319                     case REGEX_UNICODE_CHARSET:
4320                         ln = isWORDCHAR_L1(ln);
4321                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4322                         break;
4323                     case REGEX_LOCALE_CHARSET:
4324                         ln = isALNUM_LC(ln);
4325                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4326                         break;
4327                     case REGEX_DEPENDS_CHARSET:
4328                         ln = isALNUM(ln);
4329                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4330                         break;
4331                     case REGEX_ASCII_RESTRICTED_CHARSET:
4332                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4333                         ln = isWORDCHAR_A(ln);
4334                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4335                         break;
4336                     default:
4337                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4338                         break;
4339                 }
4340             }
4341             /* Note: this requires that all BOUNDs be lower than all NBOUNDs
4342              * in regcomp.sym */
4343             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4344                     sayNO;
4345             break;
4346
4347         case ANYOF:  /*  /[abc]/       */
4348             if (NEXTCHR_IS_EOS)
4349                 sayNO;
4350             if (utf8_target) {
4351                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4352                     sayNO;
4353                 locinput += UTF8SKIP(locinput);
4354                 break;
4355             }
4356             else {
4357                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4358                     sayNO;
4359                 locinput++;
4360                 break;
4361             }
4362             break;
4363
4364         /* Special char classes: \d, \w etc.
4365          * The defines start on line 166 or so */
4366         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4367                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4368                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4369                   ALNUMA, NALNUMA, isWORDCHAR_A,
4370                   alnum, "a");
4371
4372         case SPACEL:
4373             PL_reg_flags |= RF_tainted;
4374             if (NEXTCHR_IS_EOS) {
4375                 sayNO;
4376             }
4377             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4378                 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4379                     sayNO;
4380                 }
4381             }
4382             else if (! isSPACE_LC((U8) nextchr)) {
4383                     sayNO;
4384             }
4385             goto increment_locinput;
4386
4387         case NSPACEL:
4388             PL_reg_flags |= RF_tainted;
4389             if (NEXTCHR_IS_EOS) {
4390                 sayNO;
4391             }
4392             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4393                 if (isSPACE_LC_utf8((U8 *) locinput)) {
4394                     sayNO;
4395                 }
4396             }
4397             else if (isSPACE_LC(nextchr)) {
4398                     sayNO;
4399             }
4400             goto increment_locinput;
4401
4402         case SPACE:
4403             if (utf8_target) {
4404                 goto utf8_space;
4405             }
4406             /* FALL THROUGH */
4407         case SPACEA:
4408             if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4409                 sayNO;
4410             }
4411             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4412             locinput++;
4413             break;
4414
4415         case NSPACE:
4416             if (utf8_target) {
4417                 goto utf8_nspace;
4418             }
4419             /* FALL THROUGH */
4420         case NSPACEA:
4421             if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4422                 sayNO;
4423             }
4424             goto increment_locinput;
4425
4426         case SPACEU:
4427           utf8_space:
4428             if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4429                 sayNO;
4430             }
4431             goto increment_locinput;
4432
4433         case NSPACEU:
4434           utf8_nspace:
4435             if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4436                 sayNO;
4437             }
4438             goto increment_locinput;
4439
4440         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4441                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4442                 DIGITA, NDIGITA, isDIGIT_A,
4443                 digit, "0");
4444
4445         case POSIXA: /* /[[:ascii:]]/ etc */
4446             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4447                 sayNO;
4448             }
4449             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4450             locinput++;
4451             break;
4452
4453         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4454             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4455                 sayNO;
4456             }
4457             goto increment_locinput;
4458
4459         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4460                        a Unicode extended Grapheme Cluster */
4461             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4462               extended Grapheme Cluster is:
4463
4464                CR LF
4465                | Prepend* Begin Extend*
4466                | .
4467
4468                Begin is:           ( Special_Begin | ! Control )
4469                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4470                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4471                Control is:         [ GCB_Control | CR | LF ]
4472                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4473
4474                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4475                we can rewrite
4476
4477                    Begin is ( Regular_Begin + Special_Begin )
4478
4479                It turns out that 98.4% of all Unicode code points match
4480                Regular_Begin.  Doing it this way eliminates a table match in
4481                the previous implementation for almost all Unicode code points.
4482
4483                There is a subtlety with Prepend* which showed up in testing.
4484                Note that the Begin, and only the Begin, is required in:
4485                 | Prepend* Begin Extend*
4486                Also, Begin contains '! Control'.  A Prepend must be a
4487                '! Control', which means it must also be a Begin.  What it
4488                comes down to is this: if we match Prepend* and then find no
4489                suitable Begin afterwards, we can back off the last Prepend
4490                matched, and that one will itself be a suitable Begin.
4491             */
4492
4493             if (NEXTCHR_IS_EOS)
4494                 sayNO;
4495             if  (! utf8_target) {
4496
4497                 /* Match either CR LF  or '.', as all the other possibilities
4498                  * require utf8 */
4499                 locinput++;         /* Match the . or CR */
4500                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4501                                        match the LF */
4502                     && locinput < PL_regeol
4503                     && UCHARAT(locinput) == '\n')
4504                 {
4505                     locinput++;
4506                 }
4507             }
4508             else {
4509
4510                 /* Utf8: See if is ( CR LF ); already know that locinput <
4511                  * PL_regeol, so locinput+1 is in bounds */
4512                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4513                      && UCHARAT(locinput + 1) == '\n')
4514                 {
4515                     locinput += 2;
4516                 }
4517                 else {
4518                     STRLEN len;
4519
4520                     /* In case have to backtrack to beginning, then match '.' */
4521                     char *starting = locinput;
4522
4523                     /* In case have to backtrack the last prepend */
4524                     char *previous_prepend = NULL;
4525
4526                     LOAD_UTF8_CHARCLASS_GCB();
4527
4528                     /* Match (prepend)*   */
4529                     while (locinput < PL_regeol
4530                            && (len = is_GCB_Prepend_utf8(locinput)))
4531                     {
4532                         previous_prepend = locinput;
4533                         locinput += len;
4534                     }
4535
4536                     /* As noted above, if we matched a prepend character, but
4537                      * the next thing won't match, back off the last prepend we
4538                      * matched, as it is guaranteed to match the begin */
4539                     if (previous_prepend
4540                         && (locinput >=  PL_regeol
4541                             || (! swash_fetch(PL_utf8_X_regular_begin,
4542                                              (U8*)locinput, utf8_target)
4543                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4544                         )
4545                     {
4546                         locinput = previous_prepend;
4547                     }
4548
4549                     /* Note that here we know PL_regeol > locinput, as we
4550                      * tested that upon input to this switch case, and if we
4551                      * moved locinput forward, we tested the result just above
4552                      * and it either passed, or we backed off so that it will
4553                      * now pass */
4554                     if (swash_fetch(PL_utf8_X_regular_begin,
4555                                     (U8*)locinput, utf8_target)) {
4556                         locinput += UTF8SKIP(locinput);
4557                     }
4558                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4559
4560                         /* Here did not match the required 'Begin' in the
4561                          * second term.  So just match the very first
4562                          * character, the '.' of the final term of the regex */
4563                         locinput = starting + UTF8SKIP(starting);
4564                         goto exit_utf8;
4565                     } else {
4566
4567                         /* Here is a special begin.  It can be composed of
4568                          * several individual characters.  One possibility is
4569                          * RI+ */
4570                         if ((len = is_GCB_RI_utf8(locinput))) {
4571                             locinput += len;
4572                             while (locinput < PL_regeol
4573                                    && (len = is_GCB_RI_utf8(locinput)))
4574                             {
4575                                 locinput += len;
4576                             }
4577                         } else if ((len = is_GCB_T_utf8(locinput))) {
4578                             /* Another possibility is T+ */
4579                             locinput += len;
4580                             while (locinput < PL_regeol
4581                                 && (len = is_GCB_T_utf8(locinput)))
4582                             {
4583                                 locinput += len;
4584                             }
4585                         } else {
4586
4587                             /* Here, neither RI+ nor T+; must be some other
4588                              * Hangul.  That means it is one of the others: L,
4589                              * LV, LVT or V, and matches:
4590                              * L* (L | LVT T* | V V* T* | LV V* T*) */
4591
4592                             /* Match L*           */
4593                             while (locinput < PL_regeol
4594                                    && (len = is_GCB_L_utf8(locinput)))
4595                             {
4596                                 locinput += len;
4597                             }
4598
4599                             /* Here, have exhausted L*.  If the next character
4600                              * is not an LV, LVT, or V, then we must have matched
4601                              * at least one L, which satisfies the L+ alternative
4602                              * of the original equation: we have a complete
4603                              * Hangul syllable and are done. */
4604
4605                             if (locinput < PL_regeol
4606                                 && is_GCB_LV_LVT_V_utf8(locinput))
4607                             {
4608                                 /* Otherwise keep going.  Must be LV, LVT or V.
4609                                  * See if LVT, by first ruling out V, then LV */
4610                                 if (! is_GCB_V_utf8(locinput)
4611                                         /* All but every TCount one is LV */
4612                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4613                                                                          NULL)
4614                                                                         - SBASE)
4615                                         % TCount != 0)
4616                                 {
4617                                     locinput += UTF8SKIP(locinput);
4618                                 } else {
4619
4620                                     /* Must be  V or LV.  Take it, then match
4621                                      * V*     */
4622                                     locinput += UTF8SKIP(locinput);
4623                                     while (locinput < PL_regeol
4624                                            && (len = is_GCB_V_utf8(locinput)))
4625                                     {
4626                                         locinput += len;
4627                                     }
4628                                 }
4629
4630                                 /* And any of LV, LVT, or V can be followed
4631                                  * by T*            */
4632                                 while (locinput < PL_regeol
4633                                        && (len = is_GCB_T_utf8(locinput)))
4634                                 {
4635                                     locinput += len;
4636                                 }
4637                             }
4638                         }
4639                     }
4640
4641                     /* Match any extender */
4642                     while (locinput < PL_regeol
4643                             && swash_fetch(PL_utf8_X_extend,
4644                                             (U8*)locinput, utf8_target))
4645                     {
4646                         locinput += UTF8SKIP(locinput);
4647                     }
4648                 }
4649             exit_utf8:
4650                 if (locinput > PL_regeol) sayNO;
4651             }
4652             break;
4653
4654         case NREFFL:  /*  /\g{name}/il  */
4655         {   /* The capture buffer cases.  The ones beginning with N for the
4656                named buffers just convert to the equivalent numbered and
4657                pretend they were called as the corresponding numbered buffer
4658                op.  */
4659             /* don't initialize these in the declaration, it makes C++
4660                unhappy */
4661             char *s;
4662             char type;
4663             re_fold_t folder;
4664             const U8 *fold_array;
4665             UV utf8_fold_flags;
4666
4667             PL_reg_flags |= RF_tainted;
4668             folder = foldEQ_locale;
4669             fold_array = PL_fold_locale;
4670             type = REFFL;
4671             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4672             goto do_nref;
4673
4674         case NREFFA:  /*  /\g{name}/iaa  */
4675             folder = foldEQ_latin1;
4676             fold_array = PL_fold_latin1;
4677             type = REFFA;
4678             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4679             goto do_nref;
4680
4681         case NREFFU:  /*  /\g{name}/iu  */
4682             folder = foldEQ_latin1;
4683             fold_array = PL_fold_latin1;
4684             type = REFFU;
4685             utf8_fold_flags = 0;
4686             goto do_nref;
4687
4688         case NREFF:  /*  /\g{name}/i  */
4689             folder = foldEQ;
4690             fold_array = PL_fold;
4691             type = REFF;
4692             utf8_fold_flags = 0;
4693             goto do_nref;
4694
4695         case NREF:  /*  /\g{name}/   */
4696             type = REF;
4697             folder = NULL;
4698             fold_array = NULL;
4699             utf8_fold_flags = 0;
4700           do_nref:
4701
4702             /* For the named back references, find the corresponding buffer
4703              * number */
4704             n = reg_check_named_buff_matched(rex,scan);
4705
4706             if ( ! n ) {
4707                 sayNO;
4708             }
4709             goto do_nref_ref_common;
4710
4711         case REFFL:  /*  /\1/il  */
4712             PL_reg_flags |= RF_tainted;
4713             folder = foldEQ_locale;
4714             fold_array = PL_fold_locale;
4715             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4716             goto do_ref;
4717
4718         case REFFA:  /*  /\1/iaa  */
4719             folder = foldEQ_latin1;
4720             fold_array = PL_fold_latin1;
4721             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4722             goto do_ref;
4723
4724         case REFFU:  /*  /\1/iu  */
4725             folder = foldEQ_latin1;
4726             fold_array = PL_fold_latin1;
4727             utf8_fold_flags = 0;
4728             goto do_ref;
4729
4730         case REFF:  /*  /\1/i  */
4731             folder = foldEQ;
4732             fold_array = PL_fold;
4733             utf8_fold_flags = 0;
4734             goto do_ref;
4735
4736         case REF:  /*  /\1/    */
4737             folder = NULL;
4738             fold_array = NULL;
4739             utf8_fold_flags = 0;
4740
4741           do_ref:
4742             type = OP(scan);
4743             n = ARG(scan);  /* which paren pair */
4744
4745           do_nref_ref_common:
4746             ln = rex->offs[n].start;
4747             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4748             if (rex->lastparen < n || ln == -1)
4749                 sayNO;                  /* Do not match unless seen CLOSEn. */
4750             if (ln == rex->offs[n].end)
4751                 break;
4752
4753             s = PL_bostr + ln;
4754             if (type != REF     /* REF can do byte comparison */
4755                 && (utf8_target || type == REFFU))
4756             { /* XXX handle REFFL better */
4757                 char * limit = PL_regeol;
4758
4759                 /* This call case insensitively compares the entire buffer
4760                     * at s, with the current input starting at locinput, but
4761                     * not going off the end given by PL_regeol, and returns in
4762                     * <limit> upon success, how much of the current input was
4763                     * matched */
4764                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4765                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4766                 {
4767                     sayNO;
4768                 }
4769                 locinput = limit;
4770                 break;
4771             }
4772
4773             /* Not utf8:  Inline the first character, for speed. */
4774             if (!NEXTCHR_IS_EOS &&
4775                 UCHARAT(s) != nextchr &&
4776                 (type == REF ||
4777                  UCHARAT(s) != fold_array[nextchr]))
4778                 sayNO;
4779             ln = rex->offs[n].end - ln;
4780             if (locinput + ln > PL_regeol)
4781                 sayNO;
4782             if (ln > 1 && (type == REF
4783                            ? memNE(s, locinput, ln)
4784                            : ! folder(s, locinput, ln)))
4785                 sayNO;
4786             locinput += ln;
4787             break;
4788         }
4789
4790         case NOTHING: /* null op; e.g. the 'nothing' following
4791                        * the '*' in m{(a+|b)*}' */
4792             break;
4793         case TAIL: /* placeholder while compiling (A|B|C) */
4794             break;
4795
4796         case BACK: /* ??? doesn't appear to be used ??? */
4797             break;
4798
4799 #undef  ST
4800 #define ST st->u.eval
4801         {
4802             SV *ret;
4803             REGEXP *re_sv;
4804             regexp *re;
4805             regexp_internal *rei;
4806             regnode *startpoint;
4807
4808         case GOSTART: /*  (?R)  */
4809         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4810             if (cur_eval && cur_eval->locinput==locinput) {
4811                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4812                     Perl_croak(aTHX_ "Infinite recursion in regex");
4813                 if ( ++nochange_depth > max_nochange_depth )
4814                     Perl_croak(aTHX_
4815                         "Pattern subroutine nesting without pos change"
4816                         " exceeded limit in regex");
4817             } else {
4818                 nochange_depth = 0;
4819             }
4820             re_sv = rex_sv;
4821             re = rex;
4822             rei = rexi;
4823             if (OP(scan)==GOSUB) {
4824                 startpoint = scan + ARG2L(scan);
4825                 ST.close_paren = ARG(scan);
4826             } else {
4827                 startpoint = rei->program+1;
4828                 ST.close_paren = 0;
4829             }
4830             goto eval_recurse_doit;
4831             assert(0); /* NOTREACHED */
4832
4833         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4834             if (cur_eval && cur_eval->locinput==locinput) {
4835                 if ( ++nochange_depth > max_nochange_depth )
4836                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4837             } else {
4838                 nochange_depth = 0;
4839             }
4840             {
4841                 /* execute the code in the {...} */
4842
4843                 dSP;
4844                 IV before;
4845                 OP * const oop = PL_op;
4846                 COP * const ocurcop = PL_curcop;
4847                 OP *nop;
4848                 char *saved_regeol = PL_regeol;
4849                 struct re_save_state saved_state;
4850                 CV *newcv;
4851
4852                 /* save *all* paren positions */
4853                 regcppush(rex, 0, maxopenparen);
4854                 REGCP_SET(runops_cp);
4855
4856                 /* To not corrupt the existing regex state while executing the
4857                  * eval we would normally put it on the save stack, like with
4858                  * save_re_context. However, re-evals have a weird scoping so we
4859                  * can't just add ENTER/LEAVE here. With that, things like
4860                  *
4861                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4862                  *
4863                  * would break, as they expect the localisation to be unwound
4864                  * only when the re-engine backtracks through the bit that
4865                  * localised it.
4866                  *
4867                  * What we do instead is just save the state in a local C
4868                  * variable.
4869                  */
4870                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4871
4872                 PL_reg_state.re_reparsing = FALSE;
4873
4874                 if (!caller_cv)
4875                     caller_cv = find_runcv(NULL);
4876
4877                 n = ARG(scan);
4878
4879                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4880                     newcv = (ReANY(
4881                                                 (REGEXP*)(rexi->data->data[n])
4882                                             ))->qr_anoncv
4883                                         ;
4884                     nop = (OP*)rexi->data->data[n+1];
4885                 }
4886                 else if (rexi->data->what[n] == 'l') { /* literal code */
4887                     newcv = caller_cv;
4888                     nop = (OP*)rexi->data->data[n];
4889                     assert(CvDEPTH(newcv));
4890                 }
4891                 else {
4892                     /* literal with own CV */
4893                     assert(rexi->data->what[n] == 'L');
4894                     newcv = rex->qr_anoncv;
4895                     nop = (OP*)rexi->data->data[n];
4896                 }
4897
4898                 /* normally if we're about to execute code from the same
4899                  * CV that we used previously, we just use the existing
4900                  * CX stack entry. However, it's possible that in the
4901                  * meantime we may have backtracked, popped from the save
4902                  * stack, and undone the SAVECOMPPAD(s) associated with
4903                  * PUSH_MULTICALL; in which case PL_comppad no longer
4904                  * points to newcv's pad. */
4905                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4906                 {
4907                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4908                     if (last_pushed_cv) {
4909                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4910                     }
4911                     else {
4912                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4913                     }
4914                     last_pushed_cv = newcv;
4915                 }
4916                 else {
4917                     /* these assignments are just to silence compiler
4918                      * warnings */
4919                     multicall_cop = NULL;
4920                     newsp = NULL;
4921                 }
4922                 last_pad = PL_comppad;
4923
4924                 /* the initial nextstate you would normally execute
4925                  * at the start of an eval (which would cause error
4926                  * messages to come from the eval), may be optimised
4927                  * away from the execution path in the regex code blocks;
4928                  * so manually set PL_curcop to it initially */
4929                 {
4930                     OP *o = cUNOPx(nop)->op_first;
4931                     assert(o->op_type == OP_NULL);
4932                     if (o->op_targ == OP_SCOPE) {
4933                         o = cUNOPo->op_first;
4934                     }
4935                     else {
4936                         assert(o->op_targ == OP_LEAVE);
4937                         o = cUNOPo->op_first;
4938                         assert(o->op_type == OP_ENTER);
4939                         o = o->op_sibling;
4940                     }
4941
4942                     if (o->op_type != OP_STUB) {
4943                         assert(    o->op_type == OP_NEXTSTATE
4944                                 || o->op_type == OP_DBSTATE
4945                                 || (o->op_type == OP_NULL
4946                                     &&  (  o->op_targ == OP_NEXTSTATE
4947                                         || o->op_targ == OP_DBSTATE
4948                                         )
4949                                     )
4950                         );
4951                         PL_curcop = (COP*)o;
4952                     }
4953                 }
4954                 nop = nop->op_next;
4955
4956                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4957                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4958
4959                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4960
4961                 if (sv_yes_mark) {
4962                     SV *sv_mrk = get_sv("REGMARK", 1);
4963                     sv_setsv(sv_mrk, sv_yes_mark);
4964                 }
4965
4966                 /* we don't use MULTICALL here as we want to call the
4967                  * first op of the block of interest, rather than the
4968                  * first op of the sub */
4969                 before = (IV)(SP-PL_stack_base);
4970                 PL_op = nop;
4971                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4972                 SPAGAIN;
4973                 if ((IV)(SP-PL_stack_base) == before)
4974                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4975                 else {
4976                     ret = POPs;
4977                     PUTBACK;
4978                 }
4979
4980                 /* before restoring everything, evaluate the returned
4981                  * value, so that 'uninit' warnings don't use the wrong
4982                  * PL_op or pad. Also need to process any magic vars
4983                  * (e.g. $1) *before* parentheses are restored */
4984
4985                 PL_op = NULL;
4986
4987                 re_sv = NULL;
4988                 if (logical == 0)        /*   (?{})/   */
4989                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4990                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4991                     sw = cBOOL(SvTRUE(ret));
4992                     logical = 0;
4993                 }
4994                 else {                   /*  /(??{})  */
4995                     /*  if it's overloaded, let the regex compiler handle
4996                      *  it; otherwise extract regex, or stringify  */
4997                     if (!SvAMAGIC(ret)) {
4998                         SV *sv = ret;
4999                         if (SvROK(sv))
5000                             sv = SvRV(sv);
5001                         if (SvTYPE(sv) == SVt_REGEXP)
5002                             re_sv = (REGEXP*) sv;
5003                         else if (SvSMAGICAL(sv)) {
5004                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5005                             if (mg)
5006                                 re_sv = (REGEXP *) mg->mg_obj;
5007                         }
5008
5009                         /* force any magic, undef warnings here */
5010                         if (!re_sv) {
5011                             ret = sv_mortalcopy(ret);
5012                             (void) SvPV_force_nolen(ret);
5013                         }
5014                     }
5015
5016                 }
5017
5018                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5019
5020                 /* *** Note that at this point we don't restore
5021                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5022                  * be used again soon. This is safe as long as nothing
5023                  * in the regexp code uses the pad ! */
5024                 PL_op = oop;
5025                 PL_curcop = ocurcop;
5026                 PL_regeol = saved_regeol;
5027                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5028
5029                 if (logical != 2)
5030                     break;
5031             }
5032
5033                 /* only /(??{})/  from now on */
5034                 logical = 0;
5035                 {
5036                     /* extract RE object from returned value; compiling if
5037                      * necessary */
5038
5039                     if (re_sv) {
5040                         re_sv = reg_temp_copy(NULL, re_sv);
5041                     }
5042                     else {
5043                         U32 pm_flags = 0;
5044
5045                         if (SvUTF8(ret) && IN_BYTES) {
5046                             /* In use 'bytes': make a copy of the octet
5047                              * sequence, but without the flag on */
5048                             STRLEN len;
5049                             const char *const p = SvPV(ret, len);
5050                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5051                         }
5052                         if (rex->intflags & PREGf_USE_RE_EVAL)
5053                             pm_flags |= PMf_USE_RE_EVAL;
5054
5055                         /* if we got here, it should be an engine which
5056                          * supports compiling code blocks and stuff */
5057                         assert(rex->engine && rex->engine->op_comp);
5058                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5059                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5060                                     rex->engine, NULL, NULL,
5061                                     /* copy /msix etc to inner pattern */
5062                                     scan->flags,
5063                                     pm_flags);
5064
5065                         if (!(SvFLAGS(ret)
5066                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5067                                  | SVs_GMG))) {
5068                             /* This isn't a first class regexp. Instead, it's
5069                                caching a regexp onto an existing, Perl visible
5070                                scalar.  */
5071                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5072                         }
5073                         /* safe to do now that any $1 etc has been
5074                          * interpolated into the new pattern string and
5075                          * compiled */
5076                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5077                     }
5078                     SAVEFREESV(re_sv);
5079                     re = ReANY(re_sv);
5080                 }
5081                 RXp_MATCH_COPIED_off(re);
5082                 re->subbeg = rex->subbeg;
5083                 re->sublen = rex->sublen;
5084                 re->suboffset = rex->suboffset;
5085                 re->subcoffset = rex->subcoffset;
5086                 rei = RXi_GET(re);
5087                 DEBUG_EXECUTE_r(
5088                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5089                         "Matching embedded");
5090                 );
5091                 startpoint = rei->program + 1;
5092                 ST.close_paren = 0; /* only used for GOSUB */
5093
5094         eval_recurse_doit: /* Share code with GOSUB below this line */
5095                 /* run the pattern returned from (??{...}) */
5096
5097                 /* Save *all* the positions. */
5098                 ST.cp = regcppush(rex, 0, maxopenparen);
5099                 REGCP_SET(ST.lastcp);
5100
5101                 re->lastparen = 0;
5102                 re->lastcloseparen = 0;
5103
5104                 maxopenparen = 0;
5105
5106                 /* XXXX This is too dramatic a measure... */
5107                 PL_reg_maxiter = 0;
5108
5109                 ST.toggle_reg_flags = PL_reg_flags;
5110                 if (RX_UTF8(re_sv))
5111                     PL_reg_flags |= RF_utf8;
5112                 else
5113                     PL_reg_flags &= ~RF_utf8;
5114                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5115
5116                 ST.prev_rex = rex_sv;
5117                 ST.prev_curlyx = cur_curlyx;
5118                 rex_sv = re_sv;
5119                 SET_reg_curpm(rex_sv);
5120                 rex = re;
5121                 rexi = rei;
5122                 cur_curlyx = NULL;
5123                 ST.B = next;
5124                 ST.prev_eval = cur_eval;
5125                 cur_eval = st;
5126                 /* now continue from first node in postponed RE */
5127                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5128                 assert(0); /* NOTREACHED */
5129         }
5130
5131         case EVAL_AB: /* cleanup after a successful (??{A})B */
5132             /* note: this is called twice; first after popping B, then A */
5133             PL_reg_flags ^= ST.toggle_reg_flags;
5134             rex_sv = ST.prev_rex;
5135             SET_reg_curpm(rex_sv);
5136             rex = ReANY(rex_sv);
5137             rexi = RXi_GET(rex);
5138             regcpblow(ST.cp);
5139             cur_eval = ST.prev_eval;
5140             cur_curlyx = ST.prev_curlyx;
5141
5142             /* XXXX This is too dramatic a measure... */
5143             PL_reg_maxiter = 0;
5144             if ( nochange_depth )
5145                 nochange_depth--;
5146             sayYES;
5147
5148
5149         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5150             /* note: this is called twice; first after popping B, then A */
5151             PL_reg_flags ^= ST.toggle_reg_flags;
5152             rex_sv = ST.prev_rex;
5153             SET_reg_curpm(rex_sv);
5154             rex = ReANY(rex_sv);
5155             rexi = RXi_GET(rex);
5156
5157             REGCP_UNWIND(ST.lastcp);
5158             regcppop(rex, &maxopenparen);
5159             cur_eval = ST.prev_eval;
5160             cur_curlyx = ST.prev_curlyx;
5161             /* XXXX This is too dramatic a measure... */
5162             PL_reg_maxiter = 0;
5163             if ( nochange_depth )
5164                 nochange_depth--;
5165             sayNO_SILENT;
5166 #undef ST
5167
5168         case OPEN: /*  (  */
5169             n = ARG(scan);  /* which paren pair */
5170             rex->offs[n].start_tmp = locinput - PL_bostr;
5171             if (n > maxopenparen)
5172                 maxopenparen = n;
5173             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5174                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5175                 PTR2UV(rex),
5176                 PTR2UV(rex->offs),
5177                 (UV)n,
5178                 (IV)rex->offs[n].start_tmp,
5179                 (UV)maxopenparen
5180             ));
5181             lastopen = n;
5182             break;
5183
5184 /* CLOSE_CAPTURE: finalise capture group n -- copy rex->offs[n].start_tmp into .start, set .end to locinput - PL_bostr, and print both via DEBUG_BUFFERS_r.  Uses rex, n, locinput and PL_bostr from the expansion site.  XXX really need to log other places start/end are set too */
5185 #define CLOSE_CAPTURE \
5186     rex->offs[n].start = rex->offs[n].start_tmp; \
5187     rex->offs[n].end = locinput - PL_bostr; \
5188     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5189         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5190         PTR2UV(rex), \
5191         PTR2UV(rex->offs), \
5192         (UV)n, \
5193         (IV)rex->offs[n].start, \
5194         (IV)rex->offs[n].end \
5195     ))
5196
5197         case CLOSE:  /*  )  */
5198             n = ARG(scan);  /* which paren pair */
5199             CLOSE_CAPTURE;
5200             if (n > rex->lastparen)
5201                 rex->lastparen = n;
5202             rex->lastcloseparen = n;
5203             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5204                 goto fake_end;
5205             }
5206             break;
5207
5208         case ACCEPT:  /*  (*ACCEPT)  */
5209             if (ARG(scan)){
5210                 regnode *cursor;
5211                 for (cursor=scan;
5212                      cursor && OP(cursor)!=END;
5213                      cursor=regnext(cursor))
5214                 {
5215                     if ( OP(cursor)==CLOSE ){
5216                         n = ARG(cursor);
5217                         if ( n <= lastopen ) {
5218                             CLOSE_CAPTURE;
5219                             if (n > rex->lastparen)
5220                                 rex->lastparen = n;
5221                             rex->lastcloseparen = n;
5222                             if ( n == ARG(scan) || (cur_eval &&
5223                                 cur_eval->u.eval.close_paren == n))
5224                                 break;
5225                         }
5226                     }
5227                 }
5228             }
5229             goto fake_end;
5230             /*NOTREACHED*/
5231
5232         case GROUPP:  /*  (?(1))  */
5233             n = ARG(scan);  /* which paren pair */
5234             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5235             break;
5236
5237         case NGROUPP:  /*  (?(<name>))  */
5238             /* reg_check_named_buff_matched returns 0 for no match */
5239             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5240             break;
5241
5242         case INSUBP:   /*  (?(R))  */
5243             n = ARG(scan);
5244             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5245             break;
5246
5247         case DEFINEP:  /*  (?(DEFINE))  */
5248             sw = 0;
5249             break;
5250
5251         case IFTHEN:   /*  (?(cond)A|B)  */
5252             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5253             if (sw)
5254                 next = NEXTOPER(NEXTOPER(scan));
5255             else {
5256                 next = scan + ARG(scan);
5257                 if (OP(next) == IFTHEN) /* Fake one. */
5258                     next = NEXTOPER(NEXTOPER(next));
5259             }
5260             break;
5261
5262         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5263             logical = scan->flags;
5264             break;
5265
5266 /*******************************************************************
5267
5268 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5269 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5270 STAR/PLUS/CURLY/CURLYN are used instead.)
5271
5272 A*B is compiled as <CURLYX><A><WHILEM><B>
5273
5274 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5275 state, which contains the current count, initialised to -1. It also sets
5276 cur_curlyx to point to this state, with any previous value saved in the
5277 state block.
5278
5279 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5280 since the pattern may possibly match zero times (i.e. it's a while {} loop
5281 rather than a do {} while loop).
5282
5283 Each entry to WHILEM represents a successful match of A. The count in the
5284 CURLYX block is incremented, another WHILEM state is pushed, and execution
5285 passes to A or B depending on greediness and the current count.
5286
5287 For example, if matching against the string a1a2a3b (where the aN are
5288 substrings that match /A/), then the match progresses as follows: (the
5289 pushed states are interspersed with the bits of strings matched so far):
5290
5291     <CURLYX cnt=-1>
5292     <CURLYX cnt=0><WHILEM>
5293     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5294     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5295     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5296     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5297
5298 (Contrast this with something like CURLYM, which maintains only a single
5299 backtrack state:
5300
5301     <CURLYM cnt=0> a1
5302     a1 <CURLYM cnt=1> a2
5303     a1 a2 <CURLYM cnt=2> a3
5304     a1 a2 a3 <CURLYM cnt=3> b
5305 )
5306
5307 Each WHILEM state block marks a point to backtrack to upon partial failure
5308 of A or B, and also contains some minor state data related to that
5309 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5310 overall state, such as the count, and pointers to the A and B ops.
5311
5312 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5313 must always point to the *current* CURLYX block, the rules are:
5314
5315 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5316 and set cur_curlyx to point to the new block.
5317
5318 When popping the CURLYX block after a successful or unsuccessful match,
5319 restore the previous cur_curlyx.
5320
5321 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5322 to the outer one saved in the CURLYX block.
5323
5324 When popping the WHILEM block after a successful or unsuccessful B match,
5325 restore the previous cur_curlyx.
5326
5327 Here's an example for the pattern (AI* BI)*BO
5328 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5329
5330 cur_
5331 curlyx backtrack stack
5332 ------ ---------------
5333 NULL
5334 CO     <CO prev=NULL> <WO>
5335 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5336 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5337 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5338
5339 At this point the pattern succeeds, and we work back down the stack to
5340 clean up, restoring as we go:
5341
5342 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5343 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5344 CO     <CO prev=NULL> <WO>
5345 NULL
5346
5347 *******************************************************************/
5348
5349 #define ST st->u.curlyx
5350
5351         case CURLYX:    /* start of /A*B/  (for complex A) */
5352         {
5353             /* No need to save/restore up to this paren */
5354             I32 parenfloor = scan->flags;
5355
5356             assert(next); /* keep Coverity happy */
5357             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5358                 next += ARG(next);
5359
5360             /* XXXX Probably it is better to teach regpush to support
5361                parenfloor > maxopenparen ... */
5362             if (parenfloor > (I32)rex->lastparen)
5363                 parenfloor = rex->lastparen; /* Pessimization... */
5364
5365             ST.prev_curlyx= cur_curlyx;
5366             cur_curlyx = st;
5367             ST.cp = PL_savestack_ix;
5368
5369             /* these fields contain the state of the current curly.
5370              * they are accessed by subsequent WHILEMs */
5371             ST.parenfloor = parenfloor;
5372             ST.me = scan;
5373             ST.B = next;
5374             ST.minmod = minmod;
5375             minmod = 0;
5376             ST.count = -1;      /* this will be updated by WHILEM */
5377             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5378
5379             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5380             assert(0); /* NOTREACHED */
5381         }
5382
5383         case CURLYX_end: /* just finished matching all of A*B */
5384             cur_curlyx = ST.prev_curlyx;
5385             sayYES;
5386             assert(0); /* NOTREACHED */
5387
5388         case CURLYX_end_fail: /* just failed to match all of A*B */
5389             regcpblow(ST.cp);
5390             cur_curlyx = ST.prev_curlyx;
5391             sayNO;
5392             assert(0); /* NOTREACHED */
5393
5394
5395 #undef ST
5396 #define ST st->u.whilem
5397
5398         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5399         {
5400             /* see the discussion above about CURLYX/WHILEM */
5401             I32 n;
5402             int min = ARG1(cur_curlyx->u.curlyx.me);
5403             int max = ARG2(cur_curlyx->u.curlyx.me);
5404             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5405
5406             assert(cur_curlyx); /* keep Coverity happy */
5407             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5408             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5409             ST.cache_offset = 0;
5410             ST.cache_mask = 0;
5411
5412
5413             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5414                   "%*s  whilem: matched %ld out of %d..%d\n",
5415                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5416             );
5417
5418             /* First just match a string of min A's. */
5419
5420             if (n < min) {
5421                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5422                                     maxopenparen);
5423                 cur_curlyx->u.curlyx.lastloc = locinput;
5424                 REGCP_SET(ST.lastcp);
5425
5426                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5427                 assert(0); /* NOTREACHED */
5428             }
5429
5430             /* If degenerate A matches "", assume A done. */
5431
5432             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5433                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5434                    "%*s  whilem: empty match detected, trying continuation...\n",
5435                    REPORT_CODE_OFF+depth*2, "")
5436                 );
5437                 goto do_whilem_B_max;
5438             }
5439
5440             /* super-linear cache processing */
5441
5442             if (scan->flags) {
5443
5444                 if (!PL_reg_maxiter) {
5445                     /* start the countdown: Postpone detection until we
5446                      * know the match is not *that* much linear. */
5447                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5448                     /* possible overflow for long strings and many CURLYX's */
5449                     if (PL_reg_maxiter < 0)
5450                         PL_reg_maxiter = I32_MAX;
5451                     PL_reg_leftiter = PL_reg_maxiter;
5452                 }
5453
5454                 if (PL_reg_leftiter-- == 0) {
5455                     /* initialise cache */
5456                     const I32 size = (PL_reg_maxiter + 7)/8;
5457                     if (PL_reg_poscache) {
5458                         if ((I32)PL_reg_poscache_size < size) {
5459                             Renew(PL_reg_poscache, size, char);
5460                             PL_reg_poscache_size = size;
5461                         }
5462                         Zero(PL_reg_poscache, size, char);
5463                     }
5464                     else {
5465                         PL_reg_poscache_size = size;
5466                         Newxz(PL_reg_poscache, size, char);
5467                     }
5468                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5469       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5470                               PL_colors[4], PL_colors[5])
5471                     );
5472                 }
5473
5474                 if (PL_reg_leftiter < 0) {
5475                     /* have we already failed at this position? */
5476                     I32 offset, mask;
5477                     offset  = (scan->flags & 0xf) - 1
5478                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5479                     mask    = 1 << (offset % 8);
5480                     offset /= 8;
5481                     if (PL_reg_poscache[offset] & mask) {
5482                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5483                             "%*s  whilem: (cache) already tried at this position...\n",
5484                             REPORT_CODE_OFF+depth*2, "")
5485                         );
5486                         sayNO; /* cache records failure */
5487                     }
5488                     ST.cache_offset = offset;
5489                     ST.cache_mask   = mask;
5490                 }
5491             }
5492
5493             /* Prefer B over A for minimal matching. */
5494
5495             if (cur_curlyx->u.curlyx.minmod) {
5496                 ST.save_curlyx = cur_curlyx;
5497                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5498                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5499                             maxopenparen);
5500                 REGCP_SET(ST.lastcp);
5501                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5502                                     locinput);
5503                 assert(0); /* NOTREACHED */
5504             }
5505
5506             /* Prefer A over B for maximal matching. */
5507
5508             if (n < max) { /* More greed allowed? */
5509                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5510                             maxopenparen);
5511                 cur_curlyx->u.curlyx.lastloc = locinput;
5512                 REGCP_SET(ST.lastcp);
5513                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5514                 assert(0); /* NOTREACHED */
5515             }
5516             goto do_whilem_B_max;
5517         }
5518         assert(0); /* NOTREACHED */
5519
5520         case WHILEM_B_min: /* just matched B in a minimal match */
5521         case WHILEM_B_max: /* just matched B in a maximal match */
5522             cur_curlyx = ST.save_curlyx;
5523             sayYES;
5524             assert(0); /* NOTREACHED */
5525
5526         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5527             cur_curlyx = ST.save_curlyx;
5528             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5529             cur_curlyx->u.curlyx.count--;
5530             CACHEsayNO;
5531             assert(0); /* NOTREACHED */
5532
5533         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5534             /* FALL THROUGH */
5535         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5536             REGCP_UNWIND(ST.lastcp);
5537             regcppop(rex, &maxopenparen);
5538             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5539             cur_curlyx->u.curlyx.count--;
5540             CACHEsayNO;
5541             assert(0); /* NOTREACHED */
5542
5543         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5544             REGCP_UNWIND(ST.lastcp);
5545             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5546             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5547                 "%*s  whilem: failed, trying continuation...\n",
5548                 REPORT_CODE_OFF+depth*2, "")
5549             );
5550           do_whilem_B_max:
5551             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5552                 && ckWARN(WARN_REGEXP)
5553                 && !(PL_reg_flags & RF_warned))
5554             {
5555                 PL_reg_flags |= RF_warned;
5556                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5557                      "Complex regular subexpression recursion limit (%d) "
5558                      "exceeded",
5559                      REG_INFTY - 1);
5560             }
5561
5562             /* now try B */
5563             ST.save_curlyx = cur_curlyx;
5564             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5565             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5566                                 locinput);
5567             assert(0); /* NOTREACHED */
5568
5569         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5570             cur_curlyx = ST.save_curlyx;
5571             REGCP_UNWIND(ST.lastcp);
5572             regcppop(rex, &maxopenparen);
5573
5574             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5575                 /* Maximum greed exceeded */
5576                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5577                     && ckWARN(WARN_REGEXP)
5578                     && !(PL_reg_flags & RF_warned))
5579                 {
5580                     PL_reg_flags |= RF_warned;
5581                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5582                         "Complex regular subexpression recursion "
5583                         "limit (%d) exceeded",
5584                         REG_INFTY - 1);
5585                 }
5586                 cur_curlyx->u.curlyx.count--;
5587                 CACHEsayNO;
5588             }
5589
5590             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5591                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5592             );
5593             /* Try grabbing another A and see if it helps. */
5594             cur_curlyx->u.curlyx.lastloc = locinput;
5595             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5596                             maxopenparen);
5597             REGCP_SET(ST.lastcp);
5598             PUSH_STATE_GOTO(WHILEM_A_min,
5599                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5600                 locinput);
5601             assert(0); /* NOTREACHED */
5602
5603 #undef  ST
5604 #define ST st->u.branch
5605
5606         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5607             next = scan + ARG(scan);
5608             if (next == scan)
5609                 next = NULL;
5610             scan = NEXTOPER(scan);
5611             /* FALL THROUGH */
5612
5613         case BRANCH:        /*  /(...|A|...)/ */
5614             scan = NEXTOPER(scan); /* scan now points to inner node */
5615             ST.lastparen = rex->lastparen;
5616             ST.lastcloseparen = rex->lastcloseparen;
5617             ST.next_branch = next;
5618             REGCP_SET(ST.cp);
5619
5620             /* Now go into the branch */
5621             if (has_cutgroup) {
5622                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5623             } else {
5624                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5625             }
5626             assert(0); /* NOTREACHED */
5627
5628         case CUTGROUP:  /*  /(*THEN)/  */
5629             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5630                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5631             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5632             assert(0); /* NOTREACHED */
5633
5634         case CUTGROUP_next_fail:
5635             do_cutgroup = 1;
5636             no_final = 1;
5637             if (st->u.mark.mark_name)
5638                 sv_commit = st->u.mark.mark_name;
5639             sayNO;
5640             assert(0); /* NOTREACHED */
5641
5642         case BRANCH_next:
5643             sayYES;
5644             assert(0); /* NOTREACHED */
5645
5646         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5647             if (do_cutgroup) {
5648                 do_cutgroup = 0;
5649                 no_final = 0;
5650             }
5651             REGCP_UNWIND(ST.cp);
5652             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5653             scan = ST.next_branch;
5654             /* no more branches? */
5655             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5656                 DEBUG_EXECUTE_r({
5657                     PerlIO_printf( Perl_debug_log,
5658                         "%*s  %sBRANCH failed...%s\n",
5659                         REPORT_CODE_OFF+depth*2, "",
5660                         PL_colors[4],
5661                         PL_colors[5] );
5662                 });
5663                 sayNO_SILENT;
5664             }
5665             continue; /* execute next BRANCH[J] op */
5666             assert(0); /* NOTREACHED */
5667
5668         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5669             minmod = 1;
5670             break;
5671
5672 #undef  ST
5673 #define ST st->u.curlym
5674
5675         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5676
5677             /* This is an optimisation of CURLYX that enables us to push
5678              * only a single backtracking state, no matter how many matches
5679              * there are in {m,n}. It relies on the pattern being constant
5680              * length, with no parens to influence future backrefs
5681              */
5682
5683             ST.me = scan;
5684             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5685
5686             ST.lastparen      = rex->lastparen;
5687             ST.lastcloseparen = rex->lastcloseparen;
5688
5689             /* if paren positive, emulate an OPEN/CLOSE around A */
5690             if (ST.me->flags) {
5691                 U32 paren = ST.me->flags;
5692                 if (paren > maxopenparen)
5693                     maxopenparen = paren;
5694                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5695             }
5696             ST.A = scan;
5697             ST.B = next;
5698             ST.alen = 0;
5699             ST.count = 0;
5700             ST.minmod = minmod;
5701             minmod = 0;
5702             ST.c1 = CHRTEST_UNINIT;
5703             REGCP_SET(ST.cp);
5704
5705             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5706                 goto curlym_do_B;
5707
5708           curlym_do_A: /* execute the A in /A{m,n}B/  */
5709             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5710             assert(0); /* NOTREACHED */
5711
5712         case CURLYM_A: /* we've just matched an A */
5713             ST.count++;
5714             /* after first match, determine A's length: u.curlym.alen */
5715             if (ST.count == 1) {
5716                 if (PL_reg_match_utf8) {
5717                     char *s = st->locinput;
5718                     while (s < locinput) {
5719                         ST.alen++;
5720                         s += UTF8SKIP(s);
5721                     }
5722                 }
5723                 else {
5724                     ST.alen = locinput - st->locinput;
5725                 }
5726                 if (ST.alen == 0)
5727                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5728             }
5729             DEBUG_EXECUTE_r(
5730                 PerlIO_printf(Perl_debug_log,
5731                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5732                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5733                           (IV) ST.count, (IV)ST.alen)
5734             );
5735
5736             if (cur_eval && cur_eval->u.eval.close_paren &&
5737                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5738                 goto fake_end;
5739
5740             {
5741                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5742                 if ( max == REG_INFTY || ST.count < max )
5743                     goto curlym_do_A; /* try to match another A */
5744             }
5745             goto curlym_do_B; /* try to match B */
5746
5747         case CURLYM_A_fail: /* just failed to match an A */
5748             REGCP_UNWIND(ST.cp);
5749
5750             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5751                 || (cur_eval && cur_eval->u.eval.close_paren &&
5752                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5753                 sayNO;
5754
5755           curlym_do_B: /* execute the B in /A{m,n}B/  */
5756             if (ST.c1 == CHRTEST_UNINIT) {
5757                 /* calculate c1 and c2 for possible match of 1st char
5758                  * following curly */
5759                 ST.c1 = ST.c2 = CHRTEST_VOID;
5760                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5761                     regnode *text_node = ST.B;
5762                     if (! HAS_TEXT(text_node))
5763                         FIND_NEXT_IMPT(text_node);
5764                     /* This used to be
5765
5766                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5767
5768                         but the former is redundant in light of the latter.
5769
5770                         If this changes back then the macros for
5771                         IS_TEXT and friends need to change.
5772                      */
5773                     if (PL_regkind[OP(text_node)] == EXACT) {
5774                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5775                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5776                         {
5777                             sayNO;
5778                         }
5779                     }
5780                 }
5781             }
5782
5783             DEBUG_EXECUTE_r(
5784                 PerlIO_printf(Perl_debug_log,
5785                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5786                     (int)(REPORT_CODE_OFF+(depth*2)),
5787                     "", (IV)ST.count)
5788                 );
5789             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5790                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5791                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5792                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5793                     {
5794                         /* simulate B failing */
5795                         DEBUG_OPTIMISE_r(
5796                             PerlIO_printf(Perl_debug_log,
5797                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5798                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5799                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5800                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5801                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5802                         );
5803                         state_num = CURLYM_B_fail;
5804                         goto reenter_switch;
5805                     }
5806                 }
5807                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5808                     /* simulate B failing */
5809                     DEBUG_OPTIMISE_r(
5810                         PerlIO_printf(Perl_debug_log,
5811                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5812                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5813                             (int) nextchr, ST.c1, ST.c2)
5814                     );
5815                     state_num = CURLYM_B_fail;
5816                     goto reenter_switch;
5817                 }
5818             }
5819
5820             if (ST.me->flags) {
5821                 /* emulate CLOSE: mark current A as captured */
5822                 I32 paren = ST.me->flags;
5823                 if (ST.count) {
5824                     rex->offs[paren].start
5825                         = HOPc(locinput, -ST.alen) - PL_bostr;
5826                     rex->offs[paren].end = locinput - PL_bostr;
5827                     if ((U32)paren > rex->lastparen)
5828                         rex->lastparen = paren;
5829                     rex->lastcloseparen = paren;
5830                 }
5831                 else
5832                     rex->offs[paren].end = -1;
5833                 if (cur_eval && cur_eval->u.eval.close_paren &&
5834                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5835                 {
5836                     if (ST.count)
5837                         goto fake_end;
5838                     else
5839                         sayNO;
5840                 }
5841             }
5842
5843             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5844             assert(0); /* NOTREACHED */
5845
5846         case CURLYM_B_fail: /* just failed to match a B */
5847             REGCP_UNWIND(ST.cp);
5848             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5849             if (ST.minmod) {
5850                 I32 max = ARG2(ST.me);
5851                 if (max != REG_INFTY && ST.count == max)
5852                     sayNO;
5853                 goto curlym_do_A; /* try to match a further A */
5854             }
5855             /* backtrack one A */
5856             if (ST.count == ARG1(ST.me) /* min */)
5857                 sayNO;
5858             ST.count--;
5859             SET_locinput(HOPc(locinput, -ST.alen));
5860             goto curlym_do_B; /* try to match B */
5861
5862 #undef ST
5863 #define ST st->u.curly
5864
5865 #define CURLY_SETPAREN(paren, success) /* record or reset capture group 'paren' */ \
5866     if (paren) { /* paren 0 means no group to update: expands to a no-op */ \
5867         if (success) { /* group covers HOPc(locinput, -1) .. locinput */ \
5868             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5869             rex->offs[paren].end = locinput - PL_bostr; \
5870             if (paren > rex->lastparen) /* only ever raises lastparen */ \
5871                 rex->lastparen = paren; \
5872             rex->lastcloseparen = paren; \
5873         } \
5874         else { /* end = -1, and put back the values saved in ST */ \
5875             rex->offs[paren].end = -1; \
5876             rex->lastparen      = ST.lastparen; \
5877             rex->lastcloseparen = ST.lastcloseparen; \
5878         } \
5879     } /* NB: a bare 'if', not STMT_START/STMT_END; uses caller's rex, locinput, ST */
5880
5881         case STAR:              /*  /A*B/ where A is width 1 char */
5882             ST.paren = 0;
5883             ST.min = 0;
5884             ST.max = REG_INFTY;
5885             scan = NEXTOPER(scan);
5886             goto repeat;
5887
5888         case PLUS:              /*  /A+B/ where A is width 1 char */
5889             ST.paren = 0;
5890             ST.min = 1;
5891             ST.max = REG_INFTY;
5892             scan = NEXTOPER(scan);
5893             goto repeat;
5894
5895         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5896             ST.paren = scan->flags;     /* Which paren to set */
5897             ST.lastparen      = rex->lastparen;
5898             ST.lastcloseparen = rex->lastcloseparen;
5899             if (ST.paren > maxopenparen)
5900                 maxopenparen = ST.paren;
5901             ST.min = ARG1(scan);  /* min to match */
5902             ST.max = ARG2(scan);  /* max to match */
5903             if (cur_eval && cur_eval->u.eval.close_paren &&
5904                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5905                 ST.min=1;
5906                 ST.max=1;
5907             }
5908             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5909             goto repeat;
5910
5911         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5912             ST.paren = 0;
5913             ST.min = ARG1(scan);  /* min to match */
5914             ST.max = ARG2(scan);  /* max to match */
5915             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5916           repeat:
5917             /*
5918             * Lookahead to avoid useless match attempts
5919             * when we know what character comes next.
5920             *
5921             * Used to only do .*x and .*?x, but now it allows
5922             * for )'s, ('s and (?{ ... })'s to be in the way
5923             * of the quantifier and the EXACT-like node.  -- japhy
5924             */
5925
5926             assert(ST.min <= ST.max);
5927             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5928                 ST.c1 = ST.c2 = CHRTEST_VOID;
5929             }
5930             else {
5931                 regnode *text_node = next;
5932
5933                 if (! HAS_TEXT(text_node))
5934                     FIND_NEXT_IMPT(text_node);
5935
5936                 if (! HAS_TEXT(text_node))
5937                     ST.c1 = ST.c2 = CHRTEST_VOID;
5938                 else {
5939                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5940                         ST.c1 = ST.c2 = CHRTEST_VOID;
5941                     }
5942                     else {
5943
5944                     /*  Currently we only get here when
5945
5946                         PL_regkind[OP(text_node)] == EXACT
5947
5948                         If this changes back then the macros for IS_TEXT
5949                         and friends need to change. */
5950                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5951                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5952                         {
5953                             sayNO;
5954                         }
5955                     }
5956                 }
5957             }
5958
5959             ST.A = scan;
5960             ST.B = next;
5961             if (minmod) {
5962                 char *li = locinput;
5963                 minmod = 0;
5964                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5965                     sayNO;
5966                 SET_locinput(li);
5967                 ST.count = ST.min;
5968                 REGCP_SET(ST.cp);
5969                 if (ST.c1 == CHRTEST_VOID)
5970                     goto curly_try_B_min;
5971
5972                 ST.oldloc = locinput;
5973
5974                 /* set ST.maxpos to the furthest point along the
5975                  * string that could possibly match */
5976                 if  (ST.max == REG_INFTY) {
5977                     ST.maxpos = PL_regeol - 1;
5978                     if (utf8_target)
5979                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5980                             ST.maxpos--;
5981                 }
5982                 else if (utf8_target) {
5983                     int m = ST.max - ST.min;
5984                     for (ST.maxpos = locinput;
5985                          m >0 && ST.maxpos < PL_regeol; m--)
5986                         ST.maxpos += UTF8SKIP(ST.maxpos);
5987                 }
5988                 else {
5989                     ST.maxpos = locinput + ST.max - ST.min;
5990                     if (ST.maxpos >= PL_regeol)
5991                         ST.maxpos = PL_regeol - 1;
5992                 }
5993                 goto curly_try_B_min_known;
5994
5995             }
5996             else {
5997                 /* avoid taking address of locinput, so it can remain
5998                  * a register var */
5999                 char *li = locinput;
6000                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
6001                 if (ST.count < ST.min)
6002                     sayNO;
6003                 SET_locinput(li);
6004                 if ((ST.count > ST.min)
6005                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6006                 {
6007                     /* A{m,n} must come at the end of the string, there's
6008                      * no point in backing off ... */
6009                     ST.min = ST.count;
6010                     /* ...except that $ and \Z can match before *and* after
6011                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6012                        We may back off by one in this case. */
6013                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6014                         ST.min--;
6015                 }
6016                 REGCP_SET(ST.cp);
6017                 goto curly_try_B_max;
6018             }
6019             assert(0); /* NOTREACHED */
6020
6021
6022         case CURLY_B_min_known_fail:
6023             /* failed to find B in a non-greedy match where c1,c2 valid */
6024
6025             REGCP_UNWIND(ST.cp);
6026             if (ST.paren) {
6027                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6028             }
6029             /* Couldn't or didn't -- move forward. */
6030             ST.oldloc = locinput;
6031             if (utf8_target)
6032                 locinput += UTF8SKIP(locinput);
6033             else
6034                 locinput++;
6035             ST.count++;
6036           curly_try_B_min_known:
6037              /* find the next place where 'B' could work, then call B */
6038             {
6039                 int n;
6040                 if (utf8_target) {
6041                     n = (ST.oldloc == locinput) ? 0 : 1;
6042                     if (ST.c1 == ST.c2) {
6043                         /* set n to utf8_distance(oldloc, locinput) */
6044                         while (locinput <= ST.maxpos
6045                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6046                         {
6047                             locinput += UTF8SKIP(locinput);
6048                             n++;
6049                         }
6050                     }
6051                     else {
6052                         /* set n to utf8_distance(oldloc, locinput) */
6053                         while (locinput <= ST.maxpos
6054                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6055                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6056                         {
6057                             locinput += UTF8SKIP(locinput);
6058                             n++;
6059                         }
6060                     }
6061                 }
6062                 else {  /* Not utf8_target */
6063                     if (ST.c1 == ST.c2) {
6064                         while (locinput <= ST.maxpos &&
6065                                UCHARAT(locinput) != ST.c1)
6066                             locinput++;
6067                     }
6068                     else {
6069                         while (locinput <= ST.maxpos
6070                                && UCHARAT(locinput) != ST.c1
6071                                && UCHARAT(locinput) != ST.c2)
6072                             locinput++;
6073                     }
6074                     n = locinput - ST.oldloc;
6075                 }
6076                 if (locinput > ST.maxpos)
6077                     sayNO;
6078                 if (n) {
6079                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6080                      * at b; check that everything between oldloc and
6081                      * locinput matches */
6082                     char *li = ST.oldloc;
6083                     ST.count += n;
6084                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
6085                         sayNO;
6086                     assert(n == REG_INFTY || locinput == li);
6087                 }
6088                 CURLY_SETPAREN(ST.paren, ST.count);
6089                 if (cur_eval && cur_eval->u.eval.close_paren &&
6090                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6091                     goto fake_end;
6092                 }
6093                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6094             }
6095             assert(0); /* NOTREACHED */
6096
6097
6098         case CURLY_B_min_fail:
6099             /* failed to find B in a non-greedy match where c1,c2 invalid */
6100
6101             REGCP_UNWIND(ST.cp);
6102             if (ST.paren) {
6103                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6104             }
6105             /* failed -- move forward one */
6106             {
6107                 char *li = locinput;
6108                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6109                     sayNO;
6110                 }
6111                 locinput = li;
6112             }
6113             {
6114                 ST.count++;
6115                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6116                         ST.count > 0)) /* count overflow ? */
6117                 {
6118                   curly_try_B_min:
6119                     CURLY_SETPAREN(ST.paren, ST.count);
6120                     if (cur_eval && cur_eval->u.eval.close_paren &&
6121                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6122                         goto fake_end;
6123                     }
6124                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6125                 }
6126             }
6127             sayNO;
6128             assert(0); /* NOTREACHED */
6129
6130
6131         curly_try_B_max:
6132             /* a successful greedy match: now try to match B */
6133             if (cur_eval && cur_eval->u.eval.close_paren &&
6134                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6135                 goto fake_end;
6136             }
6137             {
6138                 bool could_match = locinput < PL_regeol;
6139
6140                 /* If it could work, try it. */
6141                 if (ST.c1 != CHRTEST_VOID && could_match) {
6142                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6143                     {
6144                         could_match = memEQ(locinput,
6145                                             ST.c1_utf8,
6146                                             UTF8SKIP(locinput))
6147                                     || memEQ(locinput,
6148                                              ST.c2_utf8,
6149                                              UTF8SKIP(locinput));
6150                     }
6151                     else {
6152                         could_match = UCHARAT(locinput) == ST.c1
6153                                       || UCHARAT(locinput) == ST.c2;
6154                     }
6155                 }
6156                 if (ST.c1 == CHRTEST_VOID || could_match) {
6157                     CURLY_SETPAREN(ST.paren, ST.count);
6158                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6159                     assert(0); /* NOTREACHED */
6160                 }
6161             }
6162             /* FALL THROUGH */
6163
6164         case CURLY_B_max_fail:
6165             /* failed to find B in a greedy match */
6166
6167             REGCP_UNWIND(ST.cp);
6168             if (ST.paren) {
6169                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6170             }
6171             /*  back up. */
6172             if (--ST.count < ST.min)
6173                 sayNO;
6174             locinput = HOPc(locinput, -1);
6175             goto curly_try_B_max;
6176
6177 #undef ST
6178
6179         case END: /*  last op of main pattern  */
6180             fake_end:
6181             if (cur_eval) {
6182                 /* we've just finished A in /(??{A})B/; now continue with B */
6183                 st->u.eval.toggle_reg_flags
6184                             = cur_eval->u.eval.toggle_reg_flags;
6185                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6186
6187                 st->u.eval.prev_rex = rex_sv;           /* inner */
6188
6189                 /* Save *all* the positions. */
6190                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6191                 rex_sv = cur_eval->u.eval.prev_rex;
6192                 SET_reg_curpm(rex_sv);
6193                 rex = ReANY(rex_sv);
6194                 rexi = RXi_GET(rex);
6195                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6196
6197                 REGCP_SET(st->u.eval.lastcp);
6198
6199                 /* Restore parens of the outer rex without popping the
6200                  * savestack */
6201                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6202                                         &maxopenparen);
6203
6204                 st->u.eval.prev_eval = cur_eval;
6205                 cur_eval = cur_eval->u.eval.prev_eval;
6206                 DEBUG_EXECUTE_r(
6207                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6208                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6209                 if ( nochange_depth )
6210                     nochange_depth--;
6211
6212                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6213                                     locinput); /* match B */
6214             }
6215
6216             if (locinput < reginfo->till) {
6217                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6218                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6219                                       PL_colors[4],
6220                                       (long)(locinput - PL_reg_starttry),
6221                                       (long)(reginfo->till - PL_reg_starttry),
6222                                       PL_colors[5]));
6223
6224                 sayNO_SILENT;           /* Cannot match: too short. */
6225             }
6226             sayYES;                     /* Success! */
6227
6228         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6229             DEBUG_EXECUTE_r(
6230             PerlIO_printf(Perl_debug_log,
6231                 "%*s  %ssubpattern success...%s\n",
6232                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6233             sayYES;                     /* Success! */
6234
6235 #undef  ST
6236 #define ST st->u.ifmatch
6237
6238         {
6239             char *newstart;
6240
6241         case SUSPEND:   /* (?>A) */
6242             ST.wanted = 1;
6243             newstart = locinput;
6244             goto do_ifmatch;
6245
6246         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6247             ST.wanted = 0;
6248             goto ifmatch_trivial_fail_test;
6249
6250         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6251             ST.wanted = 1;
6252           ifmatch_trivial_fail_test:
6253             if (scan->flags) {
6254                 char * const s = HOPBACKc(locinput, scan->flags);
6255                 if (!s) {
6256                     /* trivial fail */
6257                     if (logical) {
6258                         logical = 0;
6259                         sw = 1 - cBOOL(ST.wanted);
6260                     }
6261                     else if (ST.wanted)
6262                         sayNO;
6263                     next = scan + ARG(scan);
6264                     if (next == scan)
6265                         next = NULL;
6266                     break;
6267                 }
6268                 newstart = s;
6269             }
6270             else
6271                 newstart = locinput;
6272
6273           do_ifmatch:
6274             ST.me = scan;
6275             ST.logical = logical;
6276             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6277
6278             /* execute body of (?...A) */
6279             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6280             assert(0); /* NOTREACHED */
6281         }
6282
6283         case IFMATCH_A_fail: /* body of (?...A) failed */
6284             ST.wanted = !ST.wanted;
6285             /* FALL THROUGH */
6286
6287         case IFMATCH_A: /* body of (?...A) succeeded */
6288             if (ST.logical) {
6289                 sw = cBOOL(ST.wanted);
6290             }
6291             else if (!ST.wanted)
6292                 sayNO;
6293
6294             if (OP(ST.me) != SUSPEND) {
6295                 /* restore old position except for (?>...) */
6296                 locinput = st->locinput;
6297             }
6298             scan = ST.me + ARG(ST.me);
6299             if (scan == ST.me)
6300                 scan = NULL;
6301             continue; /* execute B */
6302
6303 #undef ST
6304
6305         case LONGJMP: /*  alternative with many branches compiles to
6306                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6307             next = scan + ARG(scan);
6308             if (next == scan)
6309                 next = NULL;
6310             break;
6311
6312         case COMMIT:  /*  (*COMMIT)  */
6313             reginfo->cutpoint = PL_regeol;
6314             /* FALLTHROUGH */
6315
6316         case PRUNE:   /*  (*PRUNE)   */
6317             if (!scan->flags)
6318                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6319             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6320             assert(0); /* NOTREACHED */
6321
6322         case COMMIT_next_fail:
6323             no_final = 1;
6324             /* FALLTHROUGH */
6325
6326         case OPFAIL:   /* (*FAIL)  */
6327             sayNO;
6328             assert(0); /* NOTREACHED */
6329
6330 #define ST st->u.mark
6331         case MARKPOINT: /*  (*MARK:foo)  */
6332             ST.prev_mark = mark_state;
6333             ST.mark_name = sv_commit = sv_yes_mark
6334                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6335             mark_state = st;
6336             ST.mark_loc = locinput;
6337             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6338             assert(0); /* NOTREACHED */
6339
6340         case MARKPOINT_next:
6341             mark_state = ST.prev_mark;
6342             sayYES;
6343             assert(0); /* NOTREACHED */
6344
6345         case MARKPOINT_next_fail:
6346             if (popmark && sv_eq(ST.mark_name,popmark))
6347             {
6348                 if (ST.mark_loc > startpoint)
6349                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6350                 popmark = NULL; /* we found our mark */
6351                 sv_commit = ST.mark_name;
6352
6353                 DEBUG_EXECUTE_r({
6354                         PerlIO_printf(Perl_debug_log,
6355                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6356                             REPORT_CODE_OFF+depth*2, "",
6357                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6358                 });
6359             }
6360             mark_state = ST.prev_mark;
6361             sv_yes_mark = mark_state ?
6362                 mark_state->u.mark.mark_name : NULL;
6363             sayNO;
6364             assert(0); /* NOTREACHED */
6365
6366         case SKIP:  /*  (*SKIP)  */
6367             if (scan->flags) {
6368                 /* (*SKIP) : if we fail we cut here*/
6369                 ST.mark_name = NULL;
6370                 ST.mark_loc = locinput;
6371                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6372             } else {
6373                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6374                    otherwise do nothing.  Meaning we need to scan
6375                  */
6376                 regmatch_state *cur = mark_state;
6377                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6378
6379                 while (cur) {
6380                     if ( sv_eq( cur->u.mark.mark_name,
6381                                 find ) )
6382                     {
6383                         ST.mark_name = find;
6384                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6385                     }
6386                     cur = cur->u.mark.prev_mark;
6387                 }
6388             }
6389             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6390             break;
6391
6392         case SKIP_next_fail:
6393             if (ST.mark_name) {
6394                 /* (*CUT:NAME) - Set up to search for the name as we
6395                    collapse the stack*/
6396                 popmark = ST.mark_name;
6397             } else {
6398                 /* (*CUT) - No name, we cut here.*/
6399                 if (ST.mark_loc > startpoint)
6400                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6401                 /* but we set sv_commit to latest mark_name if there
6402                    is one so they can test to see how things lead to this
6403                    cut */
6404                 if (mark_state)
6405                     sv_commit=mark_state->u.mark.mark_name;
6406             }
6407             no_final = 1;
6408             sayNO;
6409             assert(0); /* NOTREACHED */
6410 #undef ST
6411
6412         case LNBREAK: /* \R */
6413             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6414                 locinput += n;
6415             } else
6416                 sayNO;
6417             break;
6418
6419 #define CASE_CLASS(nAmE)                              \
6420         case nAmE:                                    \
6421             if (NEXTCHR_IS_EOS)                       \
6422                 sayNO;                                \
6423             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6424                 locinput += n;                        \
6425             } else                                    \
6426                 sayNO;                                \
6427             break;                                    \
6428         case N##nAmE:                                 \
6429             if (NEXTCHR_IS_EOS)                       \
6430                 sayNO;                                \
6431             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6432                 sayNO;                                \
6433             } else {                                  \
6434                 locinput += UTF8SKIP(locinput);       \
6435             }                                         \
6436             break
6437
6438         CASE_CLASS(VERTWS);  /*  \v \V  */
6439         CASE_CLASS(HORIZWS); /*  \h \H  */
6440 #undef CASE_CLASS
6441
6442         default:
6443             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6444                           PTR2UV(scan), OP(scan));
6445             Perl_croak(aTHX_ "regexp memory corruption");
6446
6447         /* this is a point to jump to in order to increment
6448          * locinput by one character */
6449         increment_locinput:
6450             assert(!NEXTCHR_IS_EOS);
6451             if (utf8_target) {
6452                 locinput += PL_utf8skip[nextchr];
6453                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6454                 if (locinput > PL_regeol)
6455                     sayNO;
6456             }
6457             else
6458                 locinput++;
6459             break;
6460
6461         } /* end switch */
6462
6463         /* switch break jumps here */
6464         scan = next; /* prepare to execute the next op and ... */
6465         continue;    /* ... jump back to the top, reusing st */
6466         assert(0); /* NOTREACHED */
6467
6468       push_yes_state:
6469         /* push a state that backtracks on success */
6470         st->u.yes.prev_yes_state = yes_state;
6471         yes_state = st;
6472         /* FALL THROUGH */
6473       push_state:
6474         /* push a new regex state, then continue at scan  */
6475         {
6476             regmatch_state *newst;
6477
6478             DEBUG_STACK_r({
6479                 regmatch_state *cur = st;
6480                 regmatch_state *curyes = yes_state;
6481                 int curd = depth;
6482                 regmatch_slab *slab = PL_regmatch_slab;
6483                 for (;curd > -1;cur--,curd--) {
6484                     if (cur < SLAB_FIRST(slab)) {
6485                         slab = slab->prev;
6486                         cur = SLAB_LAST(slab);
6487                     }
6488                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6489                         REPORT_CODE_OFF + 2 + depth * 2,"",
6490                         curd, PL_reg_name[cur->resume_state],
6491                         (curyes == cur) ? "yes" : ""
6492                     );
6493                     if (curyes == cur)
6494                         curyes = cur->u.yes.prev_yes_state;
6495                 }
6496             } else
6497                 DEBUG_STATE_pp("push")
6498             );
6499             depth++;
6500             st->locinput = locinput;
6501             newst = st+1;
6502             if (newst >  SLAB_LAST(PL_regmatch_slab))
6503                 newst = S_push_slab(aTHX);
6504             PL_regmatch_state = newst;
6505
6506             locinput = pushinput;
6507             st = newst;
6508             continue;
6509             assert(0); /* NOTREACHED */
6510         }
6511     }
6512
6513     /*
6514     * We get here only if there's trouble -- normally "case END" is
6515     * the terminating point.
6516     */
6517     Perl_croak(aTHX_ "corrupted regexp pointers");
6518     /*NOTREACHED*/
6519     sayNO;
6520
6521 yes:
6522     if (yes_state) {
6523         /* we have successfully completed a subexpression, but we must now
6524          * pop to the state marked by yes_state and continue from there */
6525         assert(st != yes_state);
6526 #ifdef DEBUGGING
6527         while (st != yes_state) {
6528             st--;
6529             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6530                 PL_regmatch_slab = PL_regmatch_slab->prev;
6531                 st = SLAB_LAST(PL_regmatch_slab);
6532             }
6533             DEBUG_STATE_r({
6534                 if (no_final) {
6535                     DEBUG_STATE_pp("pop (no final)");
6536                 } else {
6537                     DEBUG_STATE_pp("pop (yes)");
6538                 }
6539             });
6540             depth--;
6541         }
6542 #else
6543         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6544             || yes_state > SLAB_LAST(PL_regmatch_slab))
6545         {
6546             /* not in this slab, pop slab */
6547             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6548             PL_regmatch_slab = PL_regmatch_slab->prev;
6549             st = SLAB_LAST(PL_regmatch_slab);
6550         }
6551         depth -= (st - yes_state);
6552 #endif
6553         st = yes_state;
6554         yes_state = st->u.yes.prev_yes_state;
6555         PL_regmatch_state = st;
6556
6557         if (no_final)
6558             locinput= st->locinput;
6559         state_num = st->resume_state + no_final;
6560         goto reenter_switch;
6561     }
6562
6563     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6564                           PL_colors[4], PL_colors[5]));
6565
6566     if (PL_reg_state.re_state_eval_setup_done) {
6567         /* each successfully executed (?{...}) block does the equivalent of
6568          *   local $^R = do {...}
6569          * When popping the save stack, all these locals would be undone;
6570          * bypass this by setting the outermost saved $^R to the latest
6571          * value */
6572         if (oreplsv != GvSV(PL_replgv))
6573             sv_setsv(oreplsv, GvSV(PL_replgv));
6574     }
6575     result = 1;
6576     goto final_exit;
6577
6578 no:
6579     DEBUG_EXECUTE_r(
6580         PerlIO_printf(Perl_debug_log,
6581             "%*s  %sfailed...%s\n",
6582             REPORT_CODE_OFF+depth*2, "",
6583             PL_colors[4], PL_colors[5])
6584         );
6585
6586 no_silent:
6587     if (no_final) {
6588         if (yes_state) {
6589             goto yes;
6590         } else {
6591             goto final_exit;
6592         }
6593     }
6594     if (depth) {
6595         /* there's a previous state to backtrack to */
6596         st--;
6597         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6598             PL_regmatch_slab = PL_regmatch_slab->prev;
6599             st = SLAB_LAST(PL_regmatch_slab);
6600         }
6601         PL_regmatch_state = st;
6602         locinput= st->locinput;
6603
6604         DEBUG_STATE_pp("pop");
6605         depth--;
6606         if (yes_state == st)
6607             yes_state = st->u.yes.prev_yes_state;
6608
6609         state_num = st->resume_state + 1; /* failure = success + 1 */
6610         goto reenter_switch;
6611     }
6612     result = 0;
6613
6614   final_exit:
6615     if (rex->intflags & PREGf_VERBARG_SEEN) {
6616         SV *sv_err = get_sv("REGERROR", 1);
6617         SV *sv_mrk = get_sv("REGMARK", 1);
6618         if (result) {
6619             sv_commit = &PL_sv_no;
6620             if (!sv_yes_mark)
6621                 sv_yes_mark = &PL_sv_yes;
6622         } else {
6623             if (!sv_commit)
6624                 sv_commit = &PL_sv_yes;
6625             sv_yes_mark = &PL_sv_no;
6626         }
6627         sv_setsv(sv_err, sv_commit);
6628         sv_setsv(sv_mrk, sv_yes_mark);
6629     }
6630
6631
6632     if (last_pushed_cv) {
6633         dSP;
6634         POP_MULTICALL;
6635         PERL_UNUSED_VAR(SP);
6636     }
6637
6638     /* clean up; in particular, free all slabs above current one */
6639     LEAVE_SCOPE(oldsave);
6640
6641     assert(!result ||  locinput - PL_bostr >= 0);
6642     return result ?  locinput - PL_bostr : -1;
6643 }
6644
6645 /*
6646  - regrepeat - repeatedly match something simple, report how many
6647  *
6648  * What 'simple' means is a node which can be the operand of a quantifier like
6649  * '+', or {1,3}
6650  *
6651  * startposp - pointer to a pointer to the start position.  This is updated
6652  *             to point to the byte following the highest successful
6653  *             match.
6654  * p         - the regnode to be repeatedly matched against.
6655  * max       - maximum number of things to match.
6656  * depth     - (for debugging) backtracking depth.
6657  */
6658 STATIC I32
6659 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6660 {
6661     dVAR;
6662     char *scan;     /* Pointer to current position in target string */
6663     I32 c;
6664     char *loceol = PL_regeol;   /* local version */
6665     I32 hardcount = 0;  /* How many matches so far */
6666     bool utf8_target = PL_reg_match_utf8;
6667     UV utf8_flags;
6668 #ifndef DEBUGGING
6669     PERL_UNUSED_ARG(depth);
6670 #endif
6671
6672     PERL_ARGS_ASSERT_REGREPEAT;
6673
6674     scan = *startposp;
6675     if (max == REG_INFTY)
6676         max = I32_MAX;
6677     else if (! utf8_target && scan + max < loceol)
6678         loceol = scan + max;
6679
6680     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6681      * to the maximum of how far we should go in it (leaving it set to the real
6682      * end, if the maximum permissible would take us beyond that).  This allows
6683      * us to make the loop exit condition that we haven't gone past <loceol> to
6684      * also mean that we haven't exceeded the max permissible count, saving a
6685      * test each time through the loop.  But it assumes that the OP matches a
6686      * single byte, which is true for most of the OPs below when applied to a
6687      * non-UTF-8 target.  Those relatively few OPs that don't have this
6688      * characteristic will have to compensate.
6689      *
6690      * There is no adjustment for UTF-8 targets, as the number of bytes per
6691      * character varies.  OPs will have to test both that the count is less
6692      * than the max permissible (using <hardcount> to keep track), and that we
6693      * are still within the bounds of the string (using <loceol>).  A few OPs
6694      * match a single byte no matter what the encoding.  They can omit the max
6695      * test if, for the UTF-8 case, they do the adjustment that was skipped
6696      * above.
6697      *
6698      * Thus, the code above sets things up for the common case; and exceptional
6699      * cases need extra work; the common case is to make sure <scan> doesn't
6700      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6701      * count doesn't exceed the maximum permissible */
6702
6703     switch (OP(p)) {
6704     case REG_ANY:
6705         if (utf8_target) {
6706             while (scan < loceol && hardcount < max && *scan != '\n') {
6707                 scan += UTF8SKIP(scan);
6708                 hardcount++;
6709             }
6710         } else {
6711             while (scan < loceol && *scan != '\n')
6712                 scan++;
6713         }
6714         break;
6715     case SANY:
6716         if (utf8_target) {
6717             while (scan < loceol && hardcount < max) {
6718                 scan += UTF8SKIP(scan);
6719                 hardcount++;
6720             }
6721         }
6722         else
6723             scan = loceol;
6724         break;
6725     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6726         if (utf8_target && scan + max < loceol) {
6727
6728             /* <loceol> hadn't been adjusted in the UTF-8 case */
6729             scan +=  max;
6730         }
6731         else {
6732             scan = loceol;
6733         }
6734         break;
6735     case EXACT:
6736         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6737
6738         c = (U8)*STRING(p);
6739
6740         /* Can use a simple loop if the pattern char to match on is invariant
6741          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6742          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6743          * true iff it doesn't matter if the argument is in UTF-8 or not */
6744         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6745             if (utf8_target && scan + max < loceol) {
6746                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6747                  * since here, to match at all, 1 char == 1 byte */
6748                 loceol = scan + max;
6749             }
6750             while (scan < loceol && UCHARAT(scan) == c) {
6751                 scan++;
6752             }
6753         }
6754         else if (UTF_PATTERN) {
6755             if (utf8_target) {
6756                 STRLEN scan_char_len;
6757
6758                 /* When both target and pattern are UTF-8, we have to do
6759                  * string EQ */
6760                 while (hardcount < max
6761                        && scan < loceol
6762                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6763                        && memEQ(scan, STRING(p), scan_char_len))
6764                 {
6765                     scan += scan_char_len;
6766                     hardcount++;
6767                 }
6768             }
6769             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6770
6771                 /* Target isn't utf8; convert the character in the UTF-8
6772                  * pattern to non-UTF8, and do a simple loop */
6773                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6774                 while (scan < loceol && UCHARAT(scan) == c) {
6775                     scan++;
6776                 }
6777             } /* else pattern char is above Latin1, can't possibly match the
6778                  non-UTF-8 target */
6779         }
6780         else {
6781
6782             /* Here, the string must be utf8; pattern isn't, and <c> is
6783              * different in utf8 than not, so can't compare them directly.
6784              * Outside the loop, find the two utf8 bytes that represent c, and
6785              * then look for those in sequence in the utf8 string */
6786             U8 high = UTF8_TWO_BYTE_HI(c);
6787             U8 low = UTF8_TWO_BYTE_LO(c);
6788
6789             while (hardcount < max
6790                     && scan + 1 < loceol
6791                     && UCHARAT(scan) == high
6792                     && UCHARAT(scan + 1) == low)
6793             {
6794                 scan += 2;
6795                 hardcount++;
6796             }
6797         }
6798         break;
6799
6800     case EXACTFA:
6801         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6802         goto do_exactf;
6803
6804     case EXACTFL:
6805         PL_reg_flags |= RF_tainted;
6806         utf8_flags = FOLDEQ_UTF8_LOCALE;
6807         goto do_exactf;
6808
6809     case EXACTF:
6810             utf8_flags = 0;
6811             goto do_exactf;
6812
6813     case EXACTFU_SS:
6814     case EXACTFU_TRICKYFOLD:
6815     case EXACTFU:
6816         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6817
6818     do_exactf: {
6819         int c1, c2;
6820         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6821
6822         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6823
6824         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6825             if (c1 == CHRTEST_VOID) {
6826                 /* Use full Unicode fold matching */
6827                 char *tmpeol = PL_regeol;
6828                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6829                 while (hardcount < max
6830                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6831                                              STRING(p), NULL, pat_len,
6832                                              cBOOL(UTF_PATTERN), utf8_flags))
6833                 {
6834                     scan = tmpeol;
6835                     tmpeol = PL_regeol;
6836                     hardcount++;
6837                 }
6838             }
6839             else if (utf8_target) {
6840                 if (c1 == c2) {
6841                     while (scan < loceol
6842                            && hardcount < max
6843                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6844                     {
6845                         scan += UTF8SKIP(scan);
6846                         hardcount++;
6847                     }
6848                 }
6849                 else {
6850                     while (scan < loceol
6851                            && hardcount < max
6852                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6853                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6854                     {
6855                         scan += UTF8SKIP(scan);
6856                         hardcount++;
6857                     }
6858                 }
6859             }
6860             else if (c1 == c2) {
6861                 while (scan < loceol && UCHARAT(scan) == c1) {
6862                     scan++;
6863                 }
6864             }
6865             else {
6866                 while (scan < loceol &&
6867                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6868                 {
6869                     scan++;
6870                 }
6871             }
6872         }
6873         break;
6874     }
6875     case ANYOF:
6876         if (utf8_target) {
6877             while (hardcount < max
6878                    && scan < loceol
6879                    && reginclass(prog, p, (U8*)scan, utf8_target))
6880             {
6881                 scan += UTF8SKIP(scan);
6882                 hardcount++;
6883             }
6884         } else {
6885             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6886                 scan++;
6887         }
6888         break;
6889     case ALNUMU:
6890         if (utf8_target) {
6891     utf8_wordchar:
6892             LOAD_UTF8_CHARCLASS_ALNUM();
6893             while (hardcount < max && scan < loceol &&
6894                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6895             {
6896                 scan += UTF8SKIP(scan);
6897                 hardcount++;
6898             }
6899         } else {
6900             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6901                 scan++;
6902             }
6903         }
6904         break;
6905     case ALNUM:
6906         if (utf8_target)
6907             goto utf8_wordchar;
6908         while (scan < loceol && isALNUM((U8) *scan)) {
6909             scan++;
6910         }
6911         break;
6912     case ALNUMA:
6913         if (utf8_target && scan + max < loceol) {
6914
6915             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6916              * since here, to match, 1 char == 1 byte */
6917             loceol = scan + max;
6918         }
6919         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6920             scan++;
6921         }
6922         break;
6923     case ALNUML:
6924         PL_reg_flags |= RF_tainted;
6925         if (utf8_target) {
6926             while (hardcount < max && scan < loceol &&
6927                    isALNUM_LC_utf8((U8*)scan)) {
6928                 scan += UTF8SKIP(scan);
6929                 hardcount++;
6930             }
6931         } else {
6932             while (scan < loceol && isALNUM_LC(*scan))
6933                 scan++;
6934         }
6935         break;
6936     case NALNUMU:
6937         if (utf8_target) {
6938
6939     utf8_Nwordchar:
6940
6941             LOAD_UTF8_CHARCLASS_ALNUM();
6942             while (hardcount < max && scan < loceol &&
6943                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6944             {
6945                 scan += UTF8SKIP(scan);
6946                 hardcount++;
6947             }
6948         } else {
6949             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6950                 scan++;
6951             }
6952         }
6953         break;
6954     case NALNUM:
6955         if (utf8_target)
6956             goto utf8_Nwordchar;
6957         while (scan < loceol && ! isALNUM((U8) *scan)) {
6958             scan++;
6959         }
6960         break;
6961
6962     case POSIXA:
6963         if (utf8_target && scan + max < loceol) {
6964
6965             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6966              * since here, to match, 1 char == 1 byte */
6967             loceol = scan + max;
6968         }
6969         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6970             scan++;
6971         }
6972         break;
6973     case NPOSIXA:
6974         if (utf8_target) {
6975             while (scan < loceol && hardcount < max
6976                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6977             {
6978                 scan += UTF8SKIP(scan);
6979                 hardcount++;
6980             }
6981         }
6982         else {
6983             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6984                 scan++;
6985             }
6986         }
6987         break;
6988     case NALNUMA:
6989         if (utf8_target) {
6990             while (scan < loceol && hardcount < max
6991                    && ! isWORDCHAR_A((U8) *scan))
6992             {
6993                 scan += UTF8SKIP(scan);
6994                 hardcount++;
6995             }
6996         }
6997         else {
6998             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6999                 scan++;
7000             }
7001         }
7002         break;
7003     case NALNUML:
7004         PL_reg_flags |= RF_tainted;
7005         if (utf8_target) {
7006             while (hardcount < max && scan < loceol &&
7007                    !isALNUM_LC_utf8((U8*)scan)) {
7008                 scan += UTF8SKIP(scan);
7009                 hardcount++;
7010             }
7011         } else {
7012             while (scan < loceol && !isALNUM_LC(*scan))
7013                 scan++;
7014         }
7015         break;
7016     case SPACEU:
7017         if (utf8_target) {
7018
7019     utf8_space:
7020
7021             while (hardcount < max && scan < loceol
7022                    && is_XPERLSPACE_utf8((U8*)scan))
7023             {
7024                 scan += UTF8SKIP(scan);
7025                 hardcount++;
7026             }
7027             break;
7028         }
7029         else {
7030             while (scan < loceol && isSPACE_L1((U8) *scan)) {
7031                 scan++;
7032             }
7033             break;
7034         }
7035     case SPACE:
7036         if (utf8_target)
7037             goto utf8_space;
7038
7039         while (scan < loceol && isSPACE((U8) *scan)) {
7040             scan++;
7041         }
7042         break;
7043     case SPACEA:
7044         if (utf8_target && scan + max < loceol) {
7045
7046             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7047              * since here, to match, 1 char == 1 byte */
7048             loceol = scan + max;
7049         }
7050         while (scan < loceol && isSPACE_A((U8) *scan)) {
7051             scan++;
7052         }
7053         break;
7054     case SPACEL:
7055         PL_reg_flags |= RF_tainted;
7056         if (utf8_target) {
7057             while (hardcount < max && scan < loceol &&
7058                    isSPACE_LC_utf8((U8*)scan)) {
7059                 scan += UTF8SKIP(scan);
7060                 hardcount++;
7061             }
7062         } else {
7063             while (scan < loceol && isSPACE_LC(*scan))
7064                 scan++;
7065         }
7066         break;
7067     case NSPACEU:
7068         if (utf8_target) {
7069
7070     utf8_Nspace:
7071
7072             while (hardcount < max && scan < loceol
7073                    && ! is_XPERLSPACE_utf8((U8*)scan))
7074             {
7075                 scan += UTF8SKIP(scan);
7076                 hardcount++;
7077             }
7078             break;
7079         }
7080         else {
7081             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7082                 scan++;
7083             }
7084         }
7085         break;
7086     case NSPACE:
7087         if (utf8_target)
7088             goto utf8_Nspace;
7089
7090         while (scan < loceol && ! isSPACE((U8) *scan)) {
7091             scan++;
7092         }
7093         break;
7094     case NSPACEA:
7095         if (utf8_target) {
7096             while (hardcount < max && scan < loceol
7097                    && ! isSPACE_A((U8) *scan))
7098             {
7099                 scan += UTF8SKIP(scan);
7100                 hardcount++;
7101             }
7102         }
7103         else {
7104             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7105                 scan++;
7106             }
7107         }
7108         break;
7109     case NSPACEL:
7110         PL_reg_flags |= RF_tainted;
7111         if (utf8_target) {
7112             while (hardcount < max && scan < loceol &&
7113                    !isSPACE_LC_utf8((U8*)scan)) {
7114                 scan += UTF8SKIP(scan);
7115                 hardcount++;
7116             }
7117         } else {
7118             while (scan < loceol && !isSPACE_LC(*scan))
7119                 scan++;
7120         }
7121         break;
7122     case DIGIT:
7123         if (utf8_target) {
7124             LOAD_UTF8_CHARCLASS_DIGIT();
7125             while (hardcount < max && scan < loceol &&
7126                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7127                 scan += UTF8SKIP(scan);
7128                 hardcount++;
7129             }
7130         } else {
7131             while (scan < loceol && isDIGIT(*scan))
7132                 scan++;
7133         }
7134         break;
7135     case DIGITA:
7136         if (utf8_target && scan + max < loceol) {
7137
7138             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7139              * since here, to match, 1 char == 1 byte */
7140             loceol = scan + max;
7141         }
7142         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7143             scan++;
7144         }
7145         break;
7146     case DIGITL:
7147         PL_reg_flags |= RF_tainted;
7148         if (utf8_target) {
7149             while (hardcount < max && scan < loceol &&
7150                    isDIGIT_LC_utf8((U8*)scan)) {
7151                 scan += UTF8SKIP(scan);
7152                 hardcount++;
7153             }
7154         } else {
7155             while (scan < loceol && isDIGIT_LC(*scan))
7156                 scan++;
7157         }
7158         break;
7159     case NDIGIT:
7160         if (utf8_target) {
7161             LOAD_UTF8_CHARCLASS_DIGIT();
7162             while (hardcount < max && scan < loceol &&
7163                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7164                 scan += UTF8SKIP(scan);
7165                 hardcount++;
7166             }
7167         } else {
7168             while (scan < loceol && !isDIGIT(*scan))
7169                 scan++;
7170         }
7171         break;
7172     case NDIGITA:
7173         if (utf8_target) {
7174             while (hardcount < max && scan < loceol
7175                    && ! isDIGIT_A((U8) *scan)) {
7176                 scan += UTF8SKIP(scan);
7177                 hardcount++;
7178             }
7179         }
7180         else {
7181             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7182                 scan++;
7183             }
7184         }
7185         break;
7186     case NDIGITL:
7187         PL_reg_flags |= RF_tainted;
7188         if (utf8_target) {
7189             while (hardcount < max && scan < loceol &&
7190                    !isDIGIT_LC_utf8((U8*)scan)) {
7191                 scan += UTF8SKIP(scan);
7192                 hardcount++;
7193             }
7194         } else {
7195             while (scan < loceol && !isDIGIT_LC(*scan))
7196                 scan++;
7197         }
7198         break;
7199     case LNBREAK:
7200         if (utf8_target) {
7201             while (hardcount < max && scan < loceol &&
7202                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7203                 scan += c;
7204                 hardcount++;
7205             }
7206         } else {
7207             /* LNBREAK can match one or two latin chars, which is ok, but we
7208              * have to use hardcount in this situation, and throw away the
7209              * adjustment to <loceol> done before the switch statement */
7210             loceol = PL_regeol;
7211             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7212                 scan+=c;
7213                 hardcount++;
7214             }
7215         }
7216         break;
7217     case HORIZWS:
7218         if (utf8_target) {
7219             while (hardcount < max && scan < loceol &&
7220                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7221             {
7222                 scan += c;
7223                 hardcount++;
7224             }
7225         } else {
7226             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7227                 scan++;
7228         }
7229         break;
7230     case NHORIZWS:
7231         if (utf8_target) {
7232             while (hardcount < max && scan < loceol &&
7233                         !is_HORIZWS_utf8_safe(scan, loceol))
7234             {
7235                 scan += UTF8SKIP(scan);
7236                 hardcount++;
7237             }
7238         } else {
7239             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7240                 scan++;
7241
7242         }
7243         break;
7244     case VERTWS:
7245         if (utf8_target) {
7246             while (hardcount < max && scan < loceol &&
7247                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7248             {
7249                 scan += c;
7250                 hardcount++;
7251             }
7252         } else {
7253             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7254                 scan++;
7255
7256         }
7257         break;
7258     case NVERTWS:
7259         if (utf8_target) {
7260             while (hardcount < max && scan < loceol &&
7261                                 !is_VERTWS_utf8_safe(scan, loceol))
7262             {
7263                 scan += UTF8SKIP(scan);
7264                 hardcount++;
7265             }
7266         } else {
7267             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7268                 scan++;
7269
7270         }
7271         break;
7272
7273     case BOUND:
7274     case BOUNDA:
7275     case BOUNDL:
7276     case BOUNDU:
7277     case EOS:
7278     case GPOS:
7279     case KEEPS:
7280     case NBOUND:
7281     case NBOUNDA:
7282     case NBOUNDL:
7283     case NBOUNDU:
7284     case OPFAIL:
7285     case SBOL:
7286     case SEOL:
7287         /* These are all 0 width, so match right here or not at all. */
7288         break;
7289
7290     default:
7291         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7292         assert(0); /* NOTREACHED */
7293
7294     }
7295
7296     if (hardcount)
7297         c = hardcount;
7298     else
7299         c = scan - *startposp;
7300     *startposp = scan;
7301
7302     DEBUG_r({
7303         GET_RE_DEBUG_FLAGS_DECL;
7304         DEBUG_EXECUTE_r({
7305             SV * const prop = sv_newmortal();
7306             regprop(prog, prop, p);
7307             PerlIO_printf(Perl_debug_log,
7308                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7309                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7310         });
7311     });
7312
7313     return(c);
7314 }
7315
7316
7317 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7318 /*
7319 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7320 create a copy so that changes the caller makes won't change the shared one.
7321 If <altsvp> is non-null, will return NULL in it, for back-compat.
7322  */
7323 SV *
7324 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7325 {
7326     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7327
7328     if (altsvp) {
7329         *altsvp = NULL;
7330     }
7331
7332     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7333 }
7334 #endif
7335
7336 STATIC SV *
7337 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7338 {
7339     /* Returns the swash for the input 'node' in the regex 'prog'.
7340      * If <doinit> is true, will attempt to create the swash if not already
7341      *    done.
7342      * If <listsvp> is non-null, will return the swash initialization string in
7343      *    it.
7344      * Tied intimately to how regcomp.c sets up the data structure */
7345
7346     dVAR;
7347     SV *sw  = NULL;
7348     SV *si  = NULL;
7349     SV*  invlist = NULL;
7350
7351     RXi_GET_DECL(prog,progi);
7352     const struct reg_data * const data = prog ? progi->data : NULL;
7353
7354     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7355
7356     assert(ANYOF_NONBITMAP(node));
7357
7358     if (data && data->count) {
7359         const U32 n = ARG(node);
7360
7361         if (data->what[n] == 's') {
7362             SV * const rv = MUTABLE_SV(data->data[n]);
7363             AV * const av = MUTABLE_AV(SvRV(rv));
7364             SV **const ary = AvARRAY(av);
7365             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7366
7367             si = *ary;  /* ary[0] = the string to initialize the swash with */
7368
7369             /* Elements 2 and 3 are either both present or both absent. [2] is
7370              * any inversion list generated at compile time; [3] indicates if
7371              * that inversion list has any user-defined properties in it. */
7372             if (av_len(av) >= 2) {
7373                 invlist = ary[2];
7374                 if (SvUV(ary[3])) {
7375                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7376                 }
7377             }
7378             else {
7379                 invlist = NULL;
7380             }
7381
7382             /* Element [1] is reserved for the set-up swash.  If already there,
7383              * return it; if not, create it and store it there */
7384             if (SvROK(ary[1])) {
7385                 sw = ary[1];
7386             }
7387             else if (si && doinit) {
7388
7389                 sw = _core_swash_init("utf8", /* the utf8 package */
7390                                       "", /* nameless */
7391                                       si,
7392                                       1, /* binary */
7393                                       0, /* not from tr/// */
7394                                       invlist,
7395                                       &swash_init_flags);
7396                 (void)av_store(av, 1, sw);
7397             }
7398         }
7399     }
7400
7401     if (listsvp) {
7402         SV* matches_string = newSVpvn("", 0);
7403
7404         /* Use the swash, if any, which has to have incorporated into it all
7405          * possibilities */
7406         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7407             && (si && si != &PL_sv_undef))
7408         {
7409
7410             /* If no swash, use the input initialization string, if available */
7411             sv_catsv(matches_string, si);
7412         }
7413
7414         /* Add the inversion list to whatever we have.  This may have come from
7415          * the swash, or from an input parameter */
7416         if (invlist) {
7417             sv_catsv(matches_string, _invlist_contents(invlist));
7418         }
7419         *listsvp = matches_string;
7420     }
7421
7422     return sw;
7423 }
7424
7425 /*
7426  - reginclass - determine if a character falls into a character class
7427
7428   n is the ANYOF regnode
7429   p is the target string
7430   utf8_target tells whether p is in UTF-8.
7431
7432   Returns true if matched; false otherwise.
7433
  Note that this can be a synthetic start class, a combination of various
  nodes, so things you think might be mutually exclusive, such as locale,
  aren't.  It can match both locale and non-locale.
7437
7438  */
7439
7440 STATIC bool
7441 S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7442 {
7443     dVAR;
7444     const char flags = ANYOF_FLAGS(n);
7445     bool match = FALSE;
7446     UV c = *p;
7447
7448     PERL_ARGS_ASSERT_REGINCLASS;
7449
7450     /* If c is not already the code point, get it.  Note that
7451      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7452     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7453         STRLEN c_len = 0;
7454         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7455                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7456                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7457                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7458                  * UTF8_ALLOW_FFFF */
7459         if (c_len == (STRLEN)-1)
7460             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7461     }
7462
7463     /* If this character is potentially in the bitmap, check it */
7464     if (c < 256) {
7465         if (ANYOF_BITMAP_TEST(n, c))
7466             match = TRUE;
7467         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7468                 && ! utf8_target
7469                 && ! isASCII(c))
7470         {
7471             match = TRUE;
7472         }
7473         else if (flags & ANYOF_LOCALE) {
7474             PL_reg_flags |= RF_tainted;
7475
7476             if ((flags & ANYOF_LOC_FOLD)
7477                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7478             {
7479                 match = TRUE;
7480             }
7481             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7482
7483                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7484                  * if the class includes the Posix character class given by
7485                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7486                  * complemented Posix class given by int(bit/2).  So we loop
7487                  * through the bits, each time changing whether we complement
7488                  * the result or not.  Suppose for the sake of illustration
7489                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7490                  * is set, it means there is a match for this ANYOF node if the
7491                  * character is in the class given by the expression (0 / 2 = 0
7492                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7493                  * and since 'to_complement' is 0, the result will stay TRUE,
7494                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7495                  * bit 1 is 1.  That means there is a match if the character
7496                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7497                  * but will on bit 1.  On the second iteration 'to_complement'
7498                  * will be 1, so the exclusive or will reverse things, so we
7499                  * are testing for \W.  On the third iteration, 'to_complement'
7500                  * will be 0, and we would be testing for \s; the fourth
7501                  * iteration would test for \S, etc. */
7502
7503                 int count = 0;
7504                 int to_complement = 0;
7505                 while (count < ANYOF_MAX) {
7506                     if (ANYOF_CLASS_TEST(n, count)
7507                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7508                     {
7509                         match = TRUE;
7510                         break;
7511                     }
7512                     count++;
7513                     to_complement ^= 1;
7514                 }
7515             }
7516         }
7517     }
7518
7519     /* If the bitmap didn't (or couldn't) match, and something outside the
7520      * bitmap could match, try that.  Locale nodes specify completely the
7521      * behavior of code points in the bit map (otherwise, a utf8 target would
7522      * cause them to be treated as Unicode and not locale), except in
7523      * the very unlikely event when this node is a synthetic start class, which
7524      * could be a combination of locale and non-locale nodes.  So allow locale
7525      * to match for the synthetic start class, which will give a false
7526      * positive that will be resolved when the match is done again as not part
7527      * of the synthetic start class */
7528     if (!match) {
7529         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7530             match = TRUE;       /* Everything above 255 matches */
7531         }
7532         else if (ANYOF_NONBITMAP(n)
7533                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7534                      || (utf8_target
7535                          && (c >=256
7536                              || (! (flags & ANYOF_LOCALE))
7537                              || (flags & ANYOF_IS_SYNTHETIC)))))
7538         {
7539             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7540             if (sw) {
7541                 U8 * utf8_p;
7542                 if (utf8_target) {
7543                     utf8_p = (U8 *) p;
7544                 } else { /* Convert to utf8 */
7545                     STRLEN len = 1;
7546                     utf8_p = bytes_to_utf8(p, &len);
7547                 }
7548
7549                 if (swash_fetch(sw, utf8_p, TRUE)) {
7550                     match = TRUE;
7551                 }
7552
7553                 /* If we allocated a string above, free it */
7554                 if (! utf8_target) Safefree(utf8_p);
7555             }
7556         }
7557
7558         if (UNICODE_IS_SUPER(c)
7559             && (flags & ANYOF_WARN_SUPER)
7560             && ckWARN_d(WARN_NON_UNICODE))
7561         {
7562             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7563                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7564         }
7565     }
7566
7567     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7568     return cBOOL(flags & ANYOF_INVERT) ^ match;
7569 }
7570
7571 STATIC U8 *
7572 S_reghop3(U8 *s, I32 off, const U8* lim)
7573 {
7574     /* return the position 'off' UTF-8 characters away from 's', forward if
7575      * 'off' >= 0, backwards if negative.  But don't go outside of position
7576      * 'lim', which better be < s  if off < 0 */
7577
7578     dVAR;
7579
7580     PERL_ARGS_ASSERT_REGHOP3;
7581
7582     if (off >= 0) {
7583         while (off-- && s < lim) {
7584             /* XXX could check well-formedness here */
7585             s += UTF8SKIP(s);
7586         }
7587     }
7588     else {
7589         while (off++ && s > lim) {
7590             s--;
7591             if (UTF8_IS_CONTINUED(*s)) {
7592                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7593                     s--;
7594             }
7595             /* XXX could check well-formedness here */
7596         }
7597     }
7598     return s;
7599 }
7600
7601 #ifdef XXX_dmq
/* There are a bunch of places where we use two reghop3's that should
   be replaced with this routine.  But since that's not done yet,
   we ifdef it out - dmq
*/
7606 STATIC U8 *
7607 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7608 {
7609     dVAR;
7610
7611     PERL_ARGS_ASSERT_REGHOP4;
7612
7613     if (off >= 0) {
7614         while (off-- && s < rlim) {
7615             /* XXX could check well-formedness here */
7616             s += UTF8SKIP(s);
7617         }
7618     }
7619     else {
7620         while (off++ && s > llim) {
7621             s--;
7622             if (UTF8_IS_CONTINUED(*s)) {
7623                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7624                     s--;
7625             }
7626             /* XXX could check well-formedness here */
7627         }
7628     }
7629     return s;
7630 }
7631 #endif
7632
7633 STATIC U8 *
7634 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7635 {
7636     dVAR;
7637
7638     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7639
7640     if (off >= 0) {
7641         while (off-- && s < lim) {
7642             /* XXX could check well-formedness here */
7643             s += UTF8SKIP(s);
7644         }
7645         if (off >= 0)
7646             return NULL;
7647     }
7648     else {
7649         while (off++ && s > lim) {
7650             s--;
7651             if (UTF8_IS_CONTINUED(*s)) {
7652                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7653                     s--;
7654             }
7655             /* XXX could check well-formedness here */
7656         }
7657         if (off <= 0)
7658             return NULL;
7659     }
7660     return s;
7661 }
7662
7663 static void
7664 restore_pos(pTHX_ void *arg)
7665 {
7666     dVAR;
7667     regexp * const rex = (regexp *)arg;
7668     if (PL_reg_state.re_state_eval_setup_done) {
7669         if (PL_reg_oldsaved) {
7670             rex->subbeg = PL_reg_oldsaved;
7671             rex->sublen = PL_reg_oldsavedlen;
7672             rex->suboffset = PL_reg_oldsavedoffset;
7673             rex->subcoffset = PL_reg_oldsavedcoffset;
7674 #ifdef PERL_ANY_COW
7675             rex->saved_copy = PL_nrs;
7676 #endif
7677             RXp_MATCH_COPIED_on(rex);
7678         }
7679         PL_reg_magic->mg_len = PL_reg_oldpos;
7680         PL_reg_state.re_state_eval_setup_done = FALSE;
7681         PL_curpm = PL_reg_oldcurpm;
7682     }
7683 }
7684
7685 STATIC void
7686 S_to_utf8_substr(pTHX_ regexp *prog)
7687 {
7688     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7689      * on the converted value */
7690
7691     int i = 1;
7692
7693     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7694
7695     do {
7696         if (prog->substrs->data[i].substr
7697             && !prog->substrs->data[i].utf8_substr) {
7698             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7699             prog->substrs->data[i].utf8_substr = sv;
7700             sv_utf8_upgrade(sv);
7701             if (SvVALID(prog->substrs->data[i].substr)) {
7702                 if (SvTAIL(prog->substrs->data[i].substr)) {
7703                     /* Trim the trailing \n that fbm_compile added last
7704                        time.  */
7705                     SvCUR_set(sv, SvCUR(sv) - 1);
7706                     /* Whilst this makes the SV technically "invalid" (as its
7707                        buffer is no longer followed by "\0") when fbm_compile()
7708                        adds the "\n" back, a "\0" is restored.  */
7709                     fbm_compile(sv, FBMcf_TAIL);
7710                 } else
7711                     fbm_compile(sv, 0);
7712             }
7713             if (prog->substrs->data[i].substr == prog->check_substr)
7714                 prog->check_utf8 = sv;
7715         }
7716     } while (i--);
7717 }
7718
7719 STATIC bool
7720 S_to_byte_substr(pTHX_ regexp *prog)
7721 {
7722     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7723      * on the converted value; returns FALSE if can't be converted. */
7724
7725     dVAR;
7726     int i = 1;
7727
7728     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7729
7730     do {
7731         if (prog->substrs->data[i].utf8_substr
7732             && !prog->substrs->data[i].substr) {
7733             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7734             if (! sv_utf8_downgrade(sv, TRUE)) {
7735                 return FALSE;
7736             }
7737             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7738                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7739                     /* Trim the trailing \n that fbm_compile added last
7740                         time.  */
7741                     SvCUR_set(sv, SvCUR(sv) - 1);
7742                     fbm_compile(sv, FBMcf_TAIL);
7743                 } else
7744                     fbm_compile(sv, 0);
7745             }
7746             prog->substrs->data[i].substr = sv;
7747             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7748                 prog->check_substr = sv;
7749         }
7750     } while (i--);
7751
7752     return TRUE;
7753 }
7754
7755 /*
7756  * Local variables:
7757  * c-indentation-style: bsd
7758  * c-basic-offset: 4
7759  * indent-tabs-mode: nil
7760  * End:
7761  *
7762  * ex: set ts=8 sts=4 sw=4 et:
7763  */