regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #ifndef STATIC
  91 #define STATIC  static
  92 #endif
  93
  94 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  95  * call if there are no complications: i.e., if everything matchable is
  96  * straight forward in the bitmap */
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
  98                                               : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
 104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 106
 107 #define HOPc(pos,off) \
 108         (char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)(pos + off))
 111 #define HOPBACKc(pos, off) \
 112         (char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 114             : (pos - off >= PL_bostr)           \
 115                 ? (U8*)pos - off                \
 116                 : NULL)
 117
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* these are unrolled below in the CCC_TRY_XXX defined */
 122 #ifdef EBCDIC
 123     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 124      * skip the check on EBCDIC platforms */
 125 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 126 #else
 127 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 128     if (!CAT2(PL_utf8_,class)) { \
 129         bool ok; \
 130         ENTER; save_re_context(); \
 131         ok=CAT2(is_utf8_,class)((const U8*)str); \
 132         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 133 #endif
 134
 135 /* Doesn't do an assert to verify that is correct */
 136 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 137     if (!CAT2(PL_utf8_,class)) { \
 138         bool throw_away PERL_UNUSED_DECL; \
 139         ENTER; save_re_context(); \
 140         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 141         LEAVE; } } STMT_END
 142
 143 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 144 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 145 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 146
 147 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 148         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 149         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 150         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 151             * assert should likely and hopefully fail on an EBCDIC machine */ \
 152         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 153                                                                             \
 154         /* No asserts are done for these, in case called on an early        \
 155             * Unicode version in which they map to nothing */               \
 156         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 163
 164 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 165
 166 /* The actual code for CCC_TRY, which uses several variables from the routine
 167  * it's callable from.  It is designed to be the bulk of a case statement.
 168  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 169  *      nextchr matches the class.
 170  * UTF8_TEST is the whole test string to use for utf8 targets
 171  * LOAD is what to use to test, and if not present to load in the swash for the
 172  *      class
 173  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 174  *      UTF8_TEST test.
 175  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 176  * utf8 and a variant, load the swash if necessary and test using the utf8
 177  * test.  Advance to the next character if test is ok, otherwise fail; If not
 178  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 179  * fails, or advance to the next character */
 180
 181 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 182     if (locinput >= PL_regeol) {                                              \
 183         sayNO;                                                                \
 184     }                                                                         \
 185     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 186         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 187         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 188             sayNO;                                                            \
 189         }                                                                     \
 190         locinput += PL_utf8skip[nextchr];                                     \
 191         nextchr = UCHARAT(locinput);                                          \
 192         break;                                                                \
 193     }                                                                         \
 194     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 195         sayNO;                                                                \
 196     }                                                                         \
 197     nextchr = UCHARAT(++locinput);                                            \
 198     break;
 199
 200 /* Handle the non-locale cases for a character class and its complement.  It
 201  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 202  * This is because that code fails when the test succeeds, so we want to have
 203  * the test fail so that the code succeeds.  The swash is stored in a
 204  * predictable PL_ place */
 205 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 206                            CLASS, STR)                                        \
 207     case NAME:                                                                \
 208         _CCC_TRY_CODE( !, FUNC,                                               \
 209                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 210                                             (U8*)locinput, TRUE)),            \
 211                           CLASS, STR)                                         \
 212     case NNAME:                                                               \
 213         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 214                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 215                                             (U8*)locinput, TRUE)),            \
 216                           CLASS, STR)                                         \
 217
 218 /* Generate the case statements for both locale and non-locale character
 219  * classes in regmatch for classes that don't have special unicode semantics.
 220  * Locales don't use an immediate swash, but an intermediary special locale
 221  * function that is called on the pointer to the current place in the input
 222  * string.  That function will resolve to needing the same swash.  One might
 223  * think that because we don't know what the locale will match, we shouldn't
 224  * check with the swash loading function that it loaded properly; ie, that we
 225  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 226  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 227  * irrelevant here */
 228 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 229                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 230                 NAMEA, NNAMEA, FUNCA,                                         \
 231                 CLASS, STR)                                                   \
 232     case NAMEL:                                                               \
 233         PL_reg_flags |= RF_tainted;                                           \
 234         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 235     case NNAMEL:                                                              \
 236         PL_reg_flags |= RF_tainted;                                           \
 237         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 238                        CLASS, STR)                                            \
 239     case NAMEA:                                                               \
 240         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 241             sayNO;                                                            \
 242         }                                                                     \
 243         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 244         nextchr = UCHARAT(++locinput);                                        \
 245         break;                                                                \
 246     case NNAMEA:                                                              \
 247         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 248             sayNO;                                                            \
 249         }                                                                     \
 250         if (utf8_target) {                                                    \
 251             locinput += PL_utf8skip[nextchr];                                 \
 252             nextchr = UCHARAT(locinput);                                      \
 253         }                                                                     \
 254         else {                                                                \
 255             nextchr = UCHARAT(++locinput);                                    \
 256         }                                                                     \
 257         break;                                                                \
 258     /* Generate the non-locale cases */                                       \
 259     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 260
 261 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 262  * statements to handle separate Unicode semantics nodes */
 263 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 264                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 265                   NAMEU, NNAMEU, FUNCU,                                        \
 266                   NAMEA, NNAMEA, FUNCA,                                        \
 267                   CLASS, STR)                                                  \
 268     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 269             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 270             NAMEA, NNAMEA, FUNCA,                                              \
 271             CLASS, STR)                                                        \
 272     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 273
 274 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 275
 276 /* for use after a quantifier and before an EXACT-like node -- japhy */
 277 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 278  *
 279  * NOTE that *nothing* that affects backtracking should be in here, specifically
 280  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 281  * node that is in between two EXACT like nodes when ascertaining what the required
 282  * "follow" character is. This should probably be moved to regex compile time
 283  * although it may be done at run time beause of the REF possibility - more
 284  * investigation required. -- demerphq
 285 */
 286 #define JUMPABLE(rn) (      \
 287     OP(rn) == OPEN ||       \
 288     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 289     OP(rn) == EVAL ||   \
 290     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 291     OP(rn) == PLUS || OP(rn) == MINMOD || \
 292     OP(rn) == KEEPS || \
 293     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 294 )
 295 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 296
 297 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 298
 299 #if 0
 300 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 301    we don't need this definition. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 303 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 305
 306 #else
 307 /* ... so we use this as its faster. */
 308 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 309 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 310 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 311 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 312
 313 #endif
 314
 315 /*
 316   Search for mandatory following text node; for lookahead, the text must
 317   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 318 */
 319 #define FIND_NEXT_IMPT(rn) STMT_START { \
 320     while (JUMPABLE(rn)) { \
 321         const OPCODE type = OP(rn); \
 322         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 323             rn = NEXTOPER(NEXTOPER(rn)); \
 324         else if (type == PLUS) \
 325             rn = NEXTOPER(rn); \
 326         else if (type == IFMATCH) \
 327             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 328         else rn += NEXT_OFF(rn); \
 329     } \
 330 } STMT_END
 331
 332
 333 static void restore_pos(pTHX_ void *arg);
 334
 335 #define REGCP_PAREN_ELEMS 4
 336 #define REGCP_OTHER_ELEMS 3
 337 #define REGCP_FRAME_ELEMS 1
 338 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 339  * are needed for the regexp context stack bookkeeping. */
 340
 341 STATIC CHECKPOINT
 342 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 343 {
 344     dVAR;
 345     const int retval = PL_savestack_ix;
 346     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 347     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 348     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 349     int p;
 350     GET_RE_DEBUG_FLAGS_DECL;
 351
 352     PERL_ARGS_ASSERT_REGCPPUSH;
 353
 354     if (paren_elems_to_push < 0)
 355         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 356                    paren_elems_to_push);
 357
 358     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 359         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 360                    " out of range (%lu-%ld)",
 361                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 362
 363     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 364
 365     DEBUG_BUFFERS_r(
 366         if ((int)PL_regsize > (int)parenfloor)
 367             PerlIO_printf(Perl_debug_log,
 368                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 369                 PTR2UV(rex),
 370                 PTR2UV(rex->offs)
 371             );
 372     );
 373     for (p = PL_regsize; p > parenfloor; p--) {
 374 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 375         SSPUSHINT(rex->offs[p].end);
 376         SSPUSHINT(rex->offs[p].start);
 377         SSPUSHINT(rex->offs[p].start_tmp);
 378         SSPUSHINT(p);
 379         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 380             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 381             (UV)p,
 382             (IV)rex->offs[p].start,
 383             (IV)rex->offs[p].start_tmp,
 384             (IV)rex->offs[p].end
 385         ));
 386     }
 387 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 388     SSPUSHINT(PL_regsize);
 389     SSPUSHINT(rex->lastparen);
 390     SSPUSHINT(rex->lastcloseparen);
 391     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 392
 393     return retval;
 394 }
 395
 396 /* These are needed since we do not localize EVAL nodes: */
 397 #define REGCP_SET(cp)                                           \
 398     DEBUG_STATE_r(                                              \
 399             PerlIO_printf(Perl_debug_log,                       \
 400                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 401                 (IV)PL_savestack_ix));                          \
 402     cp = PL_savestack_ix
 403
 404 #define REGCP_UNWIND(cp)                                        \
 405     DEBUG_STATE_r(                                              \
 406         if (cp != PL_savestack_ix)                              \
 407             PerlIO_printf(Perl_debug_log,                       \
 408                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 409                 (IV)(cp), (IV)PL_savestack_ix));                \
 410     regcpblow(cp)
 411
 412 STATIC void
 413 S_regcppop(pTHX_ regexp *rex)
 414 {
 415     dVAR;
 416     UV i;
 417     GET_RE_DEBUG_FLAGS_DECL;
 418
 419     PERL_ARGS_ASSERT_REGCPPOP;
 420
 421     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 422     i = SSPOPUV;
 423     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 424     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 425     rex->lastcloseparen = SSPOPINT;
 426     rex->lastparen = SSPOPINT;
 427     PL_regsize = SSPOPINT;
 428
 429     i -= REGCP_OTHER_ELEMS;
 430     /* Now restore the parentheses context. */
 431     DEBUG_BUFFERS_r(
 432         if (i || rex->lastparen + 1 <= rex->nparens)
 433             PerlIO_printf(Perl_debug_log,
 434                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 435                 PTR2UV(rex),
 436                 PTR2UV(rex->offs)
 437             );
 438     );
 439     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 440         I32 tmps;
 441         U32 paren = (U32)SSPOPINT;
 442         rex->offs[paren].start_tmp = SSPOPINT;
 443         rex->offs[paren].start = SSPOPINT;
 444         tmps = SSPOPINT;
 445         if (paren <= rex->lastparen)
 446             rex->offs[paren].end = tmps;
 447         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 448             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 449             (UV)paren,
 450             (IV)rex->offs[paren].start,
 451             (IV)rex->offs[paren].start_tmp,
 452             (IV)rex->offs[paren].end,
 453             (paren > rex->lastparen ? "(skipped)" : ""));
 454         );
 455     }
 456 #if 1
 457     /* It would seem that the similar code in regtry()
 458      * already takes care of this, and in fact it is in
 459      * a better location to since this code can #if 0-ed out
 460      * but the code in regtry() is needed or otherwise tests
 461      * requiring null fields (pat.t#187 and split.t#{13,14}
 462      * (as of patchlevel 7877)  will fail.  Then again,
 463      * this code seems to be necessary or otherwise
 464      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 465      * --jhi updated by dapm */
 466     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 467         if (i > PL_regsize)
 468             rex->offs[i].start = -1;
 469         rex->offs[i].end = -1;
 470         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 471             "    \\%"UVuf": %s   ..-1 undeffing\n",
 472             (UV)i,
 473             (i > PL_regsize) ? "-1" : "  "
 474         ));
 475     }
 476 #endif
 477 }
 478
 479 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 480
 481 /*
 482  * pregexec and friends
 483  */
 484
 485 #ifndef PERL_IN_XSUB_RE
 486 /*
 487  - pregexec - match a regexp against a string
 488  */
 489 I32
 490 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 491          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 492 /* strend: pointer to null at end of string */
 493 /* strbeg: real beginning of string */
 494 /* minend: end of match must be >=minend after stringarg. */
 495 /* nosave: For optimizations. */
 496 {
 497     PERL_ARGS_ASSERT_PREGEXEC;
 498
 499     return
 500         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 501                       nosave ? 0 : REXEC_COPY_STR);
 502 }
 503 #endif
 504
 505 /*
 506  * Need to implement the following flags for reg_anch:
 507  *
 508  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 509  * USE_INTUIT_ML
 510  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 511  * INTUIT_AUTORITATIVE_ML
 512  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 513  * INTUIT_ONCE_ML
 514  *
 515  * Another flag for this function: SECOND_TIME (so that float substrs
 516  * with giant delta may be not rechecked).
 517  */
 518
 519 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 520
 521 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 522    Otherwise, only SvCUR(sv) is used to get strbeg. */
 523
 524 /* XXXX We assume that strpos is strbeg unless sv. */
 525
 526 /* XXXX Some places assume that there is a fixed substring.
 527         An update may be needed if optimizer marks as "INTUITable"
 528         RExen without fixed substrings.  Similarly, it is assumed that
 529         lengths of all the strings are no more than minlen, thus they
 530         cannot come from lookahead.
 531         (Or minlen should take into account lookahead.)
 532   NOTE: Some of this comment is not correct. minlen does now take account
 533   of lookahead/behind. Further research is required. -- demerphq
 534
 535 */
 536
 537 /* A failure to find a constant substring means that there is no need to make
 538    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 539    finding a substring too deep into the string means that less calls to
 540    regtry() should be needed.
 541
 542    REx compiler's optimizer found 4 possible hints:
 543         a) Anchored substring;
 544         b) Fixed substring;
 545         c) Whether we are anchored (beginning-of-line or \G);
 546         d) First node (of those at offset 0) which may distinguish positions;
 547    We use a)b)d) and multiline-part of c), and try to find a position in the
 548    string which does not contradict any of them.
 549  */
 550
 551 /* Most of decisions we do here should have been done at compile time.
 552    The nodes of the REx which we used for the search should have been
 553    deleted from the finite automaton. */
 554
 555 char *
 556 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 557                      char *strend, const U32 flags, re_scream_pos_data *data)
 558 {
 559     dVAR;
 560     struct regexp *const prog = (struct regexp *)SvANY(rx);
 561     register I32 start_shift = 0;
 562     /* Should be nonnegative! */
 563     register I32 end_shift   = 0;
 564     register char *s;
 565     register SV *check;
 566     char *strbeg;
 567     char *t;
 568     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 569     I32 ml_anch;
 570     register char *other_last = NULL;   /* other substr checked before this */
 571     char *check_at = NULL;              /* check substr found at this pos */
 572     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 573     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 574     RXi_GET_DECL(prog,progi);
 575 #ifdef DEBUGGING
 576     const char * const i_strpos = strpos;
 577 #endif
 578     GET_RE_DEBUG_FLAGS_DECL;
 579
 580     PERL_ARGS_ASSERT_RE_INTUIT_START;
 581     PERL_UNUSED_ARG(flags);
 582     PERL_UNUSED_ARG(data);
 583
 584     RX_MATCH_UTF8_set(rx,utf8_target);
 585
 586     if (RX_UTF8(rx)) {
 587         PL_reg_flags |= RF_utf8;
 588     }
 589     DEBUG_EXECUTE_r(
 590         debug_start_match(rx, utf8_target, strpos, strend,
 591             sv ? "Guessing start of match in sv for"
 592                : "Guessing start of match in string for");
 593               );
 594
 595     /* CHR_DIST() would be more correct here but it makes things slow. */
 596     if (prog->minlen > strend - strpos) {
 597         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 598                               "String too short... [re_intuit_start]\n"));
 599         goto fail;
 600     }
 601
 602     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 603     PL_regeol = strend;
 604     if (utf8_target) {
 605         if (!prog->check_utf8 && prog->check_substr)
 606             to_utf8_substr(prog);
 607         check = prog->check_utf8;
 608     } else {
 609         if (!prog->check_substr && prog->check_utf8)
 610             to_byte_substr(prog);
 611         check = prog->check_substr;
 612     }
 613     if (check == &PL_sv_undef) {
 614         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 615                 "Non-utf8 string cannot match utf8 check string\n"));
 616         goto fail;
 617     }
 618     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 619         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 620                      || ( (prog->extflags & RXf_ANCH_BOL)
 621                           && !multiline ) );    /* Check after \n? */
 622
 623         if (!ml_anch) {
 624           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 625                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 626                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 627                && sv && !SvROK(sv)
 628                && (strpos != strbeg)) {
 629               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 630               goto fail;
 631           }
 632           if (prog->check_offset_min == prog->check_offset_max &&
 633               !(prog->extflags & RXf_CANY_SEEN)) {
 634             /* Substring at constant offset from beg-of-str... */
 635             I32 slen;
 636
 637             s = HOP3c(strpos, prog->check_offset_min, strend);
 638
 639             if (SvTAIL(check)) {
 640                 slen = SvCUR(check);    /* >= 1 */
 641
 642                 if ( strend - s > slen || strend - s < slen - 1
 643                      || (strend - s == slen && strend[-1] != '\n')) {
 644                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 645                     goto fail_finish;
 646                 }
 647                 /* Now should match s[0..slen-2] */
 648                 slen--;
 649                 if (slen && (*SvPVX_const(check) != *s
 650                              || (slen > 1
 651                                  && memNE(SvPVX_const(check), s, slen)))) {
 652                   report_neq:
 653                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 654                     goto fail_finish;
 655                 }
 656             }
 657             else if (*SvPVX_const(check) != *s
 658                      || ((slen = SvCUR(check)) > 1
 659                          && memNE(SvPVX_const(check), s, slen)))
 660                 goto report_neq;
 661             check_at = s;
 662             goto success_at_start;
 663           }
 664         }
 665         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 666         s = strpos;
 667         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 668         end_shift = prog->check_end_shift;
 669
 670         if (!ml_anch) {
 671             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 672                                          - (SvTAIL(check) != 0);
 673             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 674
 675             if (end_shift < eshift)
 676                 end_shift = eshift;
 677         }
 678     }
 679     else {                              /* Can match at random position */
 680         ml_anch = 0;
 681         s = strpos;
 682         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 683         end_shift = prog->check_end_shift;
 684
 685         /* end shift should be non negative here */
 686     }
 687
 688 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 689     if (end_shift < 0)
 690         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 691                    (IV)end_shift, RX_PRECOMP(prog));
 692 #endif
 693
 694   restart:
 695     /* Find a possible match in the region s..strend by looking for
 696        the "check" substring in the region corrected by start/end_shift. */
 697
 698     {
 699         I32 srch_start_shift = start_shift;
 700         I32 srch_end_shift = end_shift;
 701         U8* start_point;
 702         U8* end_point;
 703         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 704             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 705             srch_start_shift = strbeg - s;
 706         }
 707     DEBUG_OPTIMISE_MORE_r({
 708         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 709             (IV)prog->check_offset_min,
 710             (IV)srch_start_shift,
 711             (IV)srch_end_shift,
 712             (IV)prog->check_end_shift);
 713     });
 714
 715         if (prog->extflags & RXf_CANY_SEEN) {
 716             start_point= (U8*)(s + srch_start_shift);
 717             end_point= (U8*)(strend - srch_end_shift);
 718         } else {
 719             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 720             end_point= HOP3(strend, -srch_end_shift, strbeg);
 721         }
 722         DEBUG_OPTIMISE_MORE_r({
 723             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 724                 (int)(end_point - start_point),
 725                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 726                 start_point);
 727         });
 728
 729         s = fbm_instr( start_point, end_point,
 730                       check, multiline ? FBMrf_MULTILINE : 0);
 731     }
 732     /* Update the count-of-usability, remove useless subpatterns,
 733         unshift s.  */
 734
 735     DEBUG_EXECUTE_r({
 736         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 737             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 738         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 739                           (s ? "Found" : "Did not find"),
 740             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 741                 ? "anchored" : "floating"),
 742             quoted,
 743             RE_SV_TAIL(check),
 744             (s ? " at offset " : "...\n") );
 745     });
 746
 747     if (!s)
 748         goto fail_finish;
 749     /* Finish the diagnostic message */
 750     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 751
 752     /* XXX dmq: first branch is for positive lookbehind...
 753        Our check string is offset from the beginning of the pattern.
 754        So we need to do any stclass tests offset forward from that
 755        point. I think. :-(
 756      */
 757
 758
 759
 760     check_at=s;
 761
 762
 763     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 764        Start with the other substr.
 765        XXXX no SCREAM optimization yet - and a very coarse implementation
 766        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 767                 *always* match.  Probably should be marked during compile...
 768        Probably it is right to do no SCREAM here...
 769      */
 770
 771     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 772                 : (prog->float_substr && prog->anchored_substr))
 773     {
 774         /* Take into account the "other" substring. */
 775         /* XXXX May be hopelessly wrong for UTF... */
 776         if (!other_last)
 777             other_last = strpos;
 778         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 779           do_other_anchored:
 780             {
 781                 char * const last = HOP3c(s, -start_shift, strbeg);
 782                 char *last1, *last2;
 783                 char * const saved_s = s;
 784                 SV* must;
 785
 786                 t = s - prog->check_offset_max;
 787                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 788                     && (!utf8_target
 789                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 790                             && t > strpos)))
 791                     NOOP;
 792                 else
 793                     t = strpos;
 794                 t = HOP3c(t, prog->anchored_offset, strend);
 795                 if (t < other_last)     /* These positions already checked */
 796                     t = other_last;
 797                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 798                 if (last < last1)
 799                     last1 = last;
 800                 /* XXXX It is not documented what units *_offsets are in.
 801                    We assume bytes, but this is clearly wrong.
 802                    Meaning this code needs to be carefully reviewed for errors.
 803                    dmq.
 804                   */
 805
 806                 /* On end-of-str: see comment below. */
 807                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 808                 if (must == &PL_sv_undef) {
 809                     s = (char*)NULL;
 810                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 811                 }
 812                 else
 813                     s = fbm_instr(
 814                         (unsigned char*)t,
 815                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 816                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 817                         must,
 818                         multiline ? FBMrf_MULTILINE : 0
 819                     );
 820                 DEBUG_EXECUTE_r({
 821                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 822                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 823                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 824                         (s ? "Found" : "Contradicts"),
 825                         quoted, RE_SV_TAIL(must));
 826                 });
 827
 828
 829                 if (!s) {
 830                     if (last1 >= last2) {
 831                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 832                                                 ", giving up...\n"));
 833                         goto fail_finish;
 834                     }
 835                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 836                         ", trying floating at offset %ld...\n",
 837                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 838                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 839                     s = HOP3c(last, 1, strend);
 840                     goto restart;
 841                 }
 842                 else {
 843                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 844                           (long)(s - i_strpos)));
 845                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 846                     other_last = HOP3c(s, 1, strend);
 847                     s = saved_s;
 848                     if (t == strpos)
 849                         goto try_at_start;
 850                     goto try_at_offset;
 851                 }
 852             }
 853         }
 854         else {          /* Take into account the floating substring. */
 855             char *last, *last1;
 856             char * const saved_s = s;
 857             SV* must;
 858
 859             t = HOP3c(s, -start_shift, strbeg);
 860             last1 = last =
 861                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 862             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 863                 last = HOP3c(t, prog->float_max_offset, strend);
 864             s = HOP3c(t, prog->float_min_offset, strend);
 865             if (s < other_last)
 866                 s = other_last;
 867  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 868             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 869             /* fbm_instr() takes into account exact value of end-of-str
 870                if the check is SvTAIL(ed).  Since false positives are OK,
 871                and end-of-str is not later than strend we are OK. */
 872             if (must == &PL_sv_undef) {
 873                 s = (char*)NULL;
 874                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 875             }
 876             else
 877                 s = fbm_instr((unsigned char*)s,
 878                               (unsigned char*)last + SvCUR(must)
 879                                   - (SvTAIL(must)!=0),
 880                               must, multiline ? FBMrf_MULTILINE : 0);
 881             DEBUG_EXECUTE_r({
 882                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 883                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 884                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 885                     (s ? "Found" : "Contradicts"),
 886                     quoted, RE_SV_TAIL(must));
 887             });
 888             if (!s) {
 889                 if (last1 == last) {
 890                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891                                             ", giving up...\n"));
 892                     goto fail_finish;
 893                 }
 894                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 895                     ", trying anchored starting at offset %ld...\n",
 896                     (long)(saved_s + 1 - i_strpos)));
 897                 other_last = last;
 898                 s = HOP3c(t, 1, strend);
 899                 goto restart;
 900             }
 901             else {
 902                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 903                       (long)(s - i_strpos)));
 904                 other_last = s; /* Fix this later. --Hugo */
 905                 s = saved_s;
 906                 if (t == strpos)
 907                     goto try_at_start;
 908                 goto try_at_offset;
 909             }
 910         }
 911     }
 912
 913
 914     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 915
 916     DEBUG_OPTIMISE_MORE_r(
 917         PerlIO_printf(Perl_debug_log,
 918             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 919             (IV)prog->check_offset_min,
 920             (IV)prog->check_offset_max,
 921             (IV)(s-strpos),
 922             (IV)(t-strpos),
 923             (IV)(t-s),
 924             (IV)(strend-strpos)
 925         )
 926     );
 927
 928     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 929         && (!utf8_target
 930             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 931                  && t > strpos)))
 932     {
 933         /* Fixed substring is found far enough so that the match
 934            cannot start at strpos. */
 935       try_at_offset:
 936         if (ml_anch && t[-1] != '\n') {
 937             /* Eventually fbm_*() should handle this, but often
 938                anchored_offset is not 0, so this check will not be wasted. */
 939             /* XXXX In the code below we prefer to look for "^" even in
 940                presence of anchored substrings.  And we search even
 941                beyond the found float position.  These pessimizations
 942                are historical artefacts only.  */
 943           find_anchor:
 944             while (t < strend - prog->minlen) {
 945                 if (*t == '\n') {
 946                     if (t < check_at - prog->check_offset_min) {
 947                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 948                             /* Since we moved from the found position,
 949                                we definitely contradict the found anchored
 950                                substr.  Due to the above check we do not
 951                                contradict "check" substr.
 952                                Thus we can arrive here only if check substr
 953                                is float.  Redo checking for "other"=="fixed".
 954                              */
 955                             strpos = t + 1;
 956                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 957                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 958                             goto do_other_anchored;
 959                         }
 960                         /* We don't contradict the found floating substring. */
 961                         /* XXXX Why not check for STCLASS? */
 962                         s = t + 1;
 963                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 964                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 965                         goto set_useful;
 966                     }
 967                     /* Position contradicts check-string */
 968                     /* XXXX probably better to look for check-string
 969                        than for "\n", so one should lower the limit for t? */
 970                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 971                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 972                     other_last = strpos = s = t + 1;
 973                     goto restart;
 974                 }
 975                 t++;
 976             }
 977             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 978                         PL_colors[0], PL_colors[1]));
 979             goto fail_finish;
 980         }
 981         else {
 982             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 983                         PL_colors[0], PL_colors[1]));
 984         }
 985         s = t;
 986       set_useful:
 987         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 988     }
 989     else {
 990         /* The found string does not prohibit matching at strpos,
 991            - no optimization of calling REx engine can be performed,
 992            unless it was an MBOL and we are not after MBOL,
 993            or a future STCLASS check will fail this. */
 994       try_at_start:
 995         /* Even in this situation we may use MBOL flag if strpos is offset
 996            wrt the start of the string. */
 997         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 998             && (strpos != strbeg) && strpos[-1] != '\n'
 999             /* May be due to an implicit anchor of m{.*foo}  */
1000             && !(prog->intflags & PREGf_IMPLICIT))
1001         {
1002             t = strpos;
1003             goto find_anchor;
1004         }
1005         DEBUG_EXECUTE_r( if (ml_anch)
1006             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1007                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1008         );
1009       success_at_start:
1010         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1011             && (utf8_target ? (
1012                 prog->check_utf8                /* Could be deleted already */
1013                 && --BmUSEFUL(prog->check_utf8) < 0
1014                 && (prog->check_utf8 == prog->float_utf8)
1015             ) : (
1016                 prog->check_substr              /* Could be deleted already */
1017                 && --BmUSEFUL(prog->check_substr) < 0
1018                 && (prog->check_substr == prog->float_substr)
1019             )))
1020         {
1021             /* If flags & SOMETHING - do not do it many times on the same match */
1022             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1023             /* XXX Does the destruction order has to change with utf8_target? */
1024             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1025             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1026             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1027             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1028             check = NULL;                       /* abort */
1029             s = strpos;
1030             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1031                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1032             if (prog->intflags & PREGf_IMPLICIT)
1033                 prog->extflags &= ~RXf_ANCH_MBOL;
1034             /* XXXX This is a remnant of the old implementation.  It
1035                     looks wasteful, since now INTUIT can use many
1036                     other heuristics. */
1037             prog->extflags &= ~RXf_USE_INTUIT;
1038             /* XXXX What other flags might need to be cleared in this branch? */
1039         }
1040         else
1041             s = strpos;
1042     }
1043
1044     /* Last resort... */
1045     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1046     /* trie stclasses are too expensive to use here, we are better off to
1047        leave it to regmatch itself */
1048     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1049         /* minlen == 0 is possible if regstclass is \b or \B,
1050            and the fixed substr is ''$.
1051            Since minlen is already taken into account, s+1 is before strend;
1052            accidentally, minlen >= 1 guaranties no false positives at s + 1
1053            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1054            regstclass does not come from lookahead...  */
1055         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1056            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1057         const U8* const str = (U8*)STRING(progi->regstclass);
1058         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1059                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1060                     : 1);
1061         char * endpos;
1062         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1063             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1064         else if (prog->float_substr || prog->float_utf8)
1065             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1066         else
1067             endpos= strend;
1068
1069         if (checked_upto < s)
1070            checked_upto = s;
1071         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1072                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1073
1074         t = s;
1075         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1076         if (s) {
1077             checked_upto = s;
1078         } else {
1079 #ifdef DEBUGGING
1080             const char *what = NULL;
1081 #endif
1082             if (endpos == strend) {
1083                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1084                                 "Could not match STCLASS...\n") );
1085                 goto fail;
1086             }
1087             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1088                                    "This position contradicts STCLASS...\n") );
1089             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1090                 goto fail;
1091             checked_upto = HOPBACKc(endpos, start_shift);
1092             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1093                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1094             /* Contradict one of substrings */
1095             if (prog->anchored_substr || prog->anchored_utf8) {
1096                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1097                     DEBUG_EXECUTE_r( what = "anchored" );
1098                   hop_and_restart:
1099                     s = HOP3c(t, 1, strend);
1100                     if (s + start_shift + end_shift > strend) {
1101                         /* XXXX Should be taken into account earlier? */
1102                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1103                                                "Could not match STCLASS...\n") );
1104                         goto fail;
1105                     }
1106                     if (!check)
1107                         goto giveup;
1108                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109                                 "Looking for %s substr starting at offset %ld...\n",
1110                                  what, (long)(s + start_shift - i_strpos)) );
1111                     goto restart;
1112                 }
1113                 /* Have both, check_string is floating */
1114                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1115                     goto retry_floating_check;
1116                 /* Recheck anchored substring, but not floating... */
1117                 s = check_at;
1118                 if (!check)
1119                     goto giveup;
1120                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1121                           "Looking for anchored substr starting at offset %ld...\n",
1122                           (long)(other_last - i_strpos)) );
1123                 goto do_other_anchored;
1124             }
1125             /* Another way we could have checked stclass at the
1126                current position only: */
1127             if (ml_anch) {
1128                 s = t = t + 1;
1129                 if (!check)
1130                     goto giveup;
1131                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1132                           "Looking for /%s^%s/m starting at offset %ld...\n",
1133                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1134                 goto try_at_offset;
1135             }
1136             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1137                 goto fail;
1138             /* Check is floating substring. */
1139           retry_floating_check:
1140             t = check_at - start_shift;
1141             DEBUG_EXECUTE_r( what = "floating" );
1142             goto hop_and_restart;
1143         }
1144         if (t != s) {
1145             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1146                         "By STCLASS: moving %ld --> %ld\n",
1147                                   (long)(t - i_strpos), (long)(s - i_strpos))
1148                    );
1149         }
1150         else {
1151             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1152                                   "Does not contradict STCLASS...\n");
1153                    );
1154         }
1155     }
1156   giveup:
1157     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1158                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1159                           PL_colors[5], (long)(s - i_strpos)) );
1160     return s;
1161
1162   fail_finish:                          /* Substring not found */
1163     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1164         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1165   fail:
1166     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1167                           PL_colors[4], PL_colors[5]));
1168     return NULL;
1169 }
1170
1171 #define DECL_TRIE_TYPE(scan) \
1172     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1173                     trie_type = ((scan->flags == EXACT) \
1174                               ? (utf8_target ? trie_utf8 : trie_plain) \
1175                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1176
1177 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1178 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1179     STRLEN skiplen;                                                                 \
1180     switch (trie_type) {                                                            \
1181     case trie_utf8_fold:                                                            \
1182         if ( foldlen>0 ) {                                                          \
1183             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1184             foldlen -= len;                                                         \
1185             uscan += len;                                                           \
1186             len=0;                                                                  \
1187         } else {                                                                    \
1188             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1189             len = UTF8SKIP(uc);                                                     \
1190             skiplen = UNISKIP( uvc );                                               \
1191             foldlen -= skiplen;                                                     \
1192             uscan = foldbuf + skiplen;                                              \
1193         }                                                                           \
1194         break;                                                                      \
1195     case trie_latin_utf8_fold:                                                      \
1196         if ( foldlen>0 ) {                                                          \
1197             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1198             foldlen -= len;                                                         \
1199             uscan += len;                                                           \
1200             len=0;                                                                  \
1201         } else {                                                                    \
1202             len = 1;                                                                \
1203             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1204             skiplen = UNISKIP( uvc );                                               \
1205             foldlen -= skiplen;                                                     \
1206             uscan = foldbuf + skiplen;                                              \
1207         }                                                                           \
1208         break;                                                                      \
1209     case trie_utf8:                                                                 \
1210         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1211         break;                                                                      \
1212     case trie_plain:                                                                \
1213         uvc = (UV)*uc;                                                              \
1214         len = 1;                                                                    \
1215     }                                                                               \
1216     if (uvc < 256) {                                                                \
1217         charid = trie->charmap[ uvc ];                                              \
1218     }                                                                               \
1219     else {                                                                          \
1220         charid = 0;                                                                 \
1221         if (widecharmap) {                                                          \
1222             SV** const svpp = hv_fetch(widecharmap,                                 \
1223                         (char*)&uvc, sizeof(UV), 0);                                \
1224             if (svpp)                                                               \
1225                 charid = (U16)SvIV(*svpp);                                          \
1226         }                                                                           \
1227     }                                                                               \
1228 } STMT_END
1229
1230 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1231 STMT_START {                                              \
1232     while (s <= e) {                                      \
1233         if ( (CoNd)                                       \
1234              && (ln == 1 || folder(s, pat_string, ln))    \
1235              && (!reginfo || regtry(reginfo, &s)) )       \
1236             goto got_it;                                  \
1237         s++;                                              \
1238     }                                                     \
1239 } STMT_END
1240
1241 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1242 STMT_START {                                          \
1243     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1244         CoDe                                          \
1245         s += uskip;                                   \
1246     }                                                 \
1247 } STMT_END
1248
1249 #define REXEC_FBC_SCAN(CoDe)                          \
1250 STMT_START {                                          \
1251     while (s < strend) {                              \
1252         CoDe                                          \
1253         s++;                                          \
1254     }                                                 \
1255 } STMT_END
1256
1257 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1258 REXEC_FBC_UTF8_SCAN(                                  \
1259     if (CoNd) {                                       \
1260         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1261             goto got_it;                              \
1262         else                                          \
1263             tmp = doevery;                            \
1264     }                                                 \
1265     else                                              \
1266         tmp = 1;                                      \
1267 )
1268
1269 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1270 REXEC_FBC_SCAN(                                       \
1271     if (CoNd) {                                       \
1272         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1273             goto got_it;                              \
1274         else                                          \
1275             tmp = doevery;                            \
1276     }                                                 \
1277     else                                              \
1278         tmp = 1;                                      \
1279 )
1280
1281 #define REXEC_FBC_TRYIT               \
1282 if ((!reginfo || regtry(reginfo, &s))) \
1283     goto got_it
1284
1285 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1286     if (utf8_target) {                                             \
1287         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1288     }                                                          \
1289     else {                                                     \
1290         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1291     }
1292
1293 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1294     if (utf8_target) {                                             \
1295         UtFpReLoAd;                                            \
1296         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1297     }                                                          \
1298     else {                                                     \
1299         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1300     }
1301
1302 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1303     PL_reg_flags |= RF_tainted;                                \
1304     if (utf8_target) {                                             \
1305         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1306     }                                                          \
1307     else {                                                     \
1308         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1309     }
1310
1311 #define DUMP_EXEC_POS(li,s,doutf8) \
1312     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1313
1314
1315 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1316         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1317         tmp = TEST_NON_UTF8(tmp);                                              \
1318         REXEC_FBC_UTF8_SCAN(                                                   \
1319             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1320                 tmp = !tmp;                                                    \
1321                 IF_SUCCESS;                                                    \
1322             }                                                                  \
1323             else {                                                             \
1324                 IF_FAIL;                                                       \
1325             }                                                                  \
1326         );                                                                     \
1327
1328 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1329         if (s == PL_bostr) {                                                   \
1330             tmp = '\n';                                                        \
1331         }                                                                      \
1332         else {                                                                 \
1333             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1334             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1335         }                                                                      \
1336         tmp = TeSt1_UtF8;                                                      \
1337         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1338         REXEC_FBC_UTF8_SCAN(                                                   \
1339             if (tmp == ! (TeSt2_UtF8)) { \
1340                 tmp = !tmp;                                                    \
1341                 IF_SUCCESS;                                                    \
1342             }                                                                  \
1343             else {                                                             \
1344                 IF_FAIL;                                                       \
1345             }                                                                  \
1346         );                                                                     \
1347
1348 /* The only difference between the BOUND and NBOUND cases is that
1349  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1350  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1351  * with the other one being empty */
1352 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1353     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1354
1355 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1356     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1357
1358 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1359     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1360
1361 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1362     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1363
1364
1365 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1366  * be passed in completely with the variable name being tested, which isn't
1367  * such a clean interface, but this is easier to read than it was before.  We
1368  * are looking for the boundary (or non-boundary between a word and non-word
1369  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1370  * must be different.  Find the "wordness" of the character just prior to this
1371  * one, and compare it with the wordness of this one.  If they differ, we have
1372  * a boundary.  At the beginning of the string, pretend that the previous
1373  * character was a new-line */
1374 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1375     if (utf8_target) {                                                         \
1376                 UTF8_CODE \
1377     }                                                                          \
1378     else {  /* Not utf8 */                                                     \
1379         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1380         tmp = TEST_NON_UTF8(tmp);                                              \
1381         REXEC_FBC_SCAN(                                                        \
1382             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1383                 tmp = !tmp;                                                    \
1384                 IF_SUCCESS;                                                    \
1385             }                                                                  \
1386             else {                                                             \
1387                 IF_FAIL;                                                       \
1388             }                                                                  \
1389         );                                                                     \
1390     }                                                                          \
1391     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1392         goto got_it;
1393
1394 /* We know what class REx starts with.  Try to find this position... */
1395 /* if reginfo is NULL, its a dryrun */
1396 /* annoyingly all the vars in this routine have different names from their counterparts
1397    in regmatch. /grrr */
1398
1399 STATIC char *
1400 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1401     const char *strend, regmatch_info *reginfo)
1402 {
1403         dVAR;
1404         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1405         char *pat_string;   /* The pattern's exactish string */
1406         char *pat_end;      /* ptr to end char of pat_string */
1407         re_fold_t folder;       /* Function for computing non-utf8 folds */
1408         const U8 *fold_array;   /* array for folding ords < 256 */
1409         STRLEN ln;
1410         STRLEN lnc;
1411         register STRLEN uskip;
1412         U8 c1;
1413         U8 c2;
1414         char *e;
1415         register I32 tmp = 1;   /* Scratch variable? */
1416         register const bool utf8_target = PL_reg_match_utf8;
1417         UV utf8_fold_flags = 0;
1418         RXi_GET_DECL(prog,progi);
1419
1420         PERL_ARGS_ASSERT_FIND_BYCLASS;
1421
1422         /* We know what class it must start with. */
1423         switch (OP(c)) {
1424         case ANYOFV:
1425         case ANYOF:
1426             if (utf8_target || OP(c) == ANYOFV) {
1427                 STRLEN inclasslen = strend - s;
1428                 REXEC_FBC_UTF8_CLASS_SCAN(
1429                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1430             }
1431             else {
1432                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1433             }
1434             break;
1435         case CANY:
1436             REXEC_FBC_SCAN(
1437                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1438                     goto got_it;
1439                 else
1440                     tmp = doevery;
1441             );
1442             break;
1443
1444         case EXACTFA:
1445             if (UTF_PATTERN || utf8_target) {
1446                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1447                 goto do_exactf_utf8;
1448             }
1449             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1450             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1451             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1452
1453         case EXACTF:
1454             if (utf8_target) {
1455
1456                 /* regcomp.c already folded this if pattern is in UTF-8 */
1457                 utf8_fold_flags = 0;
1458                 goto do_exactf_utf8;
1459             }
1460             fold_array = PL_fold;
1461             folder = foldEQ;
1462             goto do_exactf_non_utf8;
1463
1464         case EXACTFL:
1465             if (UTF_PATTERN || utf8_target) {
1466                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1467                 goto do_exactf_utf8;
1468             }
1469             fold_array = PL_fold_locale;
1470             folder = foldEQ_locale;
1471             goto do_exactf_non_utf8;
1472
1473         case EXACTFU_SS:
1474             if (UTF_PATTERN) {
1475                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1476             }
1477             goto do_exactf_utf8;
1478
1479         case EXACTFU_TRICKYFOLD:
1480         case EXACTFU:
1481             if (UTF_PATTERN || utf8_target) {
1482                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1483                 goto do_exactf_utf8;
1484             }
1485
1486             /* Any 'ss' in the pattern should have been replaced by regcomp,
1487              * so we don't have to worry here about this single special case
1488              * in the Latin1 range */
1489             fold_array = PL_fold_latin1;
1490             folder = foldEQ_latin1;
1491
1492             /* FALL THROUGH */
1493
1494         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1495                                are no glitches with fold-length differences
1496                                between the target string and pattern */
1497
1498             /* The idea in the non-utf8 EXACTF* cases is to first find the
1499              * first character of the EXACTF* node and then, if necessary,
1500              * case-insensitively compare the full text of the node.  c1 is the
1501              * first character.  c2 is its fold.  This logic will not work for
1502              * Unicode semantics and the german sharp ss, which hence should
1503              * not be compiled into a node that gets here. */
1504             pat_string = STRING(c);
1505             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1506
1507             /* We know that we have to match at least 'ln' bytes (which is the
1508              * same as characters, since not utf8).  If we have to match 3
1509              * characters, and there are only 2 availabe, we know without
1510              * trying that it will fail; so don't start a match past the
1511              * required minimum number from the far end */
1512             e = HOP3c(strend, -((I32)ln), s);
1513
1514             if (!reginfo && e < s) {
1515                 e = s;                  /* Due to minlen logic of intuit() */
1516             }
1517
1518             c1 = *pat_string;
1519             c2 = fold_array[c1];
1520             if (c1 == c2) { /* If char and fold are the same */
1521                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1522             }
1523             else {
1524                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1525             }
1526             break;
1527
1528         do_exactf_utf8:
1529         {
1530             unsigned expansion;
1531
1532
1533             /* If one of the operands is in utf8, we can't use the simpler
1534              * folding above, due to the fact that many different characters
1535              * can have the same fold, or portion of a fold, or different-
1536              * length fold */
1537             pat_string = STRING(c);
1538             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1539             pat_end = pat_string + ln;
1540             lnc = (UTF_PATTERN) /* length to match in characters */
1541                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1542                     : ln;
1543
1544             /* We have 'lnc' characters to match in the pattern, but because of
1545              * multi-character folding, each character in the target can match
1546              * up to 3 characters (Unicode guarantees it will never exceed
1547              * this) if it is utf8-encoded; and up to 2 if not (based on the
1548              * fact that the Latin 1 folds are already determined, and the
1549              * only multi-char fold in that range is the sharp-s folding to
1550              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1551              * string character.  Adjust lnc accordingly, rounding up, so that
1552              * if we need to match at least 4+1/3 chars, that really is 5. */
1553             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1554             lnc = (lnc + expansion - 1) / expansion;
1555
1556             /* As in the non-UTF8 case, if we have to match 3 characters, and
1557              * only 2 are left, it's guaranteed to fail, so don't start a
1558              * match that would require us to go beyond the end of the string
1559              */
1560             e = HOP3c(strend, -((I32)lnc), s);
1561
1562             if (!reginfo && e < s) {
1563                 e = s;                  /* Due to minlen logic of intuit() */
1564             }
1565
1566             /* XXX Note that we could recalculate e to stop the loop earlier,
1567              * as the worst case expansion above will rarely be met, and as we
1568              * go along we would usually find that e moves further to the left.
1569              * This would happen only after we reached the point in the loop
1570              * where if there were no expansion we should fail.  Unclear if
1571              * worth the expense */
1572
1573             while (s <= e) {
1574                 char *my_strend= (char *)strend;
1575                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1576                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1577                     && (!reginfo || regtry(reginfo, &s)) )
1578                 {
1579                     goto got_it;
1580                 }
1581                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1582             }
1583             break;
1584         }
1585         case BOUNDL:
1586             PL_reg_flags |= RF_tainted;
1587             FBC_BOUND(isALNUM_LC,
1588                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1589                       isALNUM_LC_utf8((U8*)s));
1590             break;
1591         case NBOUNDL:
1592             PL_reg_flags |= RF_tainted;
1593             FBC_NBOUND(isALNUM_LC,
1594                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1595                        isALNUM_LC_utf8((U8*)s));
1596             break;
1597         case BOUND:
1598             FBC_BOUND(isWORDCHAR,
1599                       isALNUM_uni(tmp),
1600                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1601             break;
1602         case BOUNDA:
1603             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1604                              isWORDCHAR_A(tmp),
1605                              isWORDCHAR_A((U8*)s));
1606             break;
1607         case NBOUND:
1608             FBC_NBOUND(isWORDCHAR,
1609                        isALNUM_uni(tmp),
1610                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1611             break;
1612         case NBOUNDA:
1613             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1614                               isWORDCHAR_A(tmp),
1615                               isWORDCHAR_A((U8*)s));
1616             break;
1617         case BOUNDU:
1618             FBC_BOUND(isWORDCHAR_L1,
1619                       isALNUM_uni(tmp),
1620                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1621             break;
1622         case NBOUNDU:
1623             FBC_NBOUND(isWORDCHAR_L1,
1624                        isALNUM_uni(tmp),
1625                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1626             break;
1627         case ALNUML:
1628             REXEC_FBC_CSCAN_TAINT(
1629                 isALNUM_LC_utf8((U8*)s),
1630                 isALNUM_LC(*s)
1631             );
1632             break;
1633         case ALNUMU:
1634             REXEC_FBC_CSCAN_PRELOAD(
1635                 LOAD_UTF8_CHARCLASS_ALNUM(),
1636                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1637                 isWORDCHAR_L1((U8) *s)
1638             );
1639             break;
1640         case ALNUM:
1641             REXEC_FBC_CSCAN_PRELOAD(
1642                 LOAD_UTF8_CHARCLASS_ALNUM(),
1643                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1644                 isWORDCHAR((U8) *s)
1645             );
1646             break;
1647         case ALNUMA:
1648             /* Don't need to worry about utf8, as it can match only a single
1649              * byte invariant character */
1650             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1651             break;
1652         case NALNUMU:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_ALNUM(),
1655                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1656                 ! isWORDCHAR_L1((U8) *s)
1657             );
1658             break;
1659         case NALNUM:
1660             REXEC_FBC_CSCAN_PRELOAD(
1661                 LOAD_UTF8_CHARCLASS_ALNUM(),
1662                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1663                 ! isALNUM(*s)
1664             );
1665             break;
1666         case NALNUMA:
1667             REXEC_FBC_CSCAN(
1668                 !isWORDCHAR_A(*s),
1669                 !isWORDCHAR_A(*s)
1670             );
1671             break;
1672         case NALNUML:
1673             REXEC_FBC_CSCAN_TAINT(
1674                 !isALNUM_LC_utf8((U8*)s),
1675                 !isALNUM_LC(*s)
1676             );
1677             break;
1678         case SPACEU:
1679             REXEC_FBC_CSCAN_PRELOAD(
1680                 LOAD_UTF8_CHARCLASS_SPACE(),
1681                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1682                 isSPACE_L1((U8) *s)
1683             );
1684             break;
1685         case SPACE:
1686             REXEC_FBC_CSCAN_PRELOAD(
1687                 LOAD_UTF8_CHARCLASS_SPACE(),
1688                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1689                 isSPACE((U8) *s)
1690             );
1691             break;
1692         case SPACEA:
1693             /* Don't need to worry about utf8, as it can match only a single
1694              * byte invariant character */
1695             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1696             break;
1697         case SPACEL:
1698             REXEC_FBC_CSCAN_TAINT(
1699                 isSPACE_LC_utf8((U8*)s),
1700                 isSPACE_LC(*s)
1701             );
1702             break;
1703         case NSPACEU:
1704             REXEC_FBC_CSCAN_PRELOAD(
1705                 LOAD_UTF8_CHARCLASS_SPACE(),
1706                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1707                 ! isSPACE_L1((U8) *s)
1708             );
1709             break;
1710         case NSPACE:
1711             REXEC_FBC_CSCAN_PRELOAD(
1712                 LOAD_UTF8_CHARCLASS_SPACE(),
1713                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1714                 ! isSPACE((U8) *s)
1715             );
1716             break;
1717         case NSPACEA:
1718             REXEC_FBC_CSCAN(
1719                 !isSPACE_A(*s),
1720                 !isSPACE_A(*s)
1721             );
1722             break;
1723         case NSPACEL:
1724             REXEC_FBC_CSCAN_TAINT(
1725                 !isSPACE_LC_utf8((U8*)s),
1726                 !isSPACE_LC(*s)
1727             );
1728             break;
1729         case DIGIT:
1730             REXEC_FBC_CSCAN_PRELOAD(
1731                 LOAD_UTF8_CHARCLASS_DIGIT(),
1732                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1733                 isDIGIT(*s)
1734             );
1735             break;
1736         case DIGITA:
1737             /* Don't need to worry about utf8, as it can match only a single
1738              * byte invariant character */
1739             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1740             break;
1741         case DIGITL:
1742             REXEC_FBC_CSCAN_TAINT(
1743                 isDIGIT_LC_utf8((U8*)s),
1744                 isDIGIT_LC(*s)
1745             );
1746             break;
1747         case NDIGIT:
1748             REXEC_FBC_CSCAN_PRELOAD(
1749                 LOAD_UTF8_CHARCLASS_DIGIT(),
1750                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1751                 !isDIGIT(*s)
1752             );
1753             break;
1754         case NDIGITA:
1755             REXEC_FBC_CSCAN(
1756                 !isDIGIT_A(*s),
1757                 !isDIGIT_A(*s)
1758             );
1759             break;
1760         case NDIGITL:
1761             REXEC_FBC_CSCAN_TAINT(
1762                 !isDIGIT_LC_utf8((U8*)s),
1763                 !isDIGIT_LC(*s)
1764             );
1765             break;
1766         case LNBREAK:
1767             REXEC_FBC_CSCAN(
1768                 is_LNBREAK_utf8(s),
1769                 is_LNBREAK_latin1(s)
1770             );
1771             break;
1772         case VERTWS:
1773             REXEC_FBC_CSCAN(
1774                 is_VERTWS_utf8(s),
1775                 is_VERTWS_latin1(s)
1776             );
1777             break;
1778         case NVERTWS:
1779             REXEC_FBC_CSCAN(
1780                 !is_VERTWS_utf8(s),
1781                 !is_VERTWS_latin1(s)
1782             );
1783             break;
1784         case HORIZWS:
1785             REXEC_FBC_CSCAN(
1786                 is_HORIZWS_utf8(s),
1787                 is_HORIZWS_latin1(s)
1788             );
1789             break;
1790         case NHORIZWS:
1791             REXEC_FBC_CSCAN(
1792                 !is_HORIZWS_utf8(s),
1793                 !is_HORIZWS_latin1(s)
1794             );
1795             break;
1796         case AHOCORASICKC:
1797         case AHOCORASICK:
1798             {
1799                 DECL_TRIE_TYPE(c);
1800                 /* what trie are we using right now */
1801                 reg_ac_data *aho
1802                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1803                 reg_trie_data *trie
1804                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1805                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1806
1807                 const char *last_start = strend - trie->minlen;
1808 #ifdef DEBUGGING
1809                 const char *real_start = s;
1810 #endif
1811                 STRLEN maxlen = trie->maxlen;
1812                 SV *sv_points;
1813                 U8 **points; /* map of where we were in the input string
1814                                 when reading a given char. For ASCII this
1815                                 is unnecessary overhead as the relationship
1816                                 is always 1:1, but for Unicode, especially
1817                                 case folded Unicode this is not true. */
1818                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1819                 U8 *bitmap=NULL;
1820
1821
1822                 GET_RE_DEBUG_FLAGS_DECL;
1823
1824                 /* We can't just allocate points here. We need to wrap it in
1825                  * an SV so it gets freed properly if there is a croak while
1826                  * running the match */
1827                 ENTER;
1828                 SAVETMPS;
1829                 sv_points=newSV(maxlen * sizeof(U8 *));
1830                 SvCUR_set(sv_points,
1831                     maxlen * sizeof(U8 *));
1832                 SvPOK_on(sv_points);
1833                 sv_2mortal(sv_points);
1834                 points=(U8**)SvPV_nolen(sv_points );
1835                 if ( trie_type != trie_utf8_fold
1836                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1837                 {
1838                     if (trie->bitmap)
1839                         bitmap=(U8*)trie->bitmap;
1840                     else
1841                         bitmap=(U8*)ANYOF_BITMAP(c);
1842                 }
1843                 /* this is the Aho-Corasick algorithm modified a touch
1844                    to include special handling for long "unknown char"
1845                    sequences. The basic idea being that we use AC as long
1846                    as we are dealing with a possible matching char, when
1847                    we encounter an unknown char (and we have not encountered
1848                    an accepting state) we scan forward until we find a legal
1849                    starting char.
1850                    AC matching is basically that of trie matching, except
1851                    that when we encounter a failing transition, we fall back
1852                    to the current states "fail state", and try the current char
1853                    again, a process we repeat until we reach the root state,
1854                    state 1, or a legal transition. If we fail on the root state
1855                    then we can either terminate if we have reached an accepting
1856                    state previously, or restart the entire process from the beginning
1857                    if we have not.
1858
1859                  */
1860                 while (s <= last_start) {
1861                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1862                     U8 *uc = (U8*)s;
1863                     U16 charid = 0;
1864                     U32 base = 1;
1865                     U32 state = 1;
1866                     UV uvc = 0;
1867                     STRLEN len = 0;
1868                     STRLEN foldlen = 0;
1869                     U8 *uscan = (U8*)NULL;
1870                     U8 *leftmost = NULL;
1871 #ifdef DEBUGGING
1872                     U32 accepted_word= 0;
1873 #endif
1874                     U32 pointpos = 0;
1875
1876                     while ( state && uc <= (U8*)strend ) {
1877                         int failed=0;
1878                         U32 word = aho->states[ state ].wordnum;
1879
1880                         if( state==1 ) {
1881                             if ( bitmap ) {
1882                                 DEBUG_TRIE_EXECUTE_r(
1883                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1884                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1885                                             (char *)uc, utf8_target );
1886                                         PerlIO_printf( Perl_debug_log,
1887                                             " Scanning for legal start char...\n");
1888                                     }
1889                                 );
1890                                 if (utf8_target) {
1891                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1892                                         uc += UTF8SKIP(uc);
1893                                     }
1894                                 } else {
1895                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1896                                         uc++;
1897                                     }
1898                                 }
1899                                 s= (char *)uc;
1900                             }
1901                             if (uc >(U8*)last_start) break;
1902                         }
1903
1904                         if ( word ) {
1905                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1906                             if (!leftmost || lpos < leftmost) {
1907                                 DEBUG_r(accepted_word=word);
1908                                 leftmost= lpos;
1909                             }
1910                             if (base==0) break;
1911
1912                         }
1913                         points[pointpos++ % maxlen]= uc;
1914                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1915                                              uscan, len, uvc, charid, foldlen,
1916                                              foldbuf, uniflags);
1917                         DEBUG_TRIE_EXECUTE_r({
1918                             dump_exec_pos( (char *)uc, c, strend, real_start,
1919                                 s,   utf8_target );
1920                             PerlIO_printf(Perl_debug_log,
1921                                 " Charid:%3u CP:%4"UVxf" ",
1922                                  charid, uvc);
1923                         });
1924
1925                         do {
1926 #ifdef DEBUGGING
1927                             word = aho->states[ state ].wordnum;
1928 #endif
1929                             base = aho->states[ state ].trans.base;
1930
1931                             DEBUG_TRIE_EXECUTE_r({
1932                                 if (failed)
1933                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1934                                         s,   utf8_target );
1935                                 PerlIO_printf( Perl_debug_log,
1936                                     "%sState: %4"UVxf", word=%"UVxf,
1937                                     failed ? " Fail transition to " : "",
1938                                     (UV)state, (UV)word);
1939                             });
1940                             if ( base ) {
1941                                 U32 tmp;
1942                                 I32 offset;
1943                                 if (charid &&
1944                                      ( ((offset = base + charid
1945                                         - 1 - trie->uniquecharcount)) >= 0)
1946                                      && ((U32)offset < trie->lasttrans)
1947                                      && trie->trans[offset].check == state
1948                                      && (tmp=trie->trans[offset].next))
1949                                 {
1950                                     DEBUG_TRIE_EXECUTE_r(
1951                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1952                                     state = tmp;
1953                                     break;
1954                                 }
1955                                 else {
1956                                     DEBUG_TRIE_EXECUTE_r(
1957                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1958                                     failed = 1;
1959                                     state = aho->fail[state];
1960                                 }
1961                             }
1962                             else {
1963                                 /* we must be accepting here */
1964                                 DEBUG_TRIE_EXECUTE_r(
1965                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1966                                 failed = 1;
1967                                 break;
1968                             }
1969                         } while(state);
1970                         uc += len;
1971                         if (failed) {
1972                             if (leftmost)
1973                                 break;
1974                             if (!state) state = 1;
1975                         }
1976                     }
1977                     if ( aho->states[ state ].wordnum ) {
1978                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1979                         if (!leftmost || lpos < leftmost) {
1980                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1981                             leftmost = lpos;
1982                         }
1983                     }
1984                     if (leftmost) {
1985                         s = (char*)leftmost;
1986                         DEBUG_TRIE_EXECUTE_r({
1987                             PerlIO_printf(
1988                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1989                                 (UV)accepted_word, (IV)(s - real_start)
1990                             );
1991                         });
1992                         if (!reginfo || regtry(reginfo, &s)) {
1993                             FREETMPS;
1994                             LEAVE;
1995                             goto got_it;
1996                         }
1997                         s = HOPc(s,1);
1998                         DEBUG_TRIE_EXECUTE_r({
1999                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2000                         });
2001                     } else {
2002                         DEBUG_TRIE_EXECUTE_r(
2003                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2004                         break;
2005                     }
2006                 }
2007                 FREETMPS;
2008                 LEAVE;
2009             }
2010             break;
2011         default:
2012             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2013             break;
2014         }
2015         return 0;
2016       got_it:
2017         return s;
2018 }
2019
2020
2021 /*
2022  - regexec_flags - match a regexp against a string
2023  */
2024 I32
2025 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2026               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2027 /* strend: pointer to null at end of string */
2028 /* strbeg: real beginning of string */
2029 /* minend: end of match must be >=minend after stringarg. */
2030 /* data: May be used for some additional optimizations.
2031          Currently its only used, with a U32 cast, for transmitting
2032          the ganch offset when doing a /g match. This will change */
2033 /* nosave: For optimizations. */
2034 {
2035     dVAR;
2036     struct regexp *const prog = (struct regexp *)SvANY(rx);
2037     /*register*/ char *s;
2038     register regnode *c;
2039     /*register*/ char *startpos = stringarg;
2040     I32 minlen;         /* must match at least this many chars */
2041     I32 dontbother = 0; /* how many characters not to try at end */
2042     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2043     I32 scream_pos = -1;                /* Internal iterator of scream. */
2044     char *scream_olds = NULL;
2045     const bool utf8_target = cBOOL(DO_UTF8(sv));
2046     I32 multiline;
2047     RXi_GET_DECL(prog,progi);
2048     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2049     regexp_paren_pair *swap = NULL;
2050     GET_RE_DEBUG_FLAGS_DECL;
2051
2052     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2053     PERL_UNUSED_ARG(data);
2054
2055     /* Be paranoid... */
2056     if (prog == NULL || startpos == NULL) {
2057         Perl_croak(aTHX_ "NULL regexp parameter");
2058         return 0;
2059     }
2060
2061     multiline = prog->extflags & RXf_PMf_MULTILINE;
2062     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2063
2064     RX_MATCH_UTF8_set(rx, utf8_target);
2065     DEBUG_EXECUTE_r(
2066         debug_start_match(rx, utf8_target, startpos, strend,
2067         "Matching");
2068     );
2069
2070     minlen = prog->minlen;
2071
2072     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2073         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2074                               "String too short [regexec_flags]...\n"));
2075         goto phooey;
2076     }
2077
2078
2079     /* Check validity of program. */
2080     if (UCHARAT(progi->program) != REG_MAGIC) {
2081         Perl_croak(aTHX_ "corrupted regexp program");
2082     }
2083
2084     PL_reg_flags = 0;
2085     PL_reg_state.re_state_eval_setup_done = FALSE;
2086     PL_reg_maxiter = 0;
2087
2088     if (RX_UTF8(rx))
2089         PL_reg_flags |= RF_utf8;
2090
2091     /* Mark beginning of line for ^ and lookbehind. */
2092     reginfo.bol = startpos; /* XXX not used ??? */
2093     PL_bostr  = strbeg;
2094     reginfo.sv = sv;
2095
2096     /* Mark end of line for $ (and such) */
2097     PL_regeol = strend;
2098
2099     /* see how far we have to get to not match where we matched before */
2100     reginfo.till = startpos+minend;
2101
2102     /* If there is a "must appear" string, look for it. */
2103     s = startpos;
2104
2105     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2106         MAGIC *mg;
2107         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2108             reginfo.ganch = startpos + prog->gofs;
2109             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2110               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2111         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2112                   && SvMAGIC(sv)
2113                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2114                   && mg->mg_len >= 0) {
2115             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2116             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2117                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2118
2119             if (prog->extflags & RXf_ANCH_GPOS) {
2120                 if (s > reginfo.ganch)
2121                     goto phooey;
2122                 s = reginfo.ganch - prog->gofs;
2123                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2124                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2125                 if (s < strbeg)
2126                     goto phooey;
2127             }
2128         }
2129         else if (data) {
2130             reginfo.ganch = strbeg + PTR2UV(data);
2131             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2132                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2133
2134         } else {                                /* pos() not defined */
2135             reginfo.ganch = strbeg;
2136             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2137                  "GPOS: reginfo.ganch = strbeg\n"));
2138         }
2139     }
2140     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2141         /* We have to be careful. If the previous successful match
2142            was from this regex we don't want a subsequent partially
2143            successful match to clobber the old results.
2144            So when we detect this possibility we add a swap buffer
2145            to the re, and switch the buffer each match. If we fail
2146            we switch it back, otherwise we leave it swapped.
2147         */
2148         swap = prog->offs;
2149         /* do we need a save destructor here for eval dies? */
2150         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2151         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2152             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2153             PTR2UV(prog),
2154             PTR2UV(swap),
2155             PTR2UV(prog->offs)
2156         ));
2157     }
2158     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2159         re_scream_pos_data d;
2160
2161         d.scream_olds = &scream_olds;
2162         d.scream_pos = &scream_pos;
2163         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2164         if (!s) {
2165             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2166             goto phooey;        /* not present */
2167         }
2168     }
2169
2170
2171
2172     /* Simplest case:  anchored match need be tried only once. */
2173     /*  [unless only anchor is BOL and multiline is set] */
2174     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2175         if (s == startpos && regtry(&reginfo, &startpos))
2176             goto got_it;
2177         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2178                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2179         {
2180             char *end;
2181
2182             if (minlen)
2183                 dontbother = minlen - 1;
2184             end = HOP3c(strend, -dontbother, strbeg) - 1;
2185             /* for multiline we only have to try after newlines */
2186             if (prog->check_substr || prog->check_utf8) {
2187                 /* because of the goto we can not easily reuse the macros for bifurcating the
2188                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2189                 if (utf8_target) {
2190                     if (s == startpos)
2191                         goto after_try_utf8;
2192                     while (1) {
2193                         if (regtry(&reginfo, &s)) {
2194                             goto got_it;
2195                         }
2196                       after_try_utf8:
2197                         if (s > end) {
2198                             goto phooey;
2199                         }
2200                         if (prog->extflags & RXf_USE_INTUIT) {
2201                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2202                             if (!s) {
2203                                 goto phooey;
2204                             }
2205                         }
2206                         else {
2207                             s += UTF8SKIP(s);
2208                         }
2209                     }
2210                 } /* end search for check string in unicode */
2211                 else {
2212                     if (s == startpos) {
2213                         goto after_try_latin;
2214                     }
2215                     while (1) {
2216                         if (regtry(&reginfo, &s)) {
2217                             goto got_it;
2218                         }
2219                       after_try_latin:
2220                         if (s > end) {
2221                             goto phooey;
2222                         }
2223                         if (prog->extflags & RXf_USE_INTUIT) {
2224                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2225                             if (!s) {
2226                                 goto phooey;
2227                             }
2228                         }
2229                         else {
2230                             s++;
2231                         }
2232                     }
2233                 } /* end search for check string in latin*/
2234             } /* end search for check string */
2235             else { /* search for newline */
2236                 if (s > startpos) {
2237                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2238                     s--;
2239                 }
2240                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2241                 while (s <= end) { /* note it could be possible to match at the end of the string */
2242                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2243                         if (regtry(&reginfo, &s))
2244                             goto got_it;
2245                     }
2246                 }
2247             } /* end search for newline */
2248         } /* end anchored/multiline check string search */
2249         goto phooey;
2250     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2251     {
2252         /* the warning about reginfo.ganch being used without initialization
2253            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2254            and we only enter this block when the same bit is set. */
2255         char *tmp_s = reginfo.ganch - prog->gofs;
2256
2257         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2258             goto got_it;
2259         goto phooey;
2260     }
2261
2262     /* Messy cases:  unanchored match. */
2263     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2264         /* we have /x+whatever/ */
2265         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2266         char ch;
2267 #ifdef DEBUGGING
2268         int did_match = 0;
2269 #endif
2270         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2271             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2272         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2273
2274         if (utf8_target) {
2275             REXEC_FBC_SCAN(
2276                 if (*s == ch) {
2277                     DEBUG_EXECUTE_r( did_match = 1 );
2278                     if (regtry(&reginfo, &s)) goto got_it;
2279                     s += UTF8SKIP(s);
2280                     while (s < strend && *s == ch)
2281                         s += UTF8SKIP(s);
2282                 }
2283             );
2284         }
2285         else {
2286             REXEC_FBC_SCAN(
2287                 if (*s == ch) {
2288                     DEBUG_EXECUTE_r( did_match = 1 );
2289                     if (regtry(&reginfo, &s)) goto got_it;
2290                     s++;
2291                     while (s < strend && *s == ch)
2292                         s++;
2293                 }
2294             );
2295         }
2296         DEBUG_EXECUTE_r(if (!did_match)
2297                 PerlIO_printf(Perl_debug_log,
2298                                   "Did not find anchored character...\n")
2299                );
2300     }
2301     else if (prog->anchored_substr != NULL
2302               || prog->anchored_utf8 != NULL
2303               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2304                   && prog->float_max_offset < strend - s)) {
2305         SV *must;
2306         I32 back_max;
2307         I32 back_min;
2308         char *last;
2309         char *last1;            /* Last position checked before */
2310 #ifdef DEBUGGING
2311         int did_match = 0;
2312 #endif
2313         if (prog->anchored_substr || prog->anchored_utf8) {
2314             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2315                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2316             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2317             back_max = back_min = prog->anchored_offset;
2318         } else {
2319             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2320                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2321             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2322             back_max = prog->float_max_offset;
2323             back_min = prog->float_min_offset;
2324         }
2325
2326
2327         if (must == &PL_sv_undef)
2328             /* could not downgrade utf8 check substring, so must fail */
2329             goto phooey;
2330
2331         if (back_min<0) {
2332             last = strend;
2333         } else {
2334             last = HOP3c(strend,        /* Cannot start after this */
2335                   -(I32)(CHR_SVLEN(must)
2336                          - (SvTAIL(must) != 0) + back_min), strbeg);
2337         }
2338         if (s > PL_bostr)
2339             last1 = HOPc(s, -1);
2340         else
2341             last1 = s - 1;      /* bogus */
2342
2343         /* XXXX check_substr already used to find "s", can optimize if
2344            check_substr==must. */
2345         scream_pos = -1;
2346         dontbother = end_shift;
2347         strend = HOPc(strend, -dontbother);
2348         while ( (s <= last) &&
2349                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2350                                   (unsigned char*)strend, must,
2351                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2352             DEBUG_EXECUTE_r( did_match = 1 );
2353             if (HOPc(s, -back_max) > last1) {
2354                 last1 = HOPc(s, -back_min);
2355                 s = HOPc(s, -back_max);
2356             }
2357             else {
2358                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2359
2360                 last1 = HOPc(s, -back_min);
2361                 s = t;
2362             }
2363             if (utf8_target) {
2364                 while (s <= last1) {
2365                     if (regtry(&reginfo, &s))
2366                         goto got_it;
2367                     s += UTF8SKIP(s);
2368                 }
2369             }
2370             else {
2371                 while (s <= last1) {
2372                     if (regtry(&reginfo, &s))
2373                         goto got_it;
2374                     s++;
2375                 }
2376             }
2377         }
2378         DEBUG_EXECUTE_r(if (!did_match) {
2379             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2380                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2381             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2382                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2383                                ? "anchored" : "floating"),
2384                 quoted, RE_SV_TAIL(must));
2385         });
2386         goto phooey;
2387     }
2388     else if ( (c = progi->regstclass) ) {
2389         if (minlen) {
2390             const OPCODE op = OP(progi->regstclass);
2391             /* don't bother with what can't match */
2392             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2393                 strend = HOPc(strend, -(minlen - 1));
2394         }
2395         DEBUG_EXECUTE_r({
2396             SV * const prop = sv_newmortal();
2397             regprop(prog, prop, c);
2398             {
2399                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2400                     s,strend-s,60);
2401                 PerlIO_printf(Perl_debug_log,
2402                     "Matching stclass %.*s against %s (%d bytes)\n",
2403                     (int)SvCUR(prop), SvPVX_const(prop),
2404                      quoted, (int)(strend - s));
2405             }
2406         });
2407         if (find_byclass(prog, c, s, strend, &reginfo))
2408             goto got_it;
2409         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2410     }
2411     else {
2412         dontbother = 0;
2413         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2414             /* Trim the end. */
2415             char *last= NULL;
2416             SV* float_real;
2417             STRLEN len;
2418             const char *little;
2419
2420             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2421                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2422             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2423
2424             little = SvPV_const(float_real, len);
2425             if (SvTAIL(float_real)) {
2426                     /* This means that float_real contains an artificial \n on the end
2427                      * due to the presence of something like this: /foo$/
2428                      * where we can match both "foo" and "foo\n" at the end of the string.
2429                      * So we have to compare the end of the string first against the float_real
2430                      * without the \n and then against the full float_real with the string.
2431                      * We have to watch out for cases where the string might be smaller
2432                      * than the float_real or the float_real without the \n.
2433                      */
2434                     char *checkpos= strend - len;
2435                     DEBUG_OPTIMISE_r(
2436                         PerlIO_printf(Perl_debug_log,
2437                             "%sChecking for float_real.%s\n",
2438                             PL_colors[4], PL_colors[5]));
2439                     if (checkpos + 1 < strbeg) {
2440                         /* can't match, even if we remove the trailing \n string is too short to match */
2441                         DEBUG_EXECUTE_r(
2442                             PerlIO_printf(Perl_debug_log,
2443                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2444                                 PL_colors[4], PL_colors[5]));
2445                         goto phooey;
2446                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2447                         /* can match, the end of the string matches without the "\n" */
2448                         last = checkpos + 1;
2449                     } else if (checkpos < strbeg) {
2450                         /* cant match, string is too short when the "\n" is included */
2451                         DEBUG_EXECUTE_r(
2452                             PerlIO_printf(Perl_debug_log,
2453                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2454                                 PL_colors[4], PL_colors[5]));
2455                         goto phooey;
2456                     } else if (!multiline) {
2457                         /* non multiline match, so compare with the "\n" at the end of the string */
2458                         if (memEQ(checkpos, little, len)) {
2459                             last= checkpos;
2460                         } else {
2461                             DEBUG_EXECUTE_r(
2462                                 PerlIO_printf(Perl_debug_log,
2463                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2464                                     PL_colors[4], PL_colors[5]));
2465                             goto phooey;
2466                         }
2467                     } else {
2468                         /* multiline match, so we have to search for a place where the full string is located */
2469                         goto find_last;
2470                     }
2471             } else {
2472                   find_last:
2473                     if (len)
2474                         last = rninstr(s, strend, little, little + len);
2475                     else
2476                         last = strend;  /* matching "$" */
2477             }
2478             if (!last) {
2479                 /* at one point this block contained a comment which was probably
2480                  * incorrect, which said that this was a "should not happen" case.
2481                  * Even if it was true when it was written I am pretty sure it is
2482                  * not anymore, so I have removed the comment and replaced it with
2483                  * this one. Yves */
2484                 DEBUG_EXECUTE_r(
2485                     PerlIO_printf(Perl_debug_log,
2486                         "String does not contain required substring, cannot match.\n"
2487                     ));
2488                 goto phooey;
2489             }
2490             dontbother = strend - last + prog->float_min_offset;
2491         }
2492         if (minlen && (dontbother < minlen))
2493             dontbother = minlen - 1;
2494         strend -= dontbother;              /* this one's always in bytes! */
2495         /* We don't know much -- general case. */
2496         if (utf8_target) {
2497             for (;;) {
2498                 if (regtry(&reginfo, &s))
2499                     goto got_it;
2500                 if (s >= strend)
2501                     break;
2502                 s += UTF8SKIP(s);
2503             };
2504         }
2505         else {
2506             do {
2507                 if (regtry(&reginfo, &s))
2508                     goto got_it;
2509             } while (s++ < strend);
2510         }
2511     }
2512
2513     /* Failure. */
2514     goto phooey;
2515
2516 got_it:
2517     DEBUG_BUFFERS_r(
2518         if (swap)
2519             PerlIO_printf(Perl_debug_log,
2520                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2521                 PTR2UV(prog),
2522                 PTR2UV(swap)
2523             );
2524     );
2525     Safefree(swap);
2526     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2527
2528     if (PL_reg_state.re_state_eval_setup_done)
2529         restore_pos(aTHX_ prog);
2530     if (RXp_PAREN_NAMES(prog))
2531         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2532
2533     /* make sure $`, $&, $', and $digit will work later */
2534     if ( !(flags & REXEC_NOT_FIRST) ) {
2535         RX_MATCH_COPY_FREE(rx);
2536         if (flags & REXEC_COPY_STR) {
2537             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2538 #ifdef PERL_OLD_COPY_ON_WRITE
2539             if ((SvIsCOW(sv)
2540                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2541                 if (DEBUG_C_TEST) {
2542                     PerlIO_printf(Perl_debug_log,
2543                                   "Copy on write: regexp capture, type %d\n",
2544                                   (int) SvTYPE(sv));
2545                 }
2546                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2547                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2548                 assert (SvPOKp(prog->saved_copy));
2549             } else
2550 #endif
2551             {
2552                 RX_MATCH_COPIED_on(rx);
2553                 s = savepvn(strbeg, i);
2554                 prog->subbeg = s;
2555             }
2556             prog->sublen = i;
2557         }
2558         else {
2559             prog->subbeg = strbeg;
2560             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2561         }
2562     }
2563
2564     return 1;
2565
2566 phooey:
2567     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2568                           PL_colors[4], PL_colors[5]));
2569     if (PL_reg_state.re_state_eval_setup_done)
2570         restore_pos(aTHX_ prog);
2571     if (swap) {
2572         /* we failed :-( roll it back */
2573         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2574             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2575             PTR2UV(prog),
2576             PTR2UV(prog->offs),
2577             PTR2UV(swap)
2578         ));
2579         Safefree(prog->offs);
2580         prog->offs = swap;
2581     }
2582
2583     return 0;
2584 }
2585
2586
2587 /*
2588  - regtry - try match at specific point
2589  */
2590 STATIC I32                      /* 0 failure, 1 success */
2591 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2592 {
2593     dVAR;
2594     CHECKPOINT lastcp;
2595     REGEXP *const rx = reginfo->prog;
2596     regexp *const prog = (struct regexp *)SvANY(rx);
2597     RXi_GET_DECL(prog,progi);
2598     GET_RE_DEBUG_FLAGS_DECL;
2599
2600     PERL_ARGS_ASSERT_REGTRY;
2601
2602     reginfo->cutpoint=NULL;
2603
2604     if ((prog->extflags & RXf_EVAL_SEEN)
2605         && !PL_reg_state.re_state_eval_setup_done)
2606     {
2607         MAGIC *mg;
2608
2609         PL_reg_state.re_state_eval_setup_done = TRUE;
2610         if (reginfo->sv) {
2611             /* Make $_ available to executed code. */
2612             if (reginfo->sv != DEFSV) {
2613                 SAVE_DEFSV;
2614                 DEFSV_set(reginfo->sv);
2615             }
2616
2617             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2618                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2619                 /* prepare for quick setting of pos */
2620 #ifdef PERL_OLD_COPY_ON_WRITE
2621                 if (SvIsCOW(reginfo->sv))
2622                     sv_force_normal_flags(reginfo->sv, 0);
2623 #endif
2624                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2625                                  &PL_vtbl_mglob, NULL, 0);
2626                 mg->mg_len = -1;
2627             }
2628             PL_reg_magic    = mg;
2629             PL_reg_oldpos   = mg->mg_len;
2630             SAVEDESTRUCTOR_X(restore_pos, prog);
2631         }
2632         if (!PL_reg_curpm) {
2633             Newxz(PL_reg_curpm, 1, PMOP);
2634 #ifdef USE_ITHREADS
2635             {
2636                 SV* const repointer = &PL_sv_undef;
2637                 /* this regexp is also owned by the new PL_reg_curpm, which
2638                    will try to free it.  */
2639                 av_push(PL_regex_padav, repointer);
2640                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2641                 PL_regex_pad = AvARRAY(PL_regex_padav);
2642             }
2643 #endif
2644         }
2645 #ifdef USE_ITHREADS
2646         /* It seems that non-ithreads works both with and without this code.
2647            So for efficiency reasons it seems best not to have the code
2648            compiled when it is not needed.  */
2649         /* This is safe against NULLs: */
2650         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2651         /* PM_reg_curpm owns a reference to this regexp.  */
2652         (void)ReREFCNT_inc(rx);
2653 #endif
2654         PM_SETRE(PL_reg_curpm, rx);
2655         PL_reg_oldcurpm = PL_curpm;
2656         PL_curpm = PL_reg_curpm;
2657         if (RXp_MATCH_COPIED(prog)) {
2658             /*  Here is a serious problem: we cannot rewrite subbeg,
2659                 since it may be needed if this match fails.  Thus
2660                 $` inside (?{}) could fail... */
2661             PL_reg_oldsaved = prog->subbeg;
2662             PL_reg_oldsavedlen = prog->sublen;
2663 #ifdef PERL_OLD_COPY_ON_WRITE
2664             PL_nrs = prog->saved_copy;
2665 #endif
2666             RXp_MATCH_COPIED_off(prog);
2667         }
2668         else
2669             PL_reg_oldsaved = NULL;
2670         prog->subbeg = PL_bostr;
2671         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2672     }
2673 #ifdef DEBUGGING
2674     PL_reg_starttry = *startpos;
2675 #endif
2676     prog->offs[0].start = *startpos - PL_bostr;
2677     PL_reginput = *startpos;
2678     prog->lastparen = 0;
2679     prog->lastcloseparen = 0;
2680     PL_regsize = 0;
2681
2682     /* XXXX What this code is doing here?!!!  There should be no need
2683        to do this again and again, prog->lastparen should take care of
2684        this!  --ilya*/
2685
2686     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2687      * Actually, the code in regcppop() (which Ilya may be meaning by
2688      * prog->lastparen), is not needed at all by the test suite
2689      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2690      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2691      * Meanwhile, this code *is* needed for the
2692      * above-mentioned test suite tests to succeed.  The common theme
2693      * on those tests seems to be returning null fields from matches.
2694      * --jhi updated by dapm */
2695 #if 1
2696     if (prog->nparens) {
2697         regexp_paren_pair *pp = prog->offs;
2698         register I32 i;
2699         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2700             ++pp;
2701             pp->start = -1;
2702             pp->end = -1;
2703         }
2704     }
2705 #endif
2706     REGCP_SET(lastcp);
2707     if (regmatch(reginfo, progi->program + 1)) {
2708         prog->offs[0].end = PL_reginput - PL_bostr;
2709         return 1;
2710     }
2711     if (reginfo->cutpoint)
2712         *startpos= reginfo->cutpoint;
2713     REGCP_UNWIND(lastcp);
2714     return 0;
2715 }
2716
2717
2718 #define sayYES goto yes
2719 #define sayNO goto no
2720 #define sayNO_SILENT goto no_silent
2721
2722 /* we dont use STMT_START/END here because it leads to
2723    "unreachable code" warnings, which are bogus, but distracting. */
2724 #define CACHEsayNO \
2725     if (ST.cache_mask) \
2726        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2727     sayNO
2728
2729 /* this is used to determine how far from the left messages like
2730    'failed...' are printed. It should be set such that messages
2731    are inline with the regop output that created them.
2732 */
2733 #define REPORT_CODE_OFF 32
2734
2735
2736 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2737 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2738
2739 #define SLAB_FIRST(s) (&(s)->states[0])
2740 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2741
2742 /* grab a new slab and return the first slot in it */
2743
2744 STATIC regmatch_state *
2745 S_push_slab(pTHX)
2746 {
2747 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2748     dMY_CXT;
2749 #endif
2750     regmatch_slab *s = PL_regmatch_slab->next;
2751     if (!s) {
2752         Newx(s, 1, regmatch_slab);
2753         s->prev = PL_regmatch_slab;
2754         s->next = NULL;
2755         PL_regmatch_slab->next = s;
2756     }
2757     PL_regmatch_slab = s;
2758     return SLAB_FIRST(s);
2759 }
2760
2761
2762 /* push a new state then goto it */
2763
2764 #define PUSH_STATE_GOTO(state, node) \
2765     scan = node; \
2766     st->resume_state = state; \
2767     goto push_state;
2768
2769 /* push a new state with success backtracking, then goto it */
2770
2771 #define PUSH_YES_STATE_GOTO(state, node) \
2772     scan = node; \
2773     st->resume_state = state; \
2774     goto push_yes_state;
2775
2776
2777
2778 /*
2779
2780 regmatch() - main matching routine
2781
2782 This is basically one big switch statement in a loop. We execute an op,
2783 set 'next' to point the next op, and continue. If we come to a point which
2784 we may need to backtrack to on failure such as (A|B|C), we push a
2785 backtrack state onto the backtrack stack. On failure, we pop the top
2786 state, and re-enter the loop at the state indicated. If there are no more
2787 states to pop, we return failure.
2788
2789 Sometimes we also need to backtrack on success; for example /A+/, where
2790 after successfully matching one A, we need to go back and try to
2791 match another one; similarly for lookahead assertions: if the assertion
2792 completes successfully, we backtrack to the state just before the assertion
2793 and then carry on.  In these cases, the pushed state is marked as
2794 'backtrack on success too'. This marking is in fact done by a chain of
2795 pointers, each pointing to the previous 'yes' state. On success, we pop to
2796 the nearest yes state, discarding any intermediate failure-only states.
2797 Sometimes a yes state is pushed just to force some cleanup code to be
2798 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2799 it to free the inner regex.
2800
2801 Note that failure backtracking rewinds the cursor position, while
2802 success backtracking leaves it alone.
2803
2804 A pattern is complete when the END op is executed, while a subpattern
2805 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2806 ops trigger the "pop to last yes state if any, otherwise return true"
2807 behaviour.
2808
2809 A common convention in this function is to use A and B to refer to the two
2810 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2811 the subpattern to be matched possibly multiple times, while B is the entire
2812 rest of the pattern. Variable and state names reflect this convention.
2813
2814 The states in the main switch are the union of ops and failure/success of
2815 substates associated with with that op.  For example, IFMATCH is the op
2816 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2817 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2818 successfully matched A and IFMATCH_A_fail is a state saying that we have
2819 just failed to match A. Resume states always come in pairs. The backtrack
2820 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2821 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2822 on success or failure.
2823
2824 The struct that holds a backtracking state is actually a big union, with
2825 one variant for each major type of op. The variable st points to the
2826 top-most backtrack struct. To make the code clearer, within each
2827 block of code we #define ST to alias the relevant union.
2828
2829 Here's a concrete example of a (vastly oversimplified) IFMATCH
2830 implementation:
2831
2832     switch (state) {
2833     ....
2834
2835 #define ST st->u.ifmatch
2836
2837     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2838         ST.foo = ...; // some state we wish to save
2839         ...
2840         // push a yes backtrack state with a resume value of
2841         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2842         // first node of A:
2843         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2844         // NOTREACHED
2845
2846     case IFMATCH_A: // we have successfully executed A; now continue with B
2847         next = B;
2848         bar = ST.foo; // do something with the preserved value
2849         break;
2850
2851     case IFMATCH_A_fail: // A failed, so the assertion failed
2852         ...;   // do some housekeeping, then ...
2853         sayNO; // propagate the failure
2854
2855 #undef ST
2856
2857     ...
2858     }
2859
2860 For any old-timers reading this who are familiar with the old recursive
2861 approach, the code above is equivalent to:
2862
2863     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2864     {
2865         int foo = ...
2866         ...
2867         if (regmatch(A)) {
2868             next = B;
2869             bar = foo;
2870             break;
2871         }
2872         ...;   // do some housekeeping, then ...
2873         sayNO; // propagate the failure
2874     }
2875
2876 The topmost backtrack state, pointed to by st, is usually free. If you
2877 want to claim it, populate any ST.foo fields in it with values you wish to
2878 save, then do one of
2879
2880         PUSH_STATE_GOTO(resume_state, node);
2881         PUSH_YES_STATE_GOTO(resume_state, node);
2882
2883 which sets that backtrack state's resume value to 'resume_state', pushes a
2884 new free entry to the top of the backtrack stack, then goes to 'node'.
2885 On backtracking, the free slot is popped, and the saved state becomes the
2886 new free state. An ST.foo field in this new top state can be temporarily
2887 accessed to retrieve values, but once the main loop is re-entered, it
2888 becomes available for reuse.
2889
2890 Note that the depth of the backtrack stack constantly increases during the
2891 left-to-right execution of the pattern, rather than going up and down with
2892 the pattern nesting. For example the stack is at its maximum at Z at the
2893 end of the pattern, rather than at X in the following:
2894
2895     /(((X)+)+)+....(Y)+....Z/
2896
2897 The only exceptions to this are lookahead/behind assertions and the cut,
2898 (?>A), which pop all the backtrack states associated with A before
2899 continuing.
2900
2901 Backtrack state structs are allocated in slabs of about 4K in size.
2902 PL_regmatch_state and st always point to the currently active state,
2903 and PL_regmatch_slab points to the slab currently containing
2904 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2905 allocated, and is never freed until interpreter destruction. When the slab
2906 is full, a new one is allocated and chained to the end. At exit from
2907 regmatch(), slabs allocated since entry are freed.
2908
2909 */
2910
2911
2912 #define DEBUG_STATE_pp(pp)                                  \
2913     DEBUG_STATE_r({                                         \
2914         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2915         PerlIO_printf(Perl_debug_log,                       \
2916             "    %*s"pp" %s%s%s%s%s\n",                     \
2917             depth*2, "",                                    \
2918             PL_reg_name[st->resume_state],                     \
2919             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2920             ((st==yes_state) ? "Y" : ""),                   \
2921             ((st==mark_state) ? "M" : ""),                  \
2922             ((st==yes_state||st==mark_state) ? "]" : "")    \
2923         );                                                  \
2924     });
2925
2926
2927 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2928
2929 #ifdef DEBUGGING
2930
2931 STATIC void
2932 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2933     const char *start, const char *end, const char *blurb)
2934 {
2935     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2936
2937     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2938
2939     if (!PL_colorset)
2940             reginitcolors();
2941     {
2942         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2943             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2944
2945         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2946             start, end - start, 60);
2947
2948         PerlIO_printf(Perl_debug_log,
2949             "%s%s REx%s %s against %s\n",
2950                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2951
2952         if (utf8_target||utf8_pat)
2953             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2954                 utf8_pat ? "pattern" : "",
2955                 utf8_pat && utf8_target ? " and " : "",
2956                 utf8_target ? "string" : ""
2957             );
2958     }
2959 }
2960
2961 STATIC void
2962 S_dump_exec_pos(pTHX_ const char *locinput,
2963                       const regnode *scan,
2964                       const char *loc_regeol,
2965                       const char *loc_bostr,
2966                       const char *loc_reg_starttry,
2967                       const bool utf8_target)
2968 {
2969     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2970     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2971     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2972     /* The part of the string before starttry has one color
2973        (pref0_len chars), between starttry and current
2974        position another one (pref_len - pref0_len chars),
2975        after the current position the third one.
2976        We assume that pref0_len <= pref_len, otherwise we
2977        decrease pref0_len.  */
2978     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2979         ? (5 + taill) - l : locinput - loc_bostr;
2980     int pref0_len;
2981
2982     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2983
2984     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2985         pref_len++;
2986     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2987     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2988         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2989               ? (5 + taill) - pref_len : loc_regeol - locinput);
2990     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2991         l--;
2992     if (pref0_len < 0)
2993         pref0_len = 0;
2994     if (pref0_len > pref_len)
2995         pref0_len = pref_len;
2996     {
2997         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2998
2999         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3000             (locinput - pref_len),pref0_len, 60, 4, 5);
3001
3002         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3003                     (locinput - pref_len + pref0_len),
3004                     pref_len - pref0_len, 60, 2, 3);
3005
3006         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3007                     locinput, loc_regeol - locinput, 10, 0, 1);
3008
3009         const STRLEN tlen=len0+len1+len2;
3010         PerlIO_printf(Perl_debug_log,
3011                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3012                     (IV)(locinput - loc_bostr),
3013                     len0, s0,
3014                     len1, s1,
3015                     (docolor ? "" : "> <"),
3016                     len2, s2,
3017                     (int)(tlen > 19 ? 0 :  19 - tlen),
3018                     "");
3019     }
3020 }
3021
3022 #endif
3023
3024 /* reg_check_named_buff_matched()
3025  * Checks to see if a named buffer has matched. The data array of
3026  * buffer numbers corresponding to the buffer is expected to reside
3027  * in the regexp->data->data array in the slot stored in the ARG() of
3028  * node involved. Note that this routine doesn't actually care about the
3029  * name, that information is not preserved from compilation to execution.
3030  * Returns the index of the leftmost defined buffer with the given name
3031  * or 0 if non of the buffers matched.
3032  */
3033 STATIC I32
3034 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3035 {
3036     I32 n;
3037     RXi_GET_DECL(rex,rexi);
3038     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3039     I32 *nums=(I32*)SvPVX(sv_dat);
3040
3041     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3042
3043     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3044         if ((I32)rex->lastparen >= nums[n] &&
3045             rex->offs[nums[n]].end != -1)
3046         {
3047             return nums[n];
3048         }
3049     }
3050     return 0;
3051 }
3052
3053
3054 /* free all slabs above current one  - called during LEAVE_SCOPE */
3055
3056 STATIC void
3057 S_clear_backtrack_stack(pTHX_ void *p)
3058 {
3059     regmatch_slab *s = PL_regmatch_slab->next;
3060     PERL_UNUSED_ARG(p);
3061
3062     if (!s)
3063         return;
3064     PL_regmatch_slab->next = NULL;
3065     while (s) {
3066         regmatch_slab * const osl = s;
3067         s = s->next;
3068         Safefree(osl);
3069     }
3070 }
3071
3072
3073 #define SETREX(Re1,Re2) \
3074     if (PL_reg_state.re_state_eval_setup_done) \
3075         PM_SETRE((PL_reg_curpm), (Re2)); \
3076     Re1 = (Re2)
3077
3078 STATIC I32                      /* 0 failure, 1 success */
3079 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3080 {
3081 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3082     dMY_CXT;
3083 #endif
3084     dVAR;
3085     register const bool utf8_target = PL_reg_match_utf8;
3086     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3087     REGEXP *rex_sv = reginfo->prog;
3088     regexp *rex = (struct regexp *)SvANY(rex_sv);
3089     RXi_GET_DECL(rex,rexi);
3090     I32 oldsave;
3091     /* the current state. This is a cached copy of PL_regmatch_state */
3092     register regmatch_state *st;
3093     /* cache heavy used fields of st in registers */
3094     register regnode *scan;
3095     register regnode *next;
3096     register U32 n = 0; /* general value; init to avoid compiler warning */
3097     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3098     register char *locinput = PL_reginput;
3099     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3100
3101     bool result = 0;        /* return value of S_regmatch */
3102     int depth = 0;          /* depth of backtrack stack */
3103     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3104     const U32 max_nochange_depth =
3105         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3106         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3107     regmatch_state *yes_state = NULL; /* state to pop to on success of
3108                                                             subpattern */
3109     /* mark_state piggy backs on the yes_state logic so that when we unwind
3110        the stack on success we can update the mark_state as we go */
3111     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3112     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3113     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3114     U32 state_num;
3115     bool no_final = 0;      /* prevent failure from backtracking? */
3116     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3117     char *startpoint = PL_reginput;
3118     SV *popmark = NULL;     /* are we looking for a mark? */
3119     SV *sv_commit = NULL;   /* last mark name seen in failure */
3120     SV *sv_yes_mark = NULL; /* last mark name we have seen
3121                                during a successful match */
3122     U32 lastopen = 0;       /* last open we saw */
3123     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3124     SV* const oreplsv = GvSV(PL_replgv);
3125     /* these three flags are set by various ops to signal information to
3126      * the very next op. They have a useful lifetime of exactly one loop
3127      * iteration, and are not preserved or restored by state pushes/pops
3128      */
3129     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3130     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3131     int logical = 0;        /* the following EVAL is:
3132                                 0: (?{...})
3133                                 1: (?(?{...})X|Y)
3134                                 2: (??{...})
3135                                or the following IFMATCH/UNLESSM is:
3136                                 false: plain (?=foo)
3137                                 true:  used as a condition: (?(?=foo))
3138                             */
3139     PAD* last_pad = NULL;
3140     dMULTICALL;
3141     I32 gimme = G_SCALAR;
3142     CV *caller_cv = NULL;       /* who called us */
3143     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3144
3145 #ifdef DEBUGGING
3146     GET_RE_DEBUG_FLAGS_DECL;
3147 #endif
3148
3149     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3150     multicall_oldcatch = 0;
3151     multicall_cv = NULL;
3152     cx = NULL;
3153
3154
3155     PERL_ARGS_ASSERT_REGMATCH;
3156
3157     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3158             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3159     }));
3160     /* on first ever call to regmatch, allocate first slab */
3161     if (!PL_regmatch_slab) {
3162         Newx(PL_regmatch_slab, 1, regmatch_slab);
3163         PL_regmatch_slab->prev = NULL;
3164         PL_regmatch_slab->next = NULL;
3165         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3166     }
3167
3168     oldsave = PL_savestack_ix;
3169     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3170     SAVEVPTR(PL_regmatch_slab);
3171     SAVEVPTR(PL_regmatch_state);
3172
3173     /* grab next free state slot */
3174     st = ++PL_regmatch_state;
3175     if (st >  SLAB_LAST(PL_regmatch_slab))
3176         st = PL_regmatch_state = S_push_slab(aTHX);
3177
3178     /* Note that nextchr is a byte even in UTF */
3179     nextchr = UCHARAT(locinput);
3180     scan = prog;
3181     while (scan != NULL) {
3182
3183         DEBUG_EXECUTE_r( {
3184             SV * const prop = sv_newmortal();
3185             regnode *rnext=regnext(scan);
3186             DUMP_EXEC_POS( locinput, scan, utf8_target );
3187             regprop(rex, prop, scan);
3188
3189             PerlIO_printf(Perl_debug_log,
3190                     "%3"IVdf":%*s%s(%"IVdf")\n",
3191                     (IV)(scan - rexi->program), depth*2, "",
3192                     SvPVX_const(prop),
3193                     (PL_regkind[OP(scan)] == END || !rnext) ?
3194                         0 : (IV)(rnext - rexi->program));
3195         });
3196
3197         next = scan + NEXT_OFF(scan);
3198         if (next == scan)
3199             next = NULL;
3200         state_num = OP(scan);
3201
3202       reenter_switch:
3203
3204         switch (state_num) {
3205         case BOL:
3206             if (locinput == PL_bostr)
3207             {
3208                 /* reginfo->till = reginfo->bol; */
3209                 break;
3210             }
3211             sayNO;
3212         case MBOL:
3213             if (locinput == PL_bostr ||
3214                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3215             {
3216                 break;
3217             }
3218             sayNO;
3219         case SBOL:
3220             if (locinput == PL_bostr)
3221                 break;
3222             sayNO;
3223         case GPOS:
3224             if (locinput == reginfo->ganch)
3225                 break;
3226             sayNO;
3227
3228         case KEEPS:
3229             /* update the startpoint */
3230             st->u.keeper.val = rex->offs[0].start;
3231             PL_reginput = locinput;
3232             rex->offs[0].start = locinput - PL_bostr;
3233             PUSH_STATE_GOTO(KEEPS_next, next);
3234             /*NOT-REACHED*/
3235         case KEEPS_next_fail:
3236             /* rollback the start point change */
3237             rex->offs[0].start = st->u.keeper.val;
3238             sayNO_SILENT;
3239             /*NOT-REACHED*/
3240         case EOL:
3241                 goto seol;
3242         case MEOL:
3243             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3244                 sayNO;
3245             break;
3246         case SEOL:
3247           seol:
3248             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3249                 sayNO;
3250             if (PL_regeol - locinput > 1)
3251                 sayNO;
3252             break;
3253         case EOS:
3254             if (PL_regeol != locinput)
3255                 sayNO;
3256             break;
3257         case SANY:
3258             if (!nextchr && locinput >= PL_regeol)
3259                 sayNO;
3260             if (utf8_target) {
3261                 locinput += PL_utf8skip[nextchr];
3262                 if (locinput > PL_regeol)
3263                     sayNO;
3264                 nextchr = UCHARAT(locinput);
3265             }
3266             else
3267                 nextchr = UCHARAT(++locinput);
3268             break;
3269         case CANY:
3270             if (!nextchr && locinput >= PL_regeol)
3271                 sayNO;
3272             nextchr = UCHARAT(++locinput);
3273             break;
3274         case REG_ANY:
3275             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3276                 sayNO;
3277             if (utf8_target) {
3278                 locinput += PL_utf8skip[nextchr];
3279                 if (locinput > PL_regeol)
3280                     sayNO;
3281                 nextchr = UCHARAT(locinput);
3282             }
3283             else
3284                 nextchr = UCHARAT(++locinput);
3285             break;
3286
3287 #undef  ST
3288 #define ST st->u.trie
3289         case TRIEC:
3290             /* In this case the charclass data is available inline so
3291                we can fail fast without a lot of extra overhead.
3292              */
3293             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3294                 DEBUG_EXECUTE_r(
3295                     PerlIO_printf(Perl_debug_log,
3296                               "%*s  %sfailed to match trie start class...%s\n",
3297                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3298                 );
3299                 sayNO_SILENT;
3300                 /* NOTREACHED */
3301             }
3302             /* FALL THROUGH */
3303         case TRIE:
3304             /* the basic plan of execution of the trie is:
3305              * At the beginning, run though all the states, and
3306              * find the longest-matching word. Also remember the position
3307              * of the shortest matching word. For example, this pattern:
3308              *    1  2 3 4    5
3309              *    ab|a|x|abcd|abc
3310              * when matched against the string "abcde", will generate
3311              * accept states for all words except 3, with the longest
3312              * matching word being 4, and the shortest being 1 (with
3313              * the position being after char 1 of the string).
3314              *
3315              * Then for each matching word, in word order (i.e. 1,2,4,5),
3316              * we run the remainder of the pattern; on each try setting
3317              * the current position to the character following the word,
3318              * returning to try the next word on failure.
3319              *
3320              * We avoid having to build a list of words at runtime by
3321              * using a compile-time structure, wordinfo[].prev, which
3322              * gives, for each word, the previous accepting word (if any).
3323              * In the case above it would contain the mappings 1->2, 2->0,
3324              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3325              * the longest word (4 above), a list of all words, by
3326              * following the list of prev pointers; this gives us the
3327              * unordered list 4,5,1,2. Then given the current word we have
3328              * just tried, we can go through the list and find the
3329              * next-biggest word to try (so if we just failed on word 2,
3330              * the next in the list is 4).
3331              *
3332              * Since at runtime we don't record the matching position in
3333              * the string for each word, we have to work that out for
3334              * each word we're about to process. The wordinfo table holds
3335              * the character length of each word; given that we recorded
3336              * at the start: the position of the shortest word and its
3337              * length in chars, we just need to move the pointer the
3338              * difference between the two char lengths. Depending on
3339              * Unicode status and folding, that's cheap or expensive.
3340              *
3341              * This algorithm is optimised for the case where are only a
3342              * small number of accept states, i.e. 0,1, or maybe 2.
3343              * With lots of accepts states, and having to try all of them,
3344              * it becomes quadratic on number of accept states to find all
3345              * the next words.
3346              */
3347
3348             {
3349                 /* what type of TRIE am I? (utf8 makes this contextual) */
3350                 DECL_TRIE_TYPE(scan);
3351
3352                 /* what trie are we using right now */
3353                 reg_trie_data * const trie
3354                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3355                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3356                 U32 state = trie->startstate;
3357
3358                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3359                     if (trie->states[ state ].wordnum) {
3360                          DEBUG_EXECUTE_r(
3361                             PerlIO_printf(Perl_debug_log,
3362                                           "%*s  %smatched empty string...%s\n",
3363                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3364                         );
3365                         if (!trie->jump)
3366                             break;
3367                     } else {
3368                         DEBUG_EXECUTE_r(
3369                             PerlIO_printf(Perl_debug_log,
3370                                           "%*s  %sfailed to match trie start class...%s\n",
3371                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3372                         );
3373                         sayNO_SILENT;
3374                    }
3375                 }
3376
3377             {
3378                 U8 *uc = ( U8* )locinput;
3379
3380                 STRLEN len = 0;
3381                 STRLEN foldlen = 0;
3382                 U8 *uscan = (U8*)NULL;
3383                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3384                 U32 charcount = 0; /* how many input chars we have matched */
3385                 U32 accepted = 0; /* have we seen any accepting states? */
3386
3387                 ST.B = next;
3388                 ST.jump = trie->jump;
3389                 ST.me = scan;
3390                 ST.firstpos = NULL;
3391                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3392                 ST.nextword = 0;
3393
3394                 /* fully traverse the TRIE; note the position of the
3395                    shortest accept state and the wordnum of the longest
3396                    accept state */
3397
3398                 while ( state && uc <= (U8*)PL_regeol ) {
3399                     U32 base = trie->states[ state ].trans.base;
3400                     UV uvc = 0;
3401                     U16 charid = 0;
3402                     U16 wordnum;
3403                     wordnum = trie->states[ state ].wordnum;
3404
3405                     if (wordnum) { /* it's an accept state */
3406                         if (!accepted) {
3407                             accepted = 1;
3408                             /* record first match position */
3409                             if (ST.longfold) {
3410                                 ST.firstpos = (U8*)locinput;
3411                                 ST.firstchars = 0;
3412                             }
3413                             else {
3414                                 ST.firstpos = uc;
3415                                 ST.firstchars = charcount;
3416                             }
3417                         }
3418                         if (!ST.nextword || wordnum < ST.nextword)
3419                             ST.nextword = wordnum;
3420                         ST.topword = wordnum;
3421                     }
3422
3423                     DEBUG_TRIE_EXECUTE_r({
3424                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3425                                 PerlIO_printf( Perl_debug_log,
3426                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3427                                     2+depth * 2, "", PL_colors[4],
3428                                     (UV)state, (accepted ? 'Y' : 'N'));
3429                     });
3430
3431                     /* read a char and goto next state */
3432                     if ( base ) {
3433                         I32 offset;
3434                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3435                                              uscan, len, uvc, charid, foldlen,
3436                                              foldbuf, uniflags);
3437                         charcount++;
3438                         if (foldlen>0)
3439                             ST.longfold = TRUE;
3440                         if (charid &&
3441                              ( ((offset =
3442                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3443
3444                              && ((U32)offset < trie->lasttrans)
3445                              && trie->trans[offset].check == state)
3446                         {
3447                             state = trie->trans[offset].next;
3448                         }
3449                         else {
3450                             state = 0;
3451                         }
3452                         uc += len;
3453
3454                     }
3455                     else {
3456                         state = 0;
3457                     }
3458                     DEBUG_TRIE_EXECUTE_r(
3459                         PerlIO_printf( Perl_debug_log,
3460                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3461                             charid, uvc, (UV)state, PL_colors[5] );
3462                     );
3463                 }
3464                 if (!accepted)
3465                    sayNO;
3466
3467                 /* calculate total number of accept states */
3468                 {
3469                     U16 w = ST.topword;
3470                     accepted = 0;
3471                     while (w) {
3472                         w = trie->wordinfo[w].prev;
3473                         accepted++;
3474                     }
3475                     ST.accepted = accepted;
3476                 }
3477
3478                 DEBUG_EXECUTE_r(
3479                     PerlIO_printf( Perl_debug_log,
3480                         "%*s  %sgot %"IVdf" possible matches%s\n",
3481                         REPORT_CODE_OFF + depth * 2, "",
3482                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3483                 );
3484                 goto trie_first_try; /* jump into the fail handler */
3485             }}
3486             /* NOTREACHED */
3487
3488         case TRIE_next_fail: /* we failed - try next alternative */
3489             if ( ST.jump) {
3490                 REGCP_UNWIND(ST.cp);
3491                 for (n = rex->lastparen; n > ST.lastparen; n--)
3492                     rex->offs[n].end = -1;
3493                 rex->lastparen = n;
3494             }
3495             if (!--ST.accepted) {
3496                 DEBUG_EXECUTE_r({
3497                     PerlIO_printf( Perl_debug_log,
3498                         "%*s  %sTRIE failed...%s\n",
3499                         REPORT_CODE_OFF+depth*2, "",
3500                         PL_colors[4],
3501                         PL_colors[5] );
3502                 });
3503                 sayNO_SILENT;
3504             }
3505             {
3506                 /* Find next-highest word to process.  Note that this code
3507                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3508                 register U16 min = 0;
3509                 register U16 word;
3510                 register U16 const nextword = ST.nextword;
3511                 register reg_trie_wordinfo * const wordinfo
3512                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3513                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3514                     if (word > nextword && (!min || word < min))
3515                         min = word;
3516                 }
3517                 ST.nextword = min;
3518             }
3519
3520           trie_first_try:
3521             if (do_cutgroup) {
3522                 do_cutgroup = 0;
3523                 no_final = 0;
3524             }
3525
3526             if ( ST.jump) {
3527                 ST.lastparen = rex->lastparen;
3528                 REGCP_SET(ST.cp);
3529             }
3530
3531             /* find start char of end of current word */
3532             {
3533                 U32 chars; /* how many chars to skip */
3534                 U8 *uc = ST.firstpos;
3535                 reg_trie_data * const trie
3536                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3537
3538                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3539                             >=  ST.firstchars);
3540                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3541                             - ST.firstchars;
3542
3543                 if (ST.longfold) {
3544                     /* the hard option - fold each char in turn and find
3545                      * its folded length (which may be different */
3546                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3547                     STRLEN foldlen;
3548                     STRLEN len;
3549                     UV uvc;
3550                     U8 *uscan;
3551
3552                     while (chars) {
3553                         if (utf8_target) {
3554                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3555                                                     uniflags);
3556                             uc += len;
3557                         }
3558                         else {
3559                             uvc = *uc;
3560                             uc++;
3561                         }
3562                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3563                         uscan = foldbuf;
3564                         while (foldlen) {
3565                             if (!--chars)
3566                                 break;
3567                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3568                                             uniflags);
3569                             uscan += len;
3570                             foldlen -= len;
3571                         }
3572                     }
3573                 }
3574                 else {
3575                     if (utf8_target)
3576                         while (chars--)
3577                             uc += UTF8SKIP(uc);
3578                     else
3579                         uc += chars;
3580                 }
3581                 PL_reginput = (char *)uc;
3582             }
3583
3584             scan = (ST.jump && ST.jump[ST.nextword])
3585                         ? ST.me + ST.jump[ST.nextword]
3586                         : ST.B;
3587
3588             DEBUG_EXECUTE_r({
3589                 PerlIO_printf( Perl_debug_log,
3590                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3591                     REPORT_CODE_OFF+depth*2, "",
3592                     PL_colors[4],
3593                     ST.nextword,
3594                     PL_colors[5]
3595                     );
3596             });
3597
3598             if (ST.accepted > 1 || has_cutgroup) {
3599                 PUSH_STATE_GOTO(TRIE_next, scan);
3600                 /* NOTREACHED */
3601             }
3602             /* only one choice left - just continue */
3603             DEBUG_EXECUTE_r({
3604                 AV *const trie_words
3605                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3606                 SV ** const tmp = av_fetch( trie_words,
3607                     ST.nextword-1, 0 );
3608                 SV *sv= tmp ? sv_newmortal() : NULL;
3609
3610                 PerlIO_printf( Perl_debug_log,
3611                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3612                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3613                     ST.nextword,
3614                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3615                             PL_colors[0], PL_colors[1],
3616                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3617                         )
3618                     : "not compiled under -Dr",
3619                     PL_colors[5] );
3620             });
3621
3622             locinput = PL_reginput;
3623             nextchr = UCHARAT(locinput);
3624             continue; /* execute rest of RE */
3625             /* NOTREACHED */
3626 #undef  ST
3627
3628         case EXACT: {
3629             char *s = STRING(scan);
3630             ln = STR_LEN(scan);
3631             if (utf8_target != UTF_PATTERN) {
3632                 /* The target and the pattern have differing utf8ness. */
3633                 char *l = locinput;
3634                 const char * const e = s + ln;
3635
3636                 if (utf8_target) {
3637                     /* The target is utf8, the pattern is not utf8. */
3638                     while (s < e) {
3639                         STRLEN ulen;
3640                         if (l >= PL_regeol)
3641                              sayNO;
3642                         if (NATIVE_TO_UNI(*(U8*)s) !=
3643                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3644                                             uniflags))
3645                              sayNO;
3646                         l += ulen;
3647                         s ++;
3648                     }
3649                 }
3650                 else {
3651                     /* The target is not utf8, the pattern is utf8. */
3652                     while (s < e) {
3653                         STRLEN ulen;
3654                         if (l >= PL_regeol)
3655                             sayNO;
3656                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3657                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3658                                            uniflags))
3659                             sayNO;
3660                         s += ulen;
3661                         l ++;
3662                     }
3663                 }
3664                 locinput = l;
3665                 nextchr = UCHARAT(locinput);
3666                 break;
3667             }
3668             /* The target and the pattern have the same utf8ness. */
3669             /* Inline the first character, for speed. */
3670             if (UCHARAT(s) != nextchr)
3671                 sayNO;
3672             if (PL_regeol - locinput < ln)
3673                 sayNO;
3674             if (ln > 1 && memNE(s, locinput, ln))
3675                 sayNO;
3676             locinput += ln;
3677             nextchr = UCHARAT(locinput);
3678             break;
3679             }
3680         case EXACTFL: {
3681             re_fold_t folder;
3682             const U8 * fold_array;
3683             const char * s;
3684             U32 fold_utf8_flags;
3685
3686             PL_reg_flags |= RF_tainted;
3687             folder = foldEQ_locale;
3688             fold_array = PL_fold_locale;
3689             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3690             goto do_exactf;
3691
3692         case EXACTFU_SS:
3693         case EXACTFU_TRICKYFOLD:
3694         case EXACTFU:
3695             folder = foldEQ_latin1;
3696             fold_array = PL_fold_latin1;
3697             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3698             goto do_exactf;
3699
3700         case EXACTFA:
3701             folder = foldEQ_latin1;
3702             fold_array = PL_fold_latin1;
3703             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3704             goto do_exactf;
3705
3706         case EXACTF:
3707             folder = foldEQ;
3708             fold_array = PL_fold;
3709             fold_utf8_flags = 0;
3710
3711           do_exactf:
3712             s = STRING(scan);
3713             ln = STR_LEN(scan);
3714
3715             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3716               /* Either target or the pattern are utf8, or has the issue where
3717                * the fold lengths may differ. */
3718                 const char * const l = locinput;
3719                 char *e = PL_regeol;
3720
3721                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3722                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3723                 {
3724                     sayNO;
3725                 }
3726                 locinput = e;
3727                 nextchr = UCHARAT(locinput);
3728                 break;
3729             }
3730
3731             /* Neither the target nor the pattern are utf8 */
3732             if (UCHARAT(s) != nextchr &&
3733                 UCHARAT(s) != fold_array[nextchr])
3734             {
3735                 sayNO;
3736             }
3737             if (PL_regeol - locinput < ln)
3738                 sayNO;
3739             if (ln > 1 && ! folder(s, locinput, ln))
3740                 sayNO;
3741             locinput += ln;
3742             nextchr = UCHARAT(locinput);
3743             break;
3744         }
3745
3746         /* XXX Could improve efficiency by separating these all out using a
3747          * macro or in-line function.  At that point regcomp.c would no longer
3748          * have to set the FLAGS fields of these */
3749         case BOUNDL:
3750         case NBOUNDL:
3751             PL_reg_flags |= RF_tainted;
3752             /* FALL THROUGH */
3753         case BOUND:
3754         case BOUNDU:
3755         case BOUNDA:
3756         case NBOUND:
3757         case NBOUNDU:
3758         case NBOUNDA:
3759             /* was last char in word? */
3760             if (utf8_target
3761                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3762                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3763             {
3764                 if (locinput == PL_bostr)
3765                     ln = '\n';
3766                 else {
3767                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3768
3769                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3770                 }
3771                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3772                     ln = isALNUM_uni(ln);
3773                     LOAD_UTF8_CHARCLASS_ALNUM();
3774                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3775                 }
3776                 else {
3777                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3778                     n = isALNUM_LC_utf8((U8*)locinput);
3779                 }
3780             }
3781             else {
3782
3783                 /* Here the string isn't utf8, or is utf8 and only ascii
3784                  * characters are to match \w.  In the latter case looking at
3785                  * the byte just prior to the current one may be just the final
3786                  * byte of a multi-byte character.  This is ok.  There are two
3787                  * cases:
3788                  * 1) it is a single byte character, and then the test is doing
3789                  *      just what it's supposed to.
3790                  * 2) it is a multi-byte character, in which case the final
3791                  *      byte is never mistakable for ASCII, and so the test
3792                  *      will say it is not a word character, which is the
3793                  *      correct answer. */
3794                 ln = (locinput != PL_bostr) ?
3795                     UCHARAT(locinput - 1) : '\n';
3796                 switch (FLAGS(scan)) {
3797                     case REGEX_UNICODE_CHARSET:
3798                         ln = isWORDCHAR_L1(ln);
3799                         n = isWORDCHAR_L1(nextchr);
3800                         break;
3801                     case REGEX_LOCALE_CHARSET:
3802                         ln = isALNUM_LC(ln);
3803                         n = isALNUM_LC(nextchr);
3804                         break;
3805                     case REGEX_DEPENDS_CHARSET:
3806                         ln = isALNUM(ln);
3807                         n = isALNUM(nextchr);
3808                         break;
3809                     case REGEX_ASCII_RESTRICTED_CHARSET:
3810                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3811                         ln = isWORDCHAR_A(ln);
3812                         n = isWORDCHAR_A(nextchr);
3813                         break;
3814                     default:
3815                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3816                         break;
3817                 }
3818             }
3819             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3820              * regcomp.sym */
3821             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3822                     sayNO;
3823             break;
3824         case ANYOFV:
3825         case ANYOF:
3826             if (utf8_target || state_num == ANYOFV) {
3827                 STRLEN inclasslen = PL_regeol - locinput;
3828                 if (locinput >= PL_regeol)
3829                     sayNO;
3830
3831                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3832                     sayNO;
3833                 locinput += inclasslen;
3834                 nextchr = UCHARAT(locinput);
3835                 break;
3836             }
3837             else {
3838                 if (nextchr < 0)
3839                     nextchr = UCHARAT(locinput);
3840                 if (!nextchr && locinput >= PL_regeol)
3841                     sayNO;
3842                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3843                     sayNO;
3844                 nextchr = UCHARAT(++locinput);
3845                 break;
3846             }
3847             break;
3848         /* Special char classes - The defines start on line 129 or so */
3849         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3850                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3851                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3852                   ALNUMA, NALNUMA, isWORDCHAR_A,
3853                   alnum, "a");
3854
3855         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3856                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3857                   SPACEU, NSPACEU, isSPACE_L1,
3858                   SPACEA, NSPACEA, isSPACE_A,
3859                   space, " ");
3860
3861         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3862                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3863                 DIGITA, NDIGITA, isDIGIT_A,
3864                 digit, "0");
3865
3866         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3867                        a Unicode extended Grapheme Cluster */
3868             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3869               extended Grapheme Cluster is:
3870
3871                CR LF
3872                | Prepend* Begin Extend*
3873                | .
3874
3875                Begin is (Hangul-syllable | ! Control)
3876                Extend is (Grapheme_Extend | Spacing_Mark)
3877                Control is [ GCB_Control CR LF ]
3878
3879                The discussion below shows how the code for CLUMP is derived
3880                from this regex.  Note that most of these concepts are from
3881                property values of the Grapheme Cluster Boundary (GCB) property.
3882                No code point can have multiple property values for a given
3883                property.  Thus a code point in Prepend can't be in Control, but
3884                it must be in !Control.  This is why Control above includes
3885                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3886                property separately, and so can't be in GCB_Control, even though
3887                they logically are controls.  Control is not the same as gc=cc,
3888                but includes format and other characters as well.
3889
3890                The Unicode definition of Hangul-syllable is:
3891                    L+
3892                    | (L* ( ( V | LV ) V* | LVT ) T*)
3893                    | T+
3894                   )
3895                Each of these is a value for the GCB property, and hence must be
3896                disjoint, so the order they are tested is immaterial, so the
3897                above can safely be changed to
3898                    T+
3899                    | L+
3900                    | (L* ( LVT | ( V | LV ) V*) T*)
3901
3902                The last two terms can be combined like this:
3903                    L* ( L
3904                         | (( LVT | ( V | LV ) V*) T*))
3905
3906                And refactored into this:
3907                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3908
3909                That means that if we have seen any L's at all we can quit
3910                there, but if the next character is an LVT, a V, or an LV we
3911                should keep going.
3912
3913                There is a subtlety with Prepend* which showed up in testing.
3914                Note that the Begin, and only the Begin is required in:
3915                 | Prepend* Begin Extend*
3916                Also, Begin contains '! Control'.  A Prepend must be a
3917                '!  Control', which means it must also be a Begin.  What it
3918                comes down to is that if we match Prepend* and then find no
3919                suitable Begin afterwards, that if we backtrack the last
3920                Prepend, that one will be a suitable Begin.
3921             */
3922
3923             if (locinput >= PL_regeol)
3924                 sayNO;
3925             if  (! utf8_target) {
3926
3927                 /* Match either CR LF  or '.', as all the other possibilities
3928                  * require utf8 */
3929                 locinput++;         /* Match the . or CR */
3930                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3931                                        match the LF */
3932                     && locinput < PL_regeol
3933                     && UCHARAT(locinput) == '\n') locinput++;
3934             }
3935             else {
3936
3937                 /* Utf8: See if is ( CR LF ); already know that locinput <
3938                  * PL_regeol, so locinput+1 is in bounds */
3939                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3940                     locinput += 2;
3941                 }
3942                 else {
3943                     /* In case have to backtrack to beginning, then match '.' */
3944                     char *starting = locinput;
3945
3946                     /* In case have to backtrack the last prepend */
3947                     char *previous_prepend = 0;
3948
3949                     LOAD_UTF8_CHARCLASS_GCB();
3950
3951                     /* Match (prepend)* */
3952                     while (locinput < PL_regeol
3953                            && swash_fetch(PL_utf8_X_prepend,
3954                                           (U8*)locinput, utf8_target))
3955                     {
3956                         previous_prepend = locinput;
3957                         locinput += UTF8SKIP(locinput);
3958                     }
3959
3960                     /* As noted above, if we matched a prepend character, but
3961                      * the next thing won't match, back off the last prepend we
3962                      * matched, as it is guaranteed to match the begin */
3963                     if (previous_prepend
3964                         && (locinput >=  PL_regeol
3965                             || ! swash_fetch(PL_utf8_X_begin,
3966                                              (U8*)locinput, utf8_target)))
3967                     {
3968                         locinput = previous_prepend;
3969                     }
3970
3971                     /* Note that here we know PL_regeol > locinput, as we
3972                      * tested that upon input to this switch case, and if we
3973                      * moved locinput forward, we tested the result just above
3974                      * and it either passed, or we backed off so that it will
3975                      * now pass */
3976                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3977
3978                         /* Here did not match the required 'Begin' in the
3979                          * second term.  So just match the very first
3980                          * character, the '.' of the final term of the regex */
3981                         locinput = starting + UTF8SKIP(starting);
3982                     } else {
3983
3984                         /* Here is the beginning of a character that can have
3985                          * an extender.  It is either a hangul syllable, or a
3986                          * non-control */
3987                         if (swash_fetch(PL_utf8_X_non_hangul,
3988                                         (U8*)locinput, utf8_target))
3989                         {
3990
3991                             /* Here not a Hangul syllable, must be a
3992                              * ('!  * Control') */
3993                             locinput += UTF8SKIP(locinput);
3994                         } else {
3995
3996                             /* Here is a Hangul syllable.  It can be composed
3997                              * of several individual characters.  One
3998                              * possibility is T+ */
3999                             if (swash_fetch(PL_utf8_X_T,
4000                                             (U8*)locinput, utf8_target))
4001                             {
4002                                 while (locinput < PL_regeol
4003                                         && swash_fetch(PL_utf8_X_T,
4004                                                         (U8*)locinput, utf8_target))
4005                                 {
4006                                     locinput += UTF8SKIP(locinput);
4007                                 }
4008                             } else {
4009
4010                                 /* Here, not T+, but is a Hangul.  That means
4011                                  * it is one of the others: L, LV, LVT or V,
4012                                  * and matches:
4013                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4014
4015                                 /* Match L*           */
4016                                 while (locinput < PL_regeol
4017                                         && swash_fetch(PL_utf8_X_L,
4018                                                         (U8*)locinput, utf8_target))
4019                                 {
4020                                     locinput += UTF8SKIP(locinput);
4021                                 }
4022
4023                                 /* Here, have exhausted L*.  If the next
4024                                  * character is not an LV, LVT nor V, it means
4025                                  * we had to have at least one L, so matches L+
4026                                  * in the original equation, we have a complete
4027                                  * hangul syllable.  Are done. */
4028
4029                                 if (locinput < PL_regeol
4030                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4031                                                     (U8*)locinput, utf8_target))
4032                                 {
4033
4034                                     /* Otherwise keep going.  Must be LV, LVT
4035                                      * or V.  See if LVT */
4036                                     if (swash_fetch(PL_utf8_X_LVT,
4037                                                     (U8*)locinput, utf8_target))
4038                                     {
4039                                         locinput += UTF8SKIP(locinput);
4040                                     } else {
4041
4042                                         /* Must be  V or LV.  Take it, then
4043                                          * match V*     */
4044                                         locinput += UTF8SKIP(locinput);
4045                                         while (locinput < PL_regeol
4046                                                 && swash_fetch(PL_utf8_X_V,
4047                                                          (U8*)locinput, utf8_target))
4048                                         {
4049                                             locinput += UTF8SKIP(locinput);
4050                                         }
4051                                     }
4052
4053                                     /* And any of LV, LVT, or V can be followed
4054                                      * by T*            */
4055                                     while (locinput < PL_regeol
4056                                            && swash_fetch(PL_utf8_X_T,
4057                                                            (U8*)locinput,
4058                                                            utf8_target))
4059                                     {
4060                                         locinput += UTF8SKIP(locinput);
4061                                     }
4062                                 }
4063                             }
4064                         }
4065
4066                         /* Match any extender */
4067                         while (locinput < PL_regeol
4068                                 && swash_fetch(PL_utf8_X_extend,
4069                                                 (U8*)locinput, utf8_target))
4070                         {
4071                             locinput += UTF8SKIP(locinput);
4072                         }
4073                     }
4074                 }
4075                 if (locinput > PL_regeol) sayNO;
4076             }
4077             nextchr = UCHARAT(locinput);
4078             break;
4079
4080         case NREFFL:
4081         {   /* The capture buffer cases.  The ones beginning with N for the
4082                named buffers just convert to the equivalent numbered and
4083                pretend they were called as the corresponding numbered buffer
4084                op.  */
4085             /* don't initialize these in the declaration, it makes C++
4086                unhappy */
4087             char *s;
4088             char type;
4089             re_fold_t folder;
4090             const U8 *fold_array;
4091             UV utf8_fold_flags;
4092
4093             PL_reg_flags |= RF_tainted;
4094             folder = foldEQ_locale;
4095             fold_array = PL_fold_locale;
4096             type = REFFL;
4097             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4098             goto do_nref;
4099
4100         case NREFFA:
4101             folder = foldEQ_latin1;
4102             fold_array = PL_fold_latin1;
4103             type = REFFA;
4104             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4105             goto do_nref;
4106
4107         case NREFFU:
4108             folder = foldEQ_latin1;
4109             fold_array = PL_fold_latin1;
4110             type = REFFU;
4111             utf8_fold_flags = 0;
4112             goto do_nref;
4113
4114         case NREFF:
4115             folder = foldEQ;
4116             fold_array = PL_fold;
4117             type = REFF;
4118             utf8_fold_flags = 0;
4119             goto do_nref;
4120
4121         case NREF:
4122             type = REF;
4123             folder = NULL;
4124             fold_array = NULL;
4125             utf8_fold_flags = 0;
4126           do_nref:
4127
4128             /* For the named back references, find the corresponding buffer
4129              * number */
4130             n = reg_check_named_buff_matched(rex,scan);
4131
4132             if ( ! n ) {
4133                 sayNO;
4134             }
4135             goto do_nref_ref_common;
4136
4137         case REFFL:
4138             PL_reg_flags |= RF_tainted;
4139             folder = foldEQ_locale;
4140             fold_array = PL_fold_locale;
4141             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4142             goto do_ref;
4143
4144         case REFFA:
4145             folder = foldEQ_latin1;
4146             fold_array = PL_fold_latin1;
4147             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4148             goto do_ref;
4149
4150         case REFFU:
4151             folder = foldEQ_latin1;
4152             fold_array = PL_fold_latin1;
4153             utf8_fold_flags = 0;
4154             goto do_ref;
4155
4156         case REFF:
4157             folder = foldEQ;
4158             fold_array = PL_fold;
4159             utf8_fold_flags = 0;
4160             goto do_ref;
4161
4162         case REF:
4163             folder = NULL;
4164             fold_array = NULL;
4165             utf8_fold_flags = 0;
4166
4167           do_ref:
4168             type = OP(scan);
4169             n = ARG(scan);  /* which paren pair */
4170
4171           do_nref_ref_common:
4172             ln = rex->offs[n].start;
4173             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4174             if (rex->lastparen < n || ln == -1)
4175                 sayNO;                  /* Do not match unless seen CLOSEn. */
4176             if (ln == rex->offs[n].end)
4177                 break;
4178
4179             s = PL_bostr + ln;
4180             if (type != REF     /* REF can do byte comparison */
4181                 && (utf8_target || type == REFFU))
4182             { /* XXX handle REFFL better */
4183                 char * limit = PL_regeol;
4184
4185                 /* This call case insensitively compares the entire buffer
4186                     * at s, with the current input starting at locinput, but
4187                     * not going off the end given by PL_regeol, and returns in
4188                     * limit upon success, how much of the current input was
4189                     * matched */
4190                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4191                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4192                 {
4193                     sayNO;
4194                 }
4195                 locinput = limit;
4196                 nextchr = UCHARAT(locinput);
4197                 break;
4198             }
4199
4200             /* Not utf8:  Inline the first character, for speed. */
4201             if (UCHARAT(s) != nextchr &&
4202                 (type == REF ||
4203                  UCHARAT(s) != fold_array[nextchr]))
4204                 sayNO;
4205             ln = rex->offs[n].end - ln;
4206             if (locinput + ln > PL_regeol)
4207                 sayNO;
4208             if (ln > 1 && (type == REF
4209                            ? memNE(s, locinput, ln)
4210                            : ! folder(s, locinput, ln)))
4211                 sayNO;
4212             locinput += ln;
4213             nextchr = UCHARAT(locinput);
4214             break;
4215         }
4216         case NOTHING:
4217         case TAIL:
4218             break;
4219         case BACK:
4220             break;
4221
4222 #undef  ST
4223 #define ST st->u.eval
4224         {
4225             SV *ret;
4226             REGEXP *re_sv;
4227             regexp *re;
4228             regexp_internal *rei;
4229             regnode *startpoint;
4230
4231         case GOSTART:
4232         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4233             if (cur_eval && cur_eval->locinput==locinput) {
4234                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4235                     Perl_croak(aTHX_ "Infinite recursion in regex");
4236                 if ( ++nochange_depth > max_nochange_depth )
4237                     Perl_croak(aTHX_
4238                         "Pattern subroutine nesting without pos change"
4239                         " exceeded limit in regex");
4240             } else {
4241                 nochange_depth = 0;
4242             }
4243             re_sv = rex_sv;
4244             re = rex;
4245             rei = rexi;
4246             (void)ReREFCNT_inc(rex_sv);
4247             if (OP(scan)==GOSUB) {
4248                 startpoint = scan + ARG2L(scan);
4249                 ST.close_paren = ARG(scan);
4250             } else {
4251                 startpoint = rei->program+1;
4252                 ST.close_paren = 0;
4253             }
4254             goto eval_recurse_doit;
4255             /* NOTREACHED */
4256         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4257             if (cur_eval && cur_eval->locinput==locinput) {
4258                 if ( ++nochange_depth > max_nochange_depth )
4259                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4260             } else {
4261                 nochange_depth = 0;
4262             }
4263             {
4264                 /* execute the code in the {...} */
4265
4266                 dSP;
4267                 SV ** before;
4268                 OP * const oop = PL_op;
4269                 COP * const ocurcop = PL_curcop;
4270                 OP *nop;
4271                 char *saved_regeol = PL_regeol;
4272                 struct re_save_state saved_state;
4273                 CV *newcv;
4274
4275                 /* To not corrupt the existing regex state while executing the
4276                  * eval we would normally put it on the save stack, like with
4277                  * save_re_context. However, re-evals have a weird scoping so we
4278                  * can't just add ENTER/LEAVE here. With that, things like
4279                  *
4280                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4281                  *
4282                  * would break, as they expect the localisation to be unwound
4283                  * only when the re-engine backtracks through the bit that
4284                  * localised it.
4285                  *
4286                  * What we do instead is just saving the state in a local c
4287                  * variable.
4288                  */
4289                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4290
4291                 PL_reg_state.re_reparsing = FALSE;
4292
4293                 if (!caller_cv)
4294                     caller_cv = find_runcv(NULL);
4295
4296                 n = ARG(scan);
4297
4298                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4299                     newcv = ((struct regexp *)SvANY(
4300                                                 (REGEXP*)(rexi->data->data[n])
4301                                             ))->qr_anoncv
4302                                         ;
4303                     nop = (OP*)rexi->data->data[n+1];
4304                 }
4305                 else if (rexi->data->what[n] == 'l') { /* literal code */
4306                     newcv = caller_cv;
4307                     nop = (OP*)rexi->data->data[n];
4308                     assert(CvDEPTH(newcv));
4309                 }
4310                 else {
4311                     /* literal with own CV */
4312                     assert(rexi->data->what[n] == 'L');
4313                     newcv = rex->qr_anoncv;
4314                     nop = (OP*)rexi->data->data[n];
4315                 }
4316
4317                 /* the initial nextstate you would normally execute
4318                  * at the start of an eval (which would cause error
4319                  * messages to come from the eval), may be optimised
4320                  * away from the execution path in the regex code blocks;
4321                  * so manually set PL_curcop to it initially */
4322                 {
4323                     OP *o = cUNOPx(nop)->op_first;
4324                     assert(o->op_type == OP_NULL);
4325                     if (o->op_targ == OP_SCOPE) {
4326                         o = cUNOPo->op_first;
4327                     }
4328                     else {
4329                         assert(o->op_targ == OP_LEAVE);
4330                         o = cUNOPo->op_first;
4331                         assert(o->op_type == OP_ENTER);
4332                         o = o->op_sibling;
4333                     }
4334
4335                     if (o->op_type != OP_STUB) {
4336                         assert(    o->op_type == OP_NEXTSTATE
4337                                 || o->op_type == OP_DBSTATE
4338                                 || (o->op_type == OP_NULL
4339                                     &&  (  o->op_targ == OP_NEXTSTATE
4340                                         || o->op_targ == OP_DBSTATE
4341                                         )
4342                                     )
4343                         );
4344                         PL_curcop = (COP*)o;
4345                     }
4346                 }
4347                 nop = nop->op_next;
4348
4349                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4350                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4351
4352                 /* normally if we're about to execute code from the same
4353                  * CV that we used previously, we just use the existing
4354                  * CX stack entry. However, its possible that in the
4355                  * meantime we may have backtracked, popped from the save
4356                  * stack, and undone the SAVECOMPPAD(s) associated with
4357                  * PUSH_MULTICALL; in which case PL_comppad no longer
4358                  * points to newcv's pad. */
4359                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4360                 {
4361                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4362                     if (last_pushed_cv) {
4363                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4364                     }
4365                     else {
4366                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4367                     }
4368                     last_pushed_cv = newcv;
4369                 }
4370                 last_pad = PL_comppad;
4371
4372                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4373
4374                 if (sv_yes_mark) {
4375                     SV *sv_mrk = get_sv("REGMARK", 1);
4376                     sv_setsv(sv_mrk, sv_yes_mark);
4377                 }
4378
4379                 /* we don't use MULTICALL here as we want to call the
4380                  * first op of the block of interest, rather than the
4381                  * first op of the sub */
4382                 before = SP;
4383                 PL_op = nop;
4384                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4385                 SPAGAIN;
4386                 if (SP == before)
4387                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4388                 else {
4389                     ret = POPs;
4390                     PUTBACK;
4391                 }
4392
4393                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4394
4395                 /* *** Note that at this point we don't restore
4396                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4397                  * be used again soon. This is safe as long as nothing
4398                  * in the regexp code uses the pad ! */
4399                 PL_op = oop;
4400                 PL_curcop = ocurcop;
4401                 PL_regeol = saved_regeol;
4402                 if (!logical) {
4403                     /* /(?{...})/ */
4404                     sv_setsv(save_scalar(PL_replgv), ret);
4405                     break;
4406                 }
4407             }
4408             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4409                 logical = 0;
4410                 {
4411                     /* extract RE object from returned value; compiling if
4412                      * necessary */
4413                     MAGIC *mg = NULL;
4414                     REGEXP *rx = NULL;
4415
4416                     if (SvROK(ret)) {
4417                         SV *const sv = SvRV(ret);
4418
4419                         if (SvTYPE(sv) == SVt_REGEXP) {
4420                             rx = (REGEXP*) sv;
4421                         } else if (SvSMAGICAL(sv)) {
4422                             mg = mg_find(sv, PERL_MAGIC_qr);
4423                             assert(mg);
4424                         }
4425                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4426                         rx = (REGEXP*) ret;
4427                     } else if (SvSMAGICAL(ret)) {
4428                         if (SvGMAGICAL(ret)) {
4429                             /* I don't believe that there is ever qr magic
4430                                here.  */
4431                             assert(!mg_find(ret, PERL_MAGIC_qr));
4432                             sv_unmagic(ret, PERL_MAGIC_qr);
4433                         }
4434                         else {
4435                             mg = mg_find(ret, PERL_MAGIC_qr);
4436                             /* testing suggests mg only ends up non-NULL for
4437                                scalars who were upgraded and compiled in the
4438                                else block below. In turn, this is only
4439                                triggered in the "postponed utf8 string" tests
4440                                in t/op/pat.t  */
4441                         }
4442                     }
4443
4444                     if (mg) {
4445                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4446                         assert(rx);
4447                     }
4448                     if (rx) {
4449                         rx = reg_temp_copy(NULL, rx);
4450                     }
4451                     else {
4452                         U32 pm_flags = 0;
4453                         const I32 osize = PL_regsize;
4454
4455                         if (DO_UTF8(ret)) {
4456                             assert (SvUTF8(ret));
4457                         } else if (SvUTF8(ret)) {
4458                             /* Not doing UTF-8, despite what the SV says. Is
4459                                this only if we're trapped in use 'bytes'?  */
4460                             /* Make a copy of the octet sequence, but without
4461                                the flag on, as the compiler now honours the
4462                                SvUTF8 flag on ret.  */
4463                             STRLEN len;
4464                             const char *const p = SvPV(ret, len);
4465                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4466                         }
4467                         rx = CALLREGCOMP(ret, pm_flags);
4468                         if (!(SvFLAGS(ret)
4469                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4470                                  | SVs_GMG))) {
4471                             /* This isn't a first class regexp. Instead, it's
4472                                caching a regexp onto an existing, Perl visible
4473                                scalar.  */
4474                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4475                         }
4476                         PL_regsize = osize;
4477                     }
4478                     re_sv = rx;
4479                     re = (struct regexp *)SvANY(rx);
4480                 }
4481                 RXp_MATCH_COPIED_off(re);
4482                 re->subbeg = rex->subbeg;
4483                 re->sublen = rex->sublen;
4484                 rei = RXi_GET(re);
4485                 DEBUG_EXECUTE_r(
4486                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4487                         "Matching embedded");
4488                 );
4489                 startpoint = rei->program + 1;
4490                 ST.close_paren = 0; /* only used for GOSUB */
4491
4492         eval_recurse_doit: /* Share code with GOSUB below this line */
4493                 /* run the pattern returned from (??{...}) */
4494                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4495                 REGCP_SET(ST.lastcp);
4496
4497                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4498                 re->lastparen = 0;
4499                 re->lastcloseparen = 0;
4500
4501                 PL_reginput = locinput;
4502                 PL_regsize = 0;
4503
4504                 /* XXXX This is too dramatic a measure... */
4505                 PL_reg_maxiter = 0;
4506
4507                 ST.toggle_reg_flags = PL_reg_flags;
4508                 if (RX_UTF8(re_sv))
4509                     PL_reg_flags |= RF_utf8;
4510                 else
4511                     PL_reg_flags &= ~RF_utf8;
4512                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4513
4514                 ST.prev_rex = rex_sv;
4515                 ST.prev_curlyx = cur_curlyx;
4516                 SETREX(rex_sv,re_sv);
4517                 rex = re;
4518                 rexi = rei;
4519                 cur_curlyx = NULL;
4520                 ST.B = next;
4521                 ST.prev_eval = cur_eval;
4522                 cur_eval = st;
4523                 /* now continue from first node in postoned RE */
4524                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4525                 /* NOTREACHED */
4526             }
4527             /* logical is 1,   /(?(?{...})X|Y)/ */
4528             sw = cBOOL(SvTRUE(ret));
4529             logical = 0;
4530             break;
4531         }
4532
4533         case EVAL_AB: /* cleanup after a successful (??{A})B */
4534             /* note: this is called twice; first after popping B, then A */
4535             PL_reg_flags ^= ST.toggle_reg_flags;
4536             ReREFCNT_dec(rex_sv);
4537             SETREX(rex_sv,ST.prev_rex);
4538             rex = (struct regexp *)SvANY(rex_sv);
4539             rexi = RXi_GET(rex);
4540             regcpblow(ST.cp);
4541             cur_eval = ST.prev_eval;
4542             cur_curlyx = ST.prev_curlyx;
4543
4544             /* XXXX This is too dramatic a measure... */
4545             PL_reg_maxiter = 0;
4546             if ( nochange_depth )
4547                 nochange_depth--;
4548             sayYES;
4549
4550
4551         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4552             /* note: this is called twice; first after popping B, then A */
4553             PL_reg_flags ^= ST.toggle_reg_flags;
4554             ReREFCNT_dec(rex_sv);
4555             SETREX(rex_sv,ST.prev_rex);
4556             rex = (struct regexp *)SvANY(rex_sv);
4557             rexi = RXi_GET(rex);
4558
4559             PL_reginput = locinput;
4560             REGCP_UNWIND(ST.lastcp);
4561             regcppop(rex);
4562             cur_eval = ST.prev_eval;
4563             cur_curlyx = ST.prev_curlyx;
4564             /* XXXX This is too dramatic a measure... */
4565             PL_reg_maxiter = 0;
4566             if ( nochange_depth )
4567                 nochange_depth--;
4568             sayNO_SILENT;
4569 #undef ST
4570
4571         case OPEN:
4572             n = ARG(scan);  /* which paren pair */
4573             rex->offs[n].start_tmp = locinput - PL_bostr;
4574             if (n > PL_regsize)
4575                 PL_regsize = n;
4576             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4577                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4578                 PTR2UV(rex),
4579                 PTR2UV(rex->offs),
4580                 (UV)n,
4581                 (IV)rex->offs[n].start_tmp,
4582                 (UV)PL_regsize
4583             ));
4584             lastopen = n;
4585             break;
4586
4587 /* XXX really need to log other places start/end are set too */
4588 #define CLOSE_CAPTURE \
4589     rex->offs[n].start = rex->offs[n].start_tmp; \
4590     rex->offs[n].end = locinput - PL_bostr; \
4591     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4592         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4593         PTR2UV(rex), \
4594         PTR2UV(rex->offs), \
4595         (UV)n, \
4596         (IV)rex->offs[n].start, \
4597         (IV)rex->offs[n].end \
4598     ))
4599
4600         case CLOSE:
4601             n = ARG(scan);  /* which paren pair */
4602             CLOSE_CAPTURE;
4603             /*if (n > PL_regsize)
4604                 PL_regsize = n;*/
4605             if (n > rex->lastparen)
4606                 rex->lastparen = n;
4607             rex->lastcloseparen = n;
4608             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4609                 goto fake_end;
4610             }
4611             break;
4612         case ACCEPT:
4613             if (ARG(scan)){
4614                 regnode *cursor;
4615                 for (cursor=scan;
4616                      cursor && OP(cursor)!=END;
4617                      cursor=regnext(cursor))
4618                 {
4619                     if ( OP(cursor)==CLOSE ){
4620                         n = ARG(cursor);
4621                         if ( n <= lastopen ) {
4622                             CLOSE_CAPTURE;
4623                             /*if (n > PL_regsize)
4624                             PL_regsize = n;*/
4625                             if (n > rex->lastparen)
4626                                 rex->lastparen = n;
4627                             rex->lastcloseparen = n;
4628                             if ( n == ARG(scan) || (cur_eval &&
4629                                 cur_eval->u.eval.close_paren == n))
4630                                 break;
4631                         }
4632                     }
4633                 }
4634             }
4635             goto fake_end;
4636             /*NOTREACHED*/
4637         case GROUPP:
4638             n = ARG(scan);  /* which paren pair */
4639             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4640             break;
4641         case NGROUPP:
4642             /* reg_check_named_buff_matched returns 0 for no match */
4643             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4644             break;
4645         case INSUBP:
4646             n = ARG(scan);
4647             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4648             break;
4649         case DEFINEP:
4650             sw = 0;
4651             break;
4652         case IFTHEN:
4653             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4654             if (sw)
4655                 next = NEXTOPER(NEXTOPER(scan));
4656             else {
4657                 next = scan + ARG(scan);
4658                 if (OP(next) == IFTHEN) /* Fake one. */
4659                     next = NEXTOPER(NEXTOPER(next));
4660             }
4661             break;
4662         case LOGICAL:
4663             logical = scan->flags;
4664             break;
4665
4666 /*******************************************************************
4667
4668 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4669 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4670 STAR/PLUS/CURLY/CURLYN are used instead.)
4671
4672 A*B is compiled as <CURLYX><A><WHILEM><B>
4673
4674 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4675 state, which contains the current count, initialised to -1. It also sets
4676 cur_curlyx to point to this state, with any previous value saved in the
4677 state block.
4678
4679 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4680 since the pattern may possibly match zero times (i.e. it's a while {} loop
4681 rather than a do {} while loop).
4682
4683 Each entry to WHILEM represents a successful match of A. The count in the
4684 CURLYX block is incremented, another WHILEM state is pushed, and execution
4685 passes to A or B depending on greediness and the current count.
4686
4687 For example, if matching against the string a1a2a3b (where the aN are
4688 substrings that match /A/), then the match progresses as follows: (the
4689 pushed states are interspersed with the bits of strings matched so far):
4690
4691     <CURLYX cnt=-1>
4692     <CURLYX cnt=0><WHILEM>
4693     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4694     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4695     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4696     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4697
4698 (Contrast this with something like CURLYM, which maintains only a single
4699 backtrack state:
4700
4701     <CURLYM cnt=0> a1
4702     a1 <CURLYM cnt=1> a2
4703     a1 a2 <CURLYM cnt=2> a3
4704     a1 a2 a3 <CURLYM cnt=3> b
4705 )
4706
4707 Each WHILEM state block marks a point to backtrack to upon partial failure
4708 of A or B, and also contains some minor state data related to that
4709 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4710 overall state, such as the count, and pointers to the A and B ops.
4711
4712 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4713 must always point to the *current* CURLYX block, the rules are:
4714
4715 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4716 and set cur_curlyx to point the new block.
4717
4718 When popping the CURLYX block after a successful or unsuccessful match,
4719 restore the previous cur_curlyx.
4720
4721 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4722 to the outer one saved in the CURLYX block.
4723
4724 When popping the WHILEM block after a successful or unsuccessful B match,
4725 restore the previous cur_curlyx.
4726
4727 Here's an example for the pattern (AI* BI)*BO
4728 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4729
4730 cur_
4731 curlyx backtrack stack
4732 ------ ---------------
4733 NULL
4734 CO     <CO prev=NULL> <WO>
4735 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4736 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4737 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4738
4739 At this point the pattern succeeds, and we work back down the stack to
4740 clean up, restoring as we go:
4741
4742 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4743 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4744 CO     <CO prev=NULL> <WO>
4745 NULL
4746
4747 *******************************************************************/
4748
4749 #define ST st->u.curlyx
4750
4751         case CURLYX:    /* start of /A*B/  (for complex A) */
4752         {
4753             /* No need to save/restore up to this paren */
4754             I32 parenfloor = scan->flags;
4755
4756             assert(next); /* keep Coverity happy */
4757             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4758                 next += ARG(next);
4759
4760             /* XXXX Probably it is better to teach regpush to support
4761                parenfloor > PL_regsize... */
4762             if (parenfloor > (I32)rex->lastparen)
4763                 parenfloor = rex->lastparen; /* Pessimization... */
4764
4765             ST.prev_curlyx= cur_curlyx;
4766             cur_curlyx = st;
4767             ST.cp = PL_savestack_ix;
4768
4769             /* these fields contain the state of the current curly.
4770              * they are accessed by subsequent WHILEMs */
4771             ST.parenfloor = parenfloor;
4772             ST.me = scan;
4773             ST.B = next;
4774             ST.minmod = minmod;
4775             minmod = 0;
4776             ST.count = -1;      /* this will be updated by WHILEM */
4777             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4778
4779             PL_reginput = locinput;
4780             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4781             /* NOTREACHED */
4782         }
4783
4784         case CURLYX_end: /* just finished matching all of A*B */
4785             cur_curlyx = ST.prev_curlyx;
4786             sayYES;
4787             /* NOTREACHED */
4788
4789         case CURLYX_end_fail: /* just failed to match all of A*B */
4790             regcpblow(ST.cp);
4791             cur_curlyx = ST.prev_curlyx;
4792             sayNO;
4793             /* NOTREACHED */
4794
4795
4796 #undef ST
4797 #define ST st->u.whilem
4798
4799         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4800         {
4801             /* see the discussion above about CURLYX/WHILEM */
4802             I32 n;
4803             int min = ARG1(cur_curlyx->u.curlyx.me);
4804             int max = ARG2(cur_curlyx->u.curlyx.me);
4805             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4806
4807             assert(cur_curlyx); /* keep Coverity happy */
4808             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4809             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4810             ST.cache_offset = 0;
4811             ST.cache_mask = 0;
4812
4813             PL_reginput = locinput;
4814
4815             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4816                   "%*s  whilem: matched %ld out of %d..%d\n",
4817                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4818             );
4819
4820             /* First just match a string of min A's. */
4821
4822             if (n < min) {
4823                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4824                 cur_curlyx->u.curlyx.lastloc = locinput;
4825                 REGCP_SET(ST.lastcp);
4826
4827                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4828                 /* NOTREACHED */
4829             }
4830
4831             /* If degenerate A matches "", assume A done. */
4832
4833             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4834                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4835                    "%*s  whilem: empty match detected, trying continuation...\n",
4836                    REPORT_CODE_OFF+depth*2, "")
4837                 );
4838                 goto do_whilem_B_max;
4839             }
4840
4841             /* super-linear cache processing */
4842
4843             if (scan->flags) {
4844
4845                 if (!PL_reg_maxiter) {
4846                     /* start the countdown: Postpone detection until we
4847                      * know the match is not *that* much linear. */
4848                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4849                     /* possible overflow for long strings and many CURLYX's */
4850                     if (PL_reg_maxiter < 0)
4851                         PL_reg_maxiter = I32_MAX;
4852                     PL_reg_leftiter = PL_reg_maxiter;
4853                 }
4854
4855                 if (PL_reg_leftiter-- == 0) {
4856                     /* initialise cache */
4857                     const I32 size = (PL_reg_maxiter + 7)/8;
4858                     if (PL_reg_poscache) {
4859                         if ((I32)PL_reg_poscache_size < size) {
4860                             Renew(PL_reg_poscache, size, char);
4861                             PL_reg_poscache_size = size;
4862                         }
4863                         Zero(PL_reg_poscache, size, char);
4864                     }
4865                     else {
4866                         PL_reg_poscache_size = size;
4867                         Newxz(PL_reg_poscache, size, char);
4868                     }
4869                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4870       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4871                               PL_colors[4], PL_colors[5])
4872                     );
4873                 }
4874
4875                 if (PL_reg_leftiter < 0) {
4876                     /* have we already failed at this position? */
4877                     I32 offset, mask;
4878                     offset  = (scan->flags & 0xf) - 1
4879                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4880                     mask    = 1 << (offset % 8);
4881                     offset /= 8;
4882                     if (PL_reg_poscache[offset] & mask) {
4883                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4884                             "%*s  whilem: (cache) already tried at this position...\n",
4885                             REPORT_CODE_OFF+depth*2, "")
4886                         );
4887                         sayNO; /* cache records failure */
4888                     }
4889                     ST.cache_offset = offset;
4890                     ST.cache_mask   = mask;
4891                 }
4892             }
4893
4894             /* Prefer B over A for minimal matching. */
4895
4896             if (cur_curlyx->u.curlyx.minmod) {
4897                 ST.save_curlyx = cur_curlyx;
4898                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4899                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
4900                 REGCP_SET(ST.lastcp);
4901                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4902                 /* NOTREACHED */
4903             }
4904
4905             /* Prefer A over B for maximal matching. */
4906
4907             if (n < max) { /* More greed allowed? */
4908                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4909                 cur_curlyx->u.curlyx.lastloc = locinput;
4910                 REGCP_SET(ST.lastcp);
4911                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4912                 /* NOTREACHED */
4913             }
4914             goto do_whilem_B_max;
4915         }
4916         /* NOTREACHED */
4917
4918         case WHILEM_B_min: /* just matched B in a minimal match */
4919         case WHILEM_B_max: /* just matched B in a maximal match */
4920             cur_curlyx = ST.save_curlyx;
4921             sayYES;
4922             /* NOTREACHED */
4923
4924         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4925             cur_curlyx = ST.save_curlyx;
4926             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4927             cur_curlyx->u.curlyx.count--;
4928             CACHEsayNO;
4929             /* NOTREACHED */
4930
4931         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4932             /* FALL THROUGH */
4933         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4934             REGCP_UNWIND(ST.lastcp);
4935             regcppop(rex);
4936             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4937             cur_curlyx->u.curlyx.count--;
4938             CACHEsayNO;
4939             /* NOTREACHED */
4940
4941         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4942             REGCP_UNWIND(ST.lastcp);
4943             regcppop(rex);      /* Restore some previous $<digit>s? */
4944             PL_reginput = locinput;
4945             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4946                 "%*s  whilem: failed, trying continuation...\n",
4947                 REPORT_CODE_OFF+depth*2, "")
4948             );
4949           do_whilem_B_max:
4950             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4951                 && ckWARN(WARN_REGEXP)
4952                 && !(PL_reg_flags & RF_warned))
4953             {
4954                 PL_reg_flags |= RF_warned;
4955                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4956                      "Complex regular subexpression recursion limit (%d) "
4957                      "exceeded",
4958                      REG_INFTY - 1);
4959             }
4960
4961             /* now try B */
4962             ST.save_curlyx = cur_curlyx;
4963             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4964             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4965             /* NOTREACHED */
4966
4967         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4968             cur_curlyx = ST.save_curlyx;
4969             REGCP_UNWIND(ST.lastcp);
4970             regcppop(rex);
4971
4972             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4973                 /* Maximum greed exceeded */
4974                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4975                     && ckWARN(WARN_REGEXP)
4976                     && !(PL_reg_flags & RF_warned))
4977                 {
4978                     PL_reg_flags |= RF_warned;
4979                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4980                         "Complex regular subexpression recursion "
4981                         "limit (%d) exceeded",
4982                         REG_INFTY - 1);
4983                 }
4984                 cur_curlyx->u.curlyx.count--;
4985                 CACHEsayNO;
4986             }
4987
4988             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4989                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4990             );
4991             /* Try grabbing another A and see if it helps. */
4992             PL_reginput = locinput;
4993             cur_curlyx->u.curlyx.lastloc = locinput;
4994             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4995             REGCP_SET(ST.lastcp);
4996             PUSH_STATE_GOTO(WHILEM_A_min,
4997                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4998             /* NOTREACHED */
4999
5000 #undef  ST
5001 #define ST st->u.branch
5002
5003         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5004             next = scan + ARG(scan);
5005             if (next == scan)
5006                 next = NULL;
5007             scan = NEXTOPER(scan);
5008             /* FALL THROUGH */
5009
5010         case BRANCH:        /*  /(...|A|...)/ */
5011             scan = NEXTOPER(scan); /* scan now points to inner node */
5012             ST.lastparen = rex->lastparen;
5013             ST.next_branch = next;
5014             REGCP_SET(ST.cp);
5015             PL_reginput = locinput;
5016
5017             /* Now go into the branch */
5018             if (has_cutgroup) {
5019                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5020             } else {
5021                 PUSH_STATE_GOTO(BRANCH_next, scan);
5022             }
5023             /* NOTREACHED */
5024         case CUTGROUP:
5025             PL_reginput = locinput;
5026             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5027                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5028             PUSH_STATE_GOTO(CUTGROUP_next,next);
5029             /* NOTREACHED */
5030         case CUTGROUP_next_fail:
5031             do_cutgroup = 1;
5032             no_final = 1;
5033             if (st->u.mark.mark_name)
5034                 sv_commit = st->u.mark.mark_name;
5035             sayNO;
5036             /* NOTREACHED */
5037         case BRANCH_next:
5038             sayYES;
5039             /* NOTREACHED */
5040         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5041             if (do_cutgroup) {
5042                 do_cutgroup = 0;
5043                 no_final = 0;
5044             }
5045             REGCP_UNWIND(ST.cp);
5046             for (n = rex->lastparen; n > ST.lastparen; n--)
5047                 rex->offs[n].end = -1;
5048             rex->lastparen = n;
5049             /*dmq: rex->lastcloseparen = n; */
5050             scan = ST.next_branch;
5051             /* no more branches? */
5052             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5053                 DEBUG_EXECUTE_r({
5054                     PerlIO_printf( Perl_debug_log,
5055                         "%*s  %sBRANCH failed...%s\n",
5056                         REPORT_CODE_OFF+depth*2, "",
5057                         PL_colors[4],
5058                         PL_colors[5] );
5059                 });
5060                 sayNO_SILENT;
5061             }
5062             continue; /* execute next BRANCH[J] op */
5063             /* NOTREACHED */
5064
5065         case MINMOD:
5066             minmod = 1;
5067             break;
5068
5069 #undef  ST
5070 #define ST st->u.curlym
5071
5072         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5073
5074             /* This is an optimisation of CURLYX that enables us to push
5075              * only a single backtracking state, no matter how many matches
5076              * there are in {m,n}. It relies on the pattern being constant
5077              * length, with no parens to influence future backrefs
5078              */
5079
5080             ST.me = scan;
5081             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5082
5083             /* if paren positive, emulate an OPEN/CLOSE around A */
5084             if (ST.me->flags) {
5085                 U32 paren = ST.me->flags;
5086                 if (paren > PL_regsize)
5087                     PL_regsize = paren;
5088                 if (paren > rex->lastparen)
5089                     rex->lastparen = paren;
5090                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5091             }
5092             ST.A = scan;
5093             ST.B = next;
5094             ST.alen = 0;
5095             ST.count = 0;
5096             ST.minmod = minmod;
5097             minmod = 0;
5098             ST.c1 = CHRTEST_UNINIT;
5099             REGCP_SET(ST.cp);
5100
5101             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5102                 goto curlym_do_B;
5103
5104           curlym_do_A: /* execute the A in /A{m,n}B/  */
5105             PL_reginput = locinput;
5106             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5107             /* NOTREACHED */
5108
5109         case CURLYM_A: /* we've just matched an A */
5110             locinput = st->locinput;
5111             nextchr = UCHARAT(locinput);
5112
5113             ST.count++;
5114             /* after first match, determine A's length: u.curlym.alen */
5115             if (ST.count == 1) {
5116                 if (PL_reg_match_utf8) {
5117                     char *s = locinput;
5118                     while (s < PL_reginput) {
5119                         ST.alen++;
5120                         s += UTF8SKIP(s);
5121                     }
5122                 }
5123                 else {
5124                     ST.alen = PL_reginput - locinput;
5125                 }
5126                 if (ST.alen == 0)
5127                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5128             }
5129             DEBUG_EXECUTE_r(
5130                 PerlIO_printf(Perl_debug_log,
5131                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5132                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5133                           (IV) ST.count, (IV)ST.alen)
5134             );
5135
5136             locinput = PL_reginput;
5137
5138             if (cur_eval && cur_eval->u.eval.close_paren &&
5139                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5140                 goto fake_end;
5141
5142             {
5143                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5144                 if ( max == REG_INFTY || ST.count < max )
5145                     goto curlym_do_A; /* try to match another A */
5146             }
5147             goto curlym_do_B; /* try to match B */
5148
5149         case CURLYM_A_fail: /* just failed to match an A */
5150             REGCP_UNWIND(ST.cp);
5151
5152             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5153                 || (cur_eval && cur_eval->u.eval.close_paren &&
5154                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5155                 sayNO;
5156
5157           curlym_do_B: /* execute the B in /A{m,n}B/  */
5158             PL_reginput = locinput;
5159             if (ST.c1 == CHRTEST_UNINIT) {
5160                 /* calculate c1 and c2 for possible match of 1st char
5161                  * following curly */
5162                 ST.c1 = ST.c2 = CHRTEST_VOID;
5163                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5164                     regnode *text_node = ST.B;
5165                     if (! HAS_TEXT(text_node))
5166                         FIND_NEXT_IMPT(text_node);
5167                     /* this used to be
5168
5169                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5170
5171                         But the former is redundant in light of the latter.
5172
5173                         if this changes back then the macro for
5174                         IS_TEXT and friends need to change.
5175                      */
5176                     if (PL_regkind[OP(text_node)] == EXACT)
5177                     {
5178
5179                         ST.c1 = (U8)*STRING(text_node);
5180                         switch (OP(text_node)) {
5181                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5182                             case EXACTFA:
5183                             case EXACTFU_SS:
5184                             case EXACTFU_TRICKYFOLD:
5185                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5186                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5187                             default: ST.c2 = ST.c1;
5188                         }
5189                     }
5190                 }
5191             }
5192
5193             DEBUG_EXECUTE_r(
5194                 PerlIO_printf(Perl_debug_log,
5195                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5196                     (int)(REPORT_CODE_OFF+(depth*2)),
5197                     "", (IV)ST.count)
5198                 );
5199             if (ST.c1 != CHRTEST_VOID
5200                     && UCHARAT(PL_reginput) != ST.c1
5201                     && UCHARAT(PL_reginput) != ST.c2)
5202             {
5203                 /* simulate B failing */
5204                 DEBUG_OPTIMISE_r(
5205                     PerlIO_printf(Perl_debug_log,
5206                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5207                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5208                         (IV)ST.c1,(IV)ST.c2
5209                 ));
5210                 state_num = CURLYM_B_fail;
5211                 goto reenter_switch;
5212             }
5213
5214             if (ST.me->flags) {
5215                 /* mark current A as captured */
5216                 I32 paren = ST.me->flags;
5217                 if (ST.count) {
5218                     rex->offs[paren].start
5219                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5220                     rex->offs[paren].end = PL_reginput - PL_bostr;
5221                     /*dmq: rex->lastcloseparen = paren; */
5222                 }
5223                 else
5224                     rex->offs[paren].end = -1;
5225                 if (cur_eval && cur_eval->u.eval.close_paren &&
5226                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5227                 {
5228                     if (ST.count)
5229                         goto fake_end;
5230                     else
5231                         sayNO;
5232                 }
5233             }
5234
5235             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5236             /* NOTREACHED */
5237
5238         case CURLYM_B_fail: /* just failed to match a B */
5239             REGCP_UNWIND(ST.cp);
5240             if (ST.minmod) {
5241                 I32 max = ARG2(ST.me);
5242                 if (max != REG_INFTY && ST.count == max)
5243                     sayNO;
5244                 goto curlym_do_A; /* try to match a further A */
5245             }
5246             /* backtrack one A */
5247             if (ST.count == ARG1(ST.me) /* min */)
5248                 sayNO;
5249             ST.count--;
5250             locinput = HOPc(locinput, -ST.alen);
5251             goto curlym_do_B; /* try to match B */
5252
5253 #undef ST
5254 #define ST st->u.curly
5255
5256 #define CURLY_SETPAREN(paren, success) \
5257     if (paren) { \
5258         if (success) { \
5259             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5260             rex->offs[paren].end = locinput - PL_bostr; \
5261             rex->lastcloseparen = paren; \
5262         } \
5263         else \
5264             rex->offs[paren].end = -1; \
5265     }
5266
5267         case STAR:              /*  /A*B/ where A is width 1 */
5268             ST.paren = 0;
5269             ST.min = 0;
5270             ST.max = REG_INFTY;
5271             scan = NEXTOPER(scan);
5272             goto repeat;
5273         case PLUS:              /*  /A+B/ where A is width 1 */
5274             ST.paren = 0;
5275             ST.min = 1;
5276             ST.max = REG_INFTY;
5277             scan = NEXTOPER(scan);
5278             goto repeat;
5279         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5280             ST.paren = scan->flags;     /* Which paren to set */
5281             if (ST.paren > PL_regsize)
5282                 PL_regsize = ST.paren;
5283             if (ST.paren > rex->lastparen)
5284                 rex->lastparen = ST.paren;
5285             ST.min = ARG1(scan);  /* min to match */
5286             ST.max = ARG2(scan);  /* max to match */
5287             if (cur_eval && cur_eval->u.eval.close_paren &&
5288                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5289                 ST.min=1;
5290                 ST.max=1;
5291             }
5292             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5293             goto repeat;
5294         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5295             ST.paren = 0;
5296             ST.min = ARG1(scan);  /* min to match */
5297             ST.max = ARG2(scan);  /* max to match */
5298             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5299           repeat:
5300             /*
5301             * Lookahead to avoid useless match attempts
5302             * when we know what character comes next.
5303             *
5304             * Used to only do .*x and .*?x, but now it allows
5305             * for )'s, ('s and (?{ ... })'s to be in the way
5306             * of the quantifier and the EXACT-like node.  -- japhy
5307             */
5308
5309             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5310                 sayNO;
5311             if (HAS_TEXT(next) || JUMPABLE(next)) {
5312                 U8 *s;
5313                 regnode *text_node = next;
5314
5315                 if (! HAS_TEXT(text_node))
5316                     FIND_NEXT_IMPT(text_node);
5317
5318                 if (! HAS_TEXT(text_node))
5319                     ST.c1 = ST.c2 = CHRTEST_VOID;
5320                 else {
5321                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5322                         ST.c1 = ST.c2 = CHRTEST_VOID;
5323                         goto assume_ok_easy;
5324                     }
5325                     else
5326                         s = (U8*)STRING(text_node);
5327
5328                     /*  Currently we only get here when
5329
5330                         PL_rekind[OP(text_node)] == EXACT
5331
5332                         if this changes back then the macro for IS_TEXT and
5333                         friends need to change. */
5334                     if (!UTF_PATTERN) {
5335                         ST.c1 = *s;
5336                         switch (OP(text_node)) {
5337                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5338                             case EXACTFA:
5339                             case EXACTFU_SS:
5340                             case EXACTFU_TRICKYFOLD:
5341                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5342                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5343                             default: ST.c2 = ST.c1; break;
5344                         }
5345                     }
5346                     else { /* UTF_PATTERN */
5347                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5348                              STRLEN ulen;
5349                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5350
5351                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5352                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5353                                                     uniflags);
5354                         }
5355                         else {
5356                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5357                                                      uniflags);
5358                         }
5359                     }
5360                 }
5361             }
5362             else
5363                 ST.c1 = ST.c2 = CHRTEST_VOID;
5364         assume_ok_easy:
5365
5366             ST.A = scan;
5367             ST.B = next;
5368             PL_reginput = locinput;
5369             if (minmod) {
5370                 minmod = 0;
5371                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5372                     sayNO;
5373                 ST.count = ST.min;
5374                 locinput = PL_reginput;
5375                 REGCP_SET(ST.cp);
5376                 if (ST.c1 == CHRTEST_VOID)
5377                     goto curly_try_B_min;
5378
5379                 ST.oldloc = locinput;
5380
5381                 /* set ST.maxpos to the furthest point along the
5382                  * string that could possibly match */
5383                 if  (ST.max == REG_INFTY) {
5384                     ST.maxpos = PL_regeol - 1;
5385                     if (utf8_target)
5386                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5387                             ST.maxpos--;
5388                 }
5389                 else if (utf8_target) {
5390                     int m = ST.max - ST.min;
5391                     for (ST.maxpos = locinput;
5392                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5393                         ST.maxpos += UTF8SKIP(ST.maxpos);
5394                 }
5395                 else {
5396                     ST.maxpos = locinput + ST.max - ST.min;
5397                     if (ST.maxpos >= PL_regeol)
5398                         ST.maxpos = PL_regeol - 1;
5399                 }
5400                 goto curly_try_B_min_known;
5401
5402             }
5403             else {
5404                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5405                 locinput = PL_reginput;
5406                 if (ST.count < ST.min)
5407                     sayNO;
5408                 if ((ST.count > ST.min)
5409                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5410                 {
5411                     /* A{m,n} must come at the end of the string, there's
5412                      * no point in backing off ... */
5413                     ST.min = ST.count;
5414                     /* ...except that $ and \Z can match before *and* after
5415                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5416                        We may back off by one in this case. */
5417                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5418                         ST.min--;
5419                 }
5420                 REGCP_SET(ST.cp);
5421                 goto curly_try_B_max;
5422             }
5423             /* NOTREACHED */
5424
5425
5426         case CURLY_B_min_known_fail:
5427             /* failed to find B in a non-greedy match where c1,c2 valid */
5428             if (ST.paren && ST.count)
5429                 rex->offs[ST.paren].end = -1;
5430
5431             PL_reginput = locinput;     /* Could be reset... */
5432             REGCP_UNWIND(ST.cp);
5433             /* Couldn't or didn't -- move forward. */
5434             ST.oldloc = locinput;
5435             if (utf8_target)
5436                 locinput += UTF8SKIP(locinput);
5437             else
5438                 locinput++;
5439             ST.count++;
5440           curly_try_B_min_known:
5441              /* find the next place where 'B' could work, then call B */
5442             {
5443                 int n;
5444                 if (utf8_target) {
5445                     n = (ST.oldloc == locinput) ? 0 : 1;
5446                     if (ST.c1 == ST.c2) {
5447                         STRLEN len;
5448                         /* set n to utf8_distance(oldloc, locinput) */
5449                         while (locinput <= ST.maxpos &&
5450                                utf8n_to_uvchr((U8*)locinput,
5451                                               UTF8_MAXBYTES, &len,
5452                                               uniflags) != (UV)ST.c1) {
5453                             locinput += len;
5454                             n++;
5455                         }
5456                     }
5457                     else {
5458                         /* set n to utf8_distance(oldloc, locinput) */
5459                         while (locinput <= ST.maxpos) {
5460                             STRLEN len;
5461                             const UV c = utf8n_to_uvchr((U8*)locinput,
5462                                                   UTF8_MAXBYTES, &len,
5463                                                   uniflags);
5464                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5465                                 break;
5466                             locinput += len;
5467                             n++;
5468                         }
5469                     }
5470                 }
5471                 else {
5472                     if (ST.c1 == ST.c2) {
5473                         while (locinput <= ST.maxpos &&
5474                                UCHARAT(locinput) != ST.c1)
5475                             locinput++;
5476                     }
5477                     else {
5478                         while (locinput <= ST.maxpos
5479                                && UCHARAT(locinput) != ST.c1
5480                                && UCHARAT(locinput) != ST.c2)
5481                             locinput++;
5482                     }
5483                     n = locinput - ST.oldloc;
5484                 }
5485                 if (locinput > ST.maxpos)
5486                     sayNO;
5487                 /* PL_reginput == oldloc now */
5488                 if (n) {
5489                     ST.count += n;
5490                     if (regrepeat(rex, ST.A, n, depth) < n)
5491                         sayNO;
5492                 }
5493                 PL_reginput = locinput;
5494                 CURLY_SETPAREN(ST.paren, ST.count);
5495                 if (cur_eval && cur_eval->u.eval.close_paren &&
5496                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5497                     goto fake_end;
5498                 }
5499                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5500             }
5501             /* NOTREACHED */
5502
5503
5504         case CURLY_B_min_fail:
5505             /* failed to find B in a non-greedy match where c1,c2 invalid */
5506             if (ST.paren && ST.count)
5507                 rex->offs[ST.paren].end = -1;
5508
5509             REGCP_UNWIND(ST.cp);
5510             /* failed -- move forward one */
5511             PL_reginput = locinput;
5512             if (regrepeat(rex, ST.A, 1, depth)) {
5513                 ST.count++;
5514                 locinput = PL_reginput;
5515                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5516                         ST.count > 0)) /* count overflow ? */
5517                 {
5518                   curly_try_B_min:
5519                     CURLY_SETPAREN(ST.paren, ST.count);
5520                     if (cur_eval && cur_eval->u.eval.close_paren &&
5521                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5522                         goto fake_end;
5523                     }
5524                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5525                 }
5526             }
5527             sayNO;
5528             /* NOTREACHED */
5529
5530
5531         curly_try_B_max:
5532             /* a successful greedy match: now try to match B */
5533             if (cur_eval && cur_eval->u.eval.close_paren &&
5534                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5535                 goto fake_end;
5536             }
5537             {
5538                 UV c = 0;
5539                 if (ST.c1 != CHRTEST_VOID)
5540                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5541                                            UTF8_MAXBYTES, 0, uniflags)
5542                                 : (UV) UCHARAT(PL_reginput);
5543                 /* If it could work, try it. */
5544                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5545                     CURLY_SETPAREN(ST.paren, ST.count);
5546                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5547                     /* NOTREACHED */
5548                 }
5549             }
5550             /* FALL THROUGH */
5551         case CURLY_B_max_fail:
5552             /* failed to find B in a greedy match */
5553             if (ST.paren && ST.count)
5554                 rex->offs[ST.paren].end = -1;
5555
5556             REGCP_UNWIND(ST.cp);
5557             /*  back up. */
5558             if (--ST.count < ST.min)
5559                 sayNO;
5560             PL_reginput = locinput = HOPc(locinput, -1);
5561             goto curly_try_B_max;
5562
5563 #undef ST
5564
5565         case END:
5566             fake_end:
5567             if (cur_eval) {
5568                 /* we've just finished A in /(??{A})B/; now continue with B */
5569                 I32 tmpix;
5570                 st->u.eval.toggle_reg_flags
5571                             = cur_eval->u.eval.toggle_reg_flags;
5572                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5573
5574                 st->u.eval.prev_rex = rex_sv;           /* inner */
5575                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5576                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5577                 rex = (struct regexp *)SvANY(rex_sv);
5578                 rexi = RXi_GET(rex);
5579                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5580                 (void)ReREFCNT_inc(rex_sv);
5581
5582                 REGCP_SET(st->u.eval.lastcp);
5583                 PL_reginput = locinput;
5584
5585                 /* Restore parens of the outer rex without popping the
5586                  * savestack */
5587                 tmpix = PL_savestack_ix;
5588                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5589                 regcppop(rex);
5590                 PL_savestack_ix = tmpix;
5591
5592                 st->u.eval.prev_eval = cur_eval;
5593                 cur_eval = cur_eval->u.eval.prev_eval;
5594                 DEBUG_EXECUTE_r(
5595                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5596                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5597                 if ( nochange_depth )
5598                     nochange_depth--;
5599
5600                 PUSH_YES_STATE_GOTO(EVAL_AB,
5601                         st->u.eval.prev_eval->u.eval.B); /* match B */
5602             }
5603
5604             if (locinput < reginfo->till) {
5605                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5606                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5607                                       PL_colors[4],
5608                                       (long)(locinput - PL_reg_starttry),
5609                                       (long)(reginfo->till - PL_reg_starttry),
5610                                       PL_colors[5]));
5611
5612                 sayNO_SILENT;           /* Cannot match: too short. */
5613             }
5614             PL_reginput = locinput;     /* put where regtry can find it */
5615             sayYES;                     /* Success! */
5616
5617         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5618             DEBUG_EXECUTE_r(
5619             PerlIO_printf(Perl_debug_log,
5620                 "%*s  %ssubpattern success...%s\n",
5621                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5622             PL_reginput = locinput;     /* put where regtry can find it */
5623             sayYES;                     /* Success! */
5624
5625 #undef  ST
5626 #define ST st->u.ifmatch
5627
5628         case SUSPEND:   /* (?>A) */
5629             ST.wanted = 1;
5630             PL_reginput = locinput;
5631             goto do_ifmatch;
5632
5633         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5634             ST.wanted = 0;
5635             goto ifmatch_trivial_fail_test;
5636
5637         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5638             ST.wanted = 1;
5639           ifmatch_trivial_fail_test:
5640             if (scan->flags) {
5641                 char * const s = HOPBACKc(locinput, scan->flags);
5642                 if (!s) {
5643                     /* trivial fail */
5644                     if (logical) {
5645                         logical = 0;
5646                         sw = 1 - cBOOL(ST.wanted);
5647                     }
5648                     else if (ST.wanted)
5649                         sayNO;
5650                     next = scan + ARG(scan);
5651                     if (next == scan)
5652                         next = NULL;
5653                     break;
5654                 }
5655                 PL_reginput = s;
5656             }
5657             else
5658                 PL_reginput = locinput;
5659
5660           do_ifmatch:
5661             ST.me = scan;
5662             ST.logical = logical;
5663             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5664
5665             /* execute body of (?...A) */
5666             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5667             /* NOTREACHED */
5668
5669         case IFMATCH_A_fail: /* body of (?...A) failed */
5670             ST.wanted = !ST.wanted;
5671             /* FALL THROUGH */
5672
5673         case IFMATCH_A: /* body of (?...A) succeeded */
5674             if (ST.logical) {
5675                 sw = cBOOL(ST.wanted);
5676             }
5677             else if (!ST.wanted)
5678                 sayNO;
5679
5680             if (OP(ST.me) == SUSPEND)
5681                 locinput = PL_reginput;
5682             else {
5683                 locinput = PL_reginput = st->locinput;
5684                 nextchr = UCHARAT(locinput);
5685             }
5686             scan = ST.me + ARG(ST.me);
5687             if (scan == ST.me)
5688                 scan = NULL;
5689             continue; /* execute B */
5690
5691 #undef ST
5692
5693         case LONGJMP:
5694             next = scan + ARG(scan);
5695             if (next == scan)
5696                 next = NULL;
5697             break;
5698         case COMMIT:
5699             reginfo->cutpoint = PL_regeol;
5700             /* FALLTHROUGH */
5701         case PRUNE:
5702             PL_reginput = locinput;
5703             if (!scan->flags)
5704                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5705             PUSH_STATE_GOTO(COMMIT_next,next);
5706             /* NOTREACHED */
5707         case COMMIT_next_fail:
5708             no_final = 1;
5709             /* FALLTHROUGH */
5710         case OPFAIL:
5711             sayNO;
5712             /* NOTREACHED */
5713
5714 #define ST st->u.mark
5715         case MARKPOINT:
5716             ST.prev_mark = mark_state;
5717             ST.mark_name = sv_commit = sv_yes_mark
5718                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5719             mark_state = st;
5720             ST.mark_loc = PL_reginput = locinput;
5721             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5722             /* NOTREACHED */
5723         case MARKPOINT_next:
5724             mark_state = ST.prev_mark;
5725             sayYES;
5726             /* NOTREACHED */
5727         case MARKPOINT_next_fail:
5728             if (popmark && sv_eq(ST.mark_name,popmark))
5729             {
5730                 if (ST.mark_loc > startpoint)
5731                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5732                 popmark = NULL; /* we found our mark */
5733                 sv_commit = ST.mark_name;
5734
5735                 DEBUG_EXECUTE_r({
5736                         PerlIO_printf(Perl_debug_log,
5737                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5738                             REPORT_CODE_OFF+depth*2, "",
5739                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5740                 });
5741             }
5742             mark_state = ST.prev_mark;
5743             sv_yes_mark = mark_state ?
5744                 mark_state->u.mark.mark_name : NULL;
5745             sayNO;
5746             /* NOTREACHED */
5747         case SKIP:
5748             PL_reginput = locinput;
5749             if (scan->flags) {
5750                 /* (*SKIP) : if we fail we cut here*/
5751                 ST.mark_name = NULL;
5752                 ST.mark_loc = locinput;
5753                 PUSH_STATE_GOTO(SKIP_next,next);
5754             } else {
5755                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5756                    otherwise do nothing.  Meaning we need to scan
5757                  */
5758                 regmatch_state *cur = mark_state;
5759                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5760
5761                 while (cur) {
5762                     if ( sv_eq( cur->u.mark.mark_name,
5763                                 find ) )
5764                     {
5765                         ST.mark_name = find;
5766                         PUSH_STATE_GOTO( SKIP_next, next );
5767                     }
5768                     cur = cur->u.mark.prev_mark;
5769                 }
5770             }
5771             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5772             break;
5773         case SKIP_next_fail:
5774             if (ST.mark_name) {
5775                 /* (*CUT:NAME) - Set up to search for the name as we
5776                    collapse the stack*/
5777                 popmark = ST.mark_name;
5778             } else {
5779                 /* (*CUT) - No name, we cut here.*/
5780                 if (ST.mark_loc > startpoint)
5781                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5782                 /* but we set sv_commit to latest mark_name if there
5783                    is one so they can test to see how things lead to this
5784                    cut */
5785                 if (mark_state)
5786                     sv_commit=mark_state->u.mark.mark_name;
5787             }
5788             no_final = 1;
5789             sayNO;
5790             /* NOTREACHED */
5791 #undef ST
5792         case LNBREAK:
5793             if ((n=is_LNBREAK(locinput,utf8_target))) {
5794                 locinput += n;
5795                 nextchr = UCHARAT(locinput);
5796             } else
5797                 sayNO;
5798             break;
5799
5800 #define CASE_CLASS(nAmE)                              \
5801         case nAmE:                                    \
5802             if (locinput >= PL_regeol)                \
5803                 sayNO;                                \
5804             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5805                 locinput += n;                        \
5806                 nextchr = UCHARAT(locinput);          \
5807             } else                                    \
5808                 sayNO;                                \
5809             break;                                    \
5810         case N##nAmE:                                 \
5811             if (locinput >= PL_regeol)                \
5812                 sayNO;                                \
5813             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5814                 sayNO;                                \
5815             } else {                                  \
5816                 locinput += UTF8SKIP(locinput);       \
5817                 nextchr = UCHARAT(locinput);          \
5818             }                                         \
5819             break
5820
5821         CASE_CLASS(VERTWS);
5822         CASE_CLASS(HORIZWS);
5823 #undef CASE_CLASS
5824
5825         default:
5826             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5827                           PTR2UV(scan), OP(scan));
5828             Perl_croak(aTHX_ "regexp memory corruption");
5829
5830         } /* end switch */
5831
5832         /* switch break jumps here */
5833         scan = next; /* prepare to execute the next op and ... */
5834         continue;    /* ... jump back to the top, reusing st */
5835         /* NOTREACHED */
5836
5837       push_yes_state:
5838         /* push a state that backtracks on success */
5839         st->u.yes.prev_yes_state = yes_state;
5840         yes_state = st;
5841         /* FALL THROUGH */
5842       push_state:
5843         /* push a new regex state, then continue at scan  */
5844         {
5845             regmatch_state *newst;
5846
5847             DEBUG_STACK_r({
5848                 regmatch_state *cur = st;
5849                 regmatch_state *curyes = yes_state;
5850                 int curd = depth;
5851                 regmatch_slab *slab = PL_regmatch_slab;
5852                 for (;curd > -1;cur--,curd--) {
5853                     if (cur < SLAB_FIRST(slab)) {
5854                         slab = slab->prev;
5855                         cur = SLAB_LAST(slab);
5856                     }
5857                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5858                         REPORT_CODE_OFF + 2 + depth * 2,"",
5859                         curd, PL_reg_name[cur->resume_state],
5860                         (curyes == cur) ? "yes" : ""
5861                     );
5862                     if (curyes == cur)
5863                         curyes = cur->u.yes.prev_yes_state;
5864                 }
5865             } else
5866                 DEBUG_STATE_pp("push")
5867             );
5868             depth++;
5869             st->locinput = locinput;
5870             newst = st+1;
5871             if (newst >  SLAB_LAST(PL_regmatch_slab))
5872                 newst = S_push_slab(aTHX);
5873             PL_regmatch_state = newst;
5874
5875             locinput = PL_reginput;
5876             nextchr = UCHARAT(locinput);
5877             st = newst;
5878             continue;
5879             /* NOTREACHED */
5880         }
5881     }
5882
5883     /*
5884     * We get here only if there's trouble -- normally "case END" is
5885     * the terminating point.
5886     */
5887     Perl_croak(aTHX_ "corrupted regexp pointers");
5888     /*NOTREACHED*/
5889     sayNO;
5890
5891 yes:
5892     if (yes_state) {
5893         /* we have successfully completed a subexpression, but we must now
5894          * pop to the state marked by yes_state and continue from there */
5895         assert(st != yes_state);
5896 #ifdef DEBUGGING
5897         while (st != yes_state) {
5898             st--;
5899             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5900                 PL_regmatch_slab = PL_regmatch_slab->prev;
5901                 st = SLAB_LAST(PL_regmatch_slab);
5902             }
5903             DEBUG_STATE_r({
5904                 if (no_final) {
5905                     DEBUG_STATE_pp("pop (no final)");
5906                 } else {
5907                     DEBUG_STATE_pp("pop (yes)");
5908                 }
5909             });
5910             depth--;
5911         }
5912 #else
5913         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5914             || yes_state > SLAB_LAST(PL_regmatch_slab))
5915         {
5916             /* not in this slab, pop slab */
5917             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5918             PL_regmatch_slab = PL_regmatch_slab->prev;
5919             st = SLAB_LAST(PL_regmatch_slab);
5920         }
5921         depth -= (st - yes_state);
5922 #endif
5923         st = yes_state;
5924         yes_state = st->u.yes.prev_yes_state;
5925         PL_regmatch_state = st;
5926
5927         if (no_final) {
5928             locinput= st->locinput;
5929             nextchr = UCHARAT(locinput);
5930         }
5931         state_num = st->resume_state + no_final;
5932         goto reenter_switch;
5933     }
5934
5935     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5936                           PL_colors[4], PL_colors[5]));
5937
5938     if (PL_reg_state.re_state_eval_setup_done) {
5939         /* each successfully executed (?{...}) block does the equivalent of
5940          *   local $^R = do {...}
5941          * When popping the save stack, all these locals would be undone;
5942          * bypass this by setting the outermost saved $^R to the latest
5943          * value */
5944         if (oreplsv != GvSV(PL_replgv))
5945             sv_setsv(oreplsv, GvSV(PL_replgv));
5946     }
5947     result = 1;
5948     goto final_exit;
5949
5950 no:
5951     DEBUG_EXECUTE_r(
5952         PerlIO_printf(Perl_debug_log,
5953             "%*s  %sfailed...%s\n",
5954             REPORT_CODE_OFF+depth*2, "",
5955             PL_colors[4], PL_colors[5])
5956         );
5957
5958 no_silent:
5959     if (no_final) {
5960         if (yes_state) {
5961             goto yes;
5962         } else {
5963             goto final_exit;
5964         }
5965     }
5966     if (depth) {
5967         /* there's a previous state to backtrack to */
5968         st--;
5969         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5970             PL_regmatch_slab = PL_regmatch_slab->prev;
5971             st = SLAB_LAST(PL_regmatch_slab);
5972         }
5973         PL_regmatch_state = st;
5974         locinput= st->locinput;
5975         nextchr = UCHARAT(locinput);
5976
5977         DEBUG_STATE_pp("pop");
5978         depth--;
5979         if (yes_state == st)
5980             yes_state = st->u.yes.prev_yes_state;
5981
5982         state_num = st->resume_state + 1; /* failure = success + 1 */
5983         goto reenter_switch;
5984     }
5985     result = 0;
5986
5987   final_exit:
5988     if (rex->intflags & PREGf_VERBARG_SEEN) {
5989         SV *sv_err = get_sv("REGERROR", 1);
5990         SV *sv_mrk = get_sv("REGMARK", 1);
5991         if (result) {
5992             sv_commit = &PL_sv_no;
5993             if (!sv_yes_mark)
5994                 sv_yes_mark = &PL_sv_yes;
5995         } else {
5996             if (!sv_commit)
5997                 sv_commit = &PL_sv_yes;
5998             sv_yes_mark = &PL_sv_no;
5999         }
6000         sv_setsv(sv_err, sv_commit);
6001         sv_setsv(sv_mrk, sv_yes_mark);
6002     }
6003
6004
6005     if (last_pushed_cv) {
6006         dSP;
6007         POP_MULTICALL;
6008     }
6009
6010     /* clean up; in particular, free all slabs above current one */
6011     LEAVE_SCOPE(oldsave);
6012
6013     return result;
6014 }
6015
6016 /*
6017  - regrepeat - repeatedly match something simple, report how many
6018  */
6019 /*
6020  * [This routine now assumes that it will only match on things of length 1.
6021  * That was true before, but now we assume scan - reginput is the count,
6022  * rather than incrementing count on every character.  [Er, except utf8.]]
6023  */
6024 STATIC I32
6025 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6026 {
6027     dVAR;
6028     register char *scan;
6029     register I32 c;
6030     register char *loceol = PL_regeol;
6031     register I32 hardcount = 0;
6032     register bool utf8_target = PL_reg_match_utf8;
6033     UV utf8_flags;
6034 #ifndef DEBUGGING
6035     PERL_UNUSED_ARG(depth);
6036 #endif
6037
6038     PERL_ARGS_ASSERT_REGREPEAT;
6039
6040     scan = PL_reginput;
6041     if (max == REG_INFTY)
6042         max = I32_MAX;
6043     else if (max < loceol - scan)
6044         loceol = scan + max;
6045     switch (OP(p)) {
6046     case REG_ANY:
6047         if (utf8_target) {
6048             loceol = PL_regeol;
6049             while (scan < loceol && hardcount < max && *scan != '\n') {
6050                 scan += UTF8SKIP(scan);
6051                 hardcount++;
6052             }
6053         } else {
6054             while (scan < loceol && *scan != '\n')
6055                 scan++;
6056         }
6057         break;
6058     case SANY:
6059         if (utf8_target) {
6060             loceol = PL_regeol;
6061             while (scan < loceol && hardcount < max) {
6062                 scan += UTF8SKIP(scan);
6063                 hardcount++;
6064             }
6065         }
6066         else
6067             scan = loceol;
6068         break;
6069     case CANY:
6070         scan = loceol;
6071         break;
6072     case EXACT:
6073         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6074          * means they match only characters in the string that can be expressed
6075          * as a single byte.  For non-utf8 strings, that means a simple match.
6076          * For utf8 strings, the character matched must be an invariant, or
6077          * downgradable to a single byte.  The pattern's utf8ness is
6078          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6079          * it is, it's an invariant */
6080
6081         c = (U8)*STRING(p);
6082         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6083
6084         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6085             while (scan < loceol && UCHARAT(scan) == c) {
6086                 scan++;
6087             }
6088         }
6089         else {
6090
6091             /* Here, the string is utf8, and the pattern char is different
6092              * in utf8 than not, so can't compare them directly.  Outside the
6093              * loop, find the two utf8 bytes that represent c, and then
6094              * look for those in sequence in the utf8 string */
6095             U8 high = UTF8_TWO_BYTE_HI(c);
6096             U8 low = UTF8_TWO_BYTE_LO(c);
6097             loceol = PL_regeol;
6098
6099             while (hardcount < max
6100                     && scan + 1 < loceol
6101                     && UCHARAT(scan) == high
6102                     && UCHARAT(scan + 1) == low)
6103             {
6104                 scan += 2;
6105                 hardcount++;
6106             }
6107         }
6108         break;
6109     case EXACTFA:
6110         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6111         goto do_exactf;
6112
6113     case EXACTFL:
6114         PL_reg_flags |= RF_tainted;
6115         utf8_flags = FOLDEQ_UTF8_LOCALE;
6116         goto do_exactf;
6117
6118     case EXACTF:
6119             utf8_flags = 0;
6120             goto do_exactf;
6121
6122     case EXACTFU_SS:
6123     case EXACTFU_TRICKYFOLD:
6124     case EXACTFU:
6125         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6126
6127         /* The comments for the EXACT case above apply as well to these fold
6128          * ones */
6129
6130     do_exactf:
6131         c = (U8)*STRING(p);
6132         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6133
6134         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6135             char *tmpeol = loceol;
6136             while (hardcount < max
6137                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6138                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6139             {
6140                 scan = tmpeol;
6141                 tmpeol = loceol;
6142                 hardcount++;
6143             }
6144
6145             /* XXX Note that the above handles properly the German sharp s in
6146              * the pattern matching ss in the string.  But it doesn't handle
6147              * properly cases where the string contains say 'LIGATURE ff' and
6148              * the pattern is 'f+'.  This would require, say, a new function or
6149              * revised interface to foldEQ_utf8(), in which the maximum number
6150              * of characters to match could be passed and it would return how
6151              * many actually did.  This is just one of many cases where
6152              * multi-char folds don't work properly, and so the fix is being
6153              * deferred */
6154         }
6155         else {
6156             U8 folded;
6157
6158             /* Here, the string isn't utf8 and c is a single byte; and either
6159              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6160              * doesn't affect c.  Can just do simple comparisons for exact or
6161              * fold matching. */
6162             switch (OP(p)) {
6163                 case EXACTF: folded = PL_fold[c]; break;
6164                 case EXACTFA:
6165                 case EXACTFU_TRICKYFOLD:
6166                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6167                 case EXACTFL: folded = PL_fold_locale[c]; break;
6168                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6169             }
6170             while (scan < loceol &&
6171                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6172             {
6173                 scan++;
6174             }
6175         }
6176         break;
6177     case ANYOFV:
6178     case ANYOF:
6179         if (utf8_target || OP(p) == ANYOFV) {
6180             STRLEN inclasslen;
6181             loceol = PL_regeol;
6182             inclasslen = loceol - scan;
6183             while (hardcount < max
6184                    && ((inclasslen = loceol - scan) > 0)
6185                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6186             {
6187                 scan += inclasslen;
6188                 hardcount++;
6189             }
6190         } else {
6191             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6192                 scan++;
6193         }
6194         break;
6195     case ALNUMU:
6196         if (utf8_target) {
6197     utf8_wordchar:
6198             loceol = PL_regeol;
6199             LOAD_UTF8_CHARCLASS_ALNUM();
6200             while (hardcount < max && scan < loceol &&
6201                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6202             {
6203                 scan += UTF8SKIP(scan);
6204                 hardcount++;
6205             }
6206         } else {
6207             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6208                 scan++;
6209             }
6210         }
6211         break;
6212     case ALNUM:
6213         if (utf8_target)
6214             goto utf8_wordchar;
6215         while (scan < loceol && isALNUM((U8) *scan)) {
6216             scan++;
6217         }
6218         break;
6219     case ALNUMA:
6220         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6221             scan++;
6222         }
6223         break;
6224     case ALNUML:
6225         PL_reg_flags |= RF_tainted;
6226         if (utf8_target) {
6227             loceol = PL_regeol;
6228             while (hardcount < max && scan < loceol &&
6229                    isALNUM_LC_utf8((U8*)scan)) {
6230                 scan += UTF8SKIP(scan);
6231                 hardcount++;
6232             }
6233         } else {
6234             while (scan < loceol && isALNUM_LC(*scan))
6235                 scan++;
6236         }
6237         break;
6238     case NALNUMU:
6239         if (utf8_target) {
6240
6241     utf8_Nwordchar:
6242
6243             loceol = PL_regeol;
6244             LOAD_UTF8_CHARCLASS_ALNUM();
6245             while (hardcount < max && scan < loceol &&
6246                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6247             {
6248                 scan += UTF8SKIP(scan);
6249                 hardcount++;
6250             }
6251         } else {
6252             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6253                 scan++;
6254             }
6255         }
6256         break;
6257     case NALNUM:
6258         if (utf8_target)
6259             goto utf8_Nwordchar;
6260         while (scan < loceol && ! isALNUM((U8) *scan)) {
6261             scan++;
6262         }
6263         break;
6264     case NALNUMA:
6265         if (utf8_target) {
6266             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6267                 scan += UTF8SKIP(scan);
6268             }
6269         }
6270         else {
6271             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6272                 scan++;
6273             }
6274         }
6275         break;
6276     case NALNUML:
6277         PL_reg_flags |= RF_tainted;
6278         if (utf8_target) {
6279             loceol = PL_regeol;
6280             while (hardcount < max && scan < loceol &&
6281                    !isALNUM_LC_utf8((U8*)scan)) {
6282                 scan += UTF8SKIP(scan);
6283                 hardcount++;
6284             }
6285         } else {
6286             while (scan < loceol && !isALNUM_LC(*scan))
6287                 scan++;
6288         }
6289         break;
6290     case SPACEU:
6291         if (utf8_target) {
6292
6293     utf8_space:
6294
6295             loceol = PL_regeol;
6296             LOAD_UTF8_CHARCLASS_SPACE();
6297             while (hardcount < max && scan < loceol &&
6298                    (*scan == ' ' ||
6299                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6300             {
6301                 scan += UTF8SKIP(scan);
6302                 hardcount++;
6303             }
6304             break;
6305         }
6306         else {
6307             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6308                 scan++;
6309             }
6310             break;
6311         }
6312     case SPACE:
6313         if (utf8_target)
6314             goto utf8_space;
6315
6316         while (scan < loceol && isSPACE((U8) *scan)) {
6317             scan++;
6318         }
6319         break;
6320     case SPACEA:
6321         while (scan < loceol && isSPACE_A((U8) *scan)) {
6322             scan++;
6323         }
6324         break;
6325     case SPACEL:
6326         PL_reg_flags |= RF_tainted;
6327         if (utf8_target) {
6328             loceol = PL_regeol;
6329             while (hardcount < max && scan < loceol &&
6330                    isSPACE_LC_utf8((U8*)scan)) {
6331                 scan += UTF8SKIP(scan);
6332                 hardcount++;
6333             }
6334         } else {
6335             while (scan < loceol && isSPACE_LC(*scan))
6336                 scan++;
6337         }
6338         break;
6339     case NSPACEU:
6340         if (utf8_target) {
6341
6342     utf8_Nspace:
6343
6344             loceol = PL_regeol;
6345             LOAD_UTF8_CHARCLASS_SPACE();
6346             while (hardcount < max && scan < loceol &&
6347                    ! (*scan == ' ' ||
6348                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6349             {
6350                 scan += UTF8SKIP(scan);
6351                 hardcount++;
6352             }
6353             break;
6354         }
6355         else {
6356             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6357                 scan++;
6358             }
6359         }
6360         break;
6361     case NSPACE:
6362         if (utf8_target)
6363             goto utf8_Nspace;
6364
6365         while (scan < loceol && ! isSPACE((U8) *scan)) {
6366             scan++;
6367         }
6368         break;
6369     case NSPACEA:
6370         if (utf8_target) {
6371             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6372                 scan += UTF8SKIP(scan);
6373             }
6374         }
6375         else {
6376             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6377                 scan++;
6378             }
6379         }
6380         break;
6381     case NSPACEL:
6382         PL_reg_flags |= RF_tainted;
6383         if (utf8_target) {
6384             loceol = PL_regeol;
6385             while (hardcount < max && scan < loceol &&
6386                    !isSPACE_LC_utf8((U8*)scan)) {
6387                 scan += UTF8SKIP(scan);
6388                 hardcount++;
6389             }
6390         } else {
6391             while (scan < loceol && !isSPACE_LC(*scan))
6392                 scan++;
6393         }
6394         break;
6395     case DIGIT:
6396         if (utf8_target) {
6397             loceol = PL_regeol;
6398             LOAD_UTF8_CHARCLASS_DIGIT();
6399             while (hardcount < max && scan < loceol &&
6400                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6401                 scan += UTF8SKIP(scan);
6402                 hardcount++;
6403             }
6404         } else {
6405             while (scan < loceol && isDIGIT(*scan))
6406                 scan++;
6407         }
6408         break;
6409     case DIGITA:
6410         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6411             scan++;
6412         }
6413         break;
6414     case DIGITL:
6415         PL_reg_flags |= RF_tainted;
6416         if (utf8_target) {
6417             loceol = PL_regeol;
6418             while (hardcount < max && scan < loceol &&
6419                    isDIGIT_LC_utf8((U8*)scan)) {
6420                 scan += UTF8SKIP(scan);
6421                 hardcount++;
6422             }
6423         } else {
6424             while (scan < loceol && isDIGIT_LC(*scan))
6425                 scan++;
6426         }
6427         break;
6428     case NDIGIT:
6429         if (utf8_target) {
6430             loceol = PL_regeol;
6431             LOAD_UTF8_CHARCLASS_DIGIT();
6432             while (hardcount < max && scan < loceol &&
6433                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6434                 scan += UTF8SKIP(scan);
6435                 hardcount++;
6436             }
6437         } else {
6438             while (scan < loceol && !isDIGIT(*scan))
6439                 scan++;
6440         }
6441         break;
6442     case NDIGITA:
6443         if (utf8_target) {
6444             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6445                 scan += UTF8SKIP(scan);
6446             }
6447         }
6448         else {
6449             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6450                 scan++;
6451             }
6452         }
6453         break;
6454     case NDIGITL:
6455         PL_reg_flags |= RF_tainted;
6456         if (utf8_target) {
6457             loceol = PL_regeol;
6458             while (hardcount < max && scan < loceol &&
6459                    !isDIGIT_LC_utf8((U8*)scan)) {
6460                 scan += UTF8SKIP(scan);
6461                 hardcount++;
6462             }
6463         } else {
6464             while (scan < loceol && !isDIGIT_LC(*scan))
6465                 scan++;
6466         }
6467         break;
6468     case LNBREAK:
6469         if (utf8_target) {
6470             loceol = PL_regeol;
6471             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6472                 scan += c;
6473                 hardcount++;
6474             }
6475         } else {
6476             /*
6477               LNBREAK can match two latin chars, which is ok,
6478               because we have a null terminated string, but we
6479               have to use hardcount in this situation
6480             */
6481             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6482                 scan+=c;
6483                 hardcount++;
6484             }
6485         }
6486         break;
6487     case HORIZWS:
6488         if (utf8_target) {
6489             loceol = PL_regeol;
6490             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6491                 scan += c;
6492                 hardcount++;
6493             }
6494         } else {
6495             while (scan < loceol && is_HORIZWS_latin1(scan))
6496                 scan++;
6497         }
6498         break;
6499     case NHORIZWS:
6500         if (utf8_target) {
6501             loceol = PL_regeol;
6502             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6503                 scan += UTF8SKIP(scan);
6504                 hardcount++;
6505             }
6506         } else {
6507             while (scan < loceol && !is_HORIZWS_latin1(scan))
6508                 scan++;
6509
6510         }
6511         break;
6512     case VERTWS:
6513         if (utf8_target) {
6514             loceol = PL_regeol;
6515             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6516                 scan += c;
6517                 hardcount++;
6518             }
6519         } else {
6520             while (scan < loceol && is_VERTWS_latin1(scan))
6521                 scan++;
6522
6523         }
6524         break;
6525     case NVERTWS:
6526         if (utf8_target) {
6527             loceol = PL_regeol;
6528             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6529                 scan += UTF8SKIP(scan);
6530                 hardcount++;
6531             }
6532         } else {
6533             while (scan < loceol && !is_VERTWS_latin1(scan))
6534                 scan++;
6535
6536         }
6537         break;
6538
6539     default:            /* Called on something of 0 width. */
6540         break;          /* So match right here or not at all. */
6541     }
6542
6543     if (hardcount)
6544         c = hardcount;
6545     else
6546         c = scan - PL_reginput;
6547     PL_reginput = scan;
6548
6549     DEBUG_r({
6550         GET_RE_DEBUG_FLAGS_DECL;
6551         DEBUG_EXECUTE_r({
6552             SV * const prop = sv_newmortal();
6553             regprop(prog, prop, p);
6554             PerlIO_printf(Perl_debug_log,
6555                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6556                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6557         });
6558     });
6559
6560     return(c);
6561 }
6562
6563
6564 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6565 /*
6566 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6567 create a copy so that changes the caller makes won't change the shared one
6568  */
6569 SV *
6570 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6571 {
6572     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6573     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6574 }
6575 #endif
6576
6577 STATIC SV *
6578 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6579 {
6580     /* Returns the swash for the input 'node' in the regex 'prog'.
6581      * If <doinit> is true, will attempt to create the swash if not already
6582      *    done.
6583      * If <listsvp> is non-null, will return the swash initialization string in
6584      *    it.
6585      * If <altsvp> is non-null, will return the alternates to the regular swash
6586      *    in it
6587      * Tied intimately to how regcomp.c sets up the data structure */
6588
6589     dVAR;
6590     SV *sw  = NULL;
6591     SV *si  = NULL;
6592     SV *alt = NULL;
6593     SV*  invlist = NULL;
6594
6595     RXi_GET_DECL(prog,progi);
6596     const struct reg_data * const data = prog ? progi->data : NULL;
6597
6598     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6599
6600     assert(ANYOF_NONBITMAP(node));
6601
6602     if (data && data->count) {
6603         const U32 n = ARG(node);
6604
6605         if (data->what[n] == 's') {
6606             SV * const rv = MUTABLE_SV(data->data[n]);
6607             AV * const av = MUTABLE_AV(SvRV(rv));
6608             SV **const ary = AvARRAY(av);
6609             bool invlist_has_user_defined_property;
6610
6611             si = *ary;  /* ary[0] = the string to initialize the swash with */
6612
6613             /* Elements 3 and 4 are either both present or both absent. [3] is
6614              * any inversion list generated at compile time; [4] indicates if
6615              * that inversion list has any user-defined properties in it. */
6616             if (av_len(av) >= 3) {
6617                 invlist = ary[3];
6618                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6619             }
6620             else {
6621                 invlist = NULL;
6622                 invlist_has_user_defined_property = FALSE;
6623             }
6624
6625             /* Element [1] is reserved for the set-up swash.  If already there,
6626              * return it; if not, create it and store it there */
6627             if (SvROK(ary[1])) {
6628                 sw = ary[1];
6629             }
6630             else if (si && doinit) {
6631
6632                 sw = _core_swash_init("utf8", /* the utf8 package */
6633                                       "", /* nameless */
6634                                       si,
6635                                       1, /* binary */
6636                                       0, /* not from tr/// */
6637                                       FALSE, /* is error if can't find
6638                                                 property */
6639                                       invlist,
6640                                       invlist_has_user_defined_property);
6641                 (void)av_store(av, 1, sw);
6642             }
6643
6644             /* Element [2] is for any multi-char folds.  Note that is a
6645              * fundamentally flawed design, because can't backtrack and try
6646              * again.  See [perl #89774] */
6647             if (SvTYPE(ary[2]) == SVt_PVAV) {
6648                 alt = ary[2];
6649             }
6650         }
6651     }
6652
6653     if (listsvp) {
6654         SV* matches_string = newSVpvn("", 0);
6655         SV** invlistsvp;
6656
6657         /* Use the swash, if any, which has to have incorporated into it all
6658          * possibilities */
6659         if (   sw
6660             && SvROK(sw)
6661             && SvTYPE(SvRV(sw)) == SVt_PVHV
6662             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6663         {
6664             invlist = *invlistsvp;
6665         }
6666         else if (si && si != &PL_sv_undef) {
6667
6668             /* If no swash, use the input nitialization string, if available */
6669             sv_catsv(matches_string, si);
6670         }
6671
6672         /* Add the inversion list to whatever we have.  This may have come from
6673          * the swash, or from an input parameter */
6674         if (invlist) {
6675             sv_catsv(matches_string, _invlist_contents(invlist));
6676         }
6677         *listsvp = matches_string;
6678     }
6679
6680     if (altsvp)
6681         *altsvp  = alt;
6682
6683     return sw;
6684 }
6685
6686 /*
6687  - reginclass - determine if a character falls into a character class
6688
6689   n is the ANYOF regnode
6690   p is the target string
6691   lenp is pointer to the maximum number of bytes of how far to go in p
6692     (This is assumed wthout checking to always be at least the current
6693     character's size)
6694   utf8_target tells whether p is in UTF-8.
6695
6696   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6697   from a successful match, the value it points to will be updated to how many
6698   bytes in p were matched.  If there was no match, the value is undefined,
6699   possibly changed from the input.
6700
6701   Note that this can be a synthetic start class, a combination of various
6702   nodes, so things you think might be mutually exclusive, such as locale,
6703   aren't.  It can match both locale and non-locale
6704
6705  */
6706
6707 STATIC bool
6708 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6709 {
6710     dVAR;
6711     const char flags = ANYOF_FLAGS(n);
6712     bool match = FALSE;
6713     UV c = *p;
6714     STRLEN c_len = 0;
6715     STRLEN maxlen;
6716
6717     PERL_ARGS_ASSERT_REGINCLASS;
6718
6719     /* If c is not already the code point, get it */
6720     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6721         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6722                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6723                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6724                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6725                  * UTF8_ALLOW_FFFF */
6726         if (c_len == (STRLEN)-1)
6727             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6728     }
6729     else {
6730         c_len = 1;
6731     }
6732
6733     /* Use passed in max length, or one character if none passed in or less
6734      * than one character.  And assume will match just one character.  This is
6735      * overwritten later if matched more. */
6736     if (lenp) {
6737         maxlen = (*lenp > c_len) ? *lenp : c_len;
6738         *lenp = c_len;
6739
6740     }
6741     else {
6742         maxlen = c_len;
6743     }
6744
6745     /* If this character is potentially in the bitmap, check it */
6746     if (c < 256) {
6747         if (ANYOF_BITMAP_TEST(n, c))
6748             match = TRUE;
6749         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6750                 && ! utf8_target
6751                 && ! isASCII(c))
6752         {
6753             match = TRUE;
6754         }
6755
6756         else if (flags & ANYOF_LOCALE) {
6757             PL_reg_flags |= RF_tainted;
6758
6759             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6760                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6761             {
6762                 match = TRUE;
6763             }
6764             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6765                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6766                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6767                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6768                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6769                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6770                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6771                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6772                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6773                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6774                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6775                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6776                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6777                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6778                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6779                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6780                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6781                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6782                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6783                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6784                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6785                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6786                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6787                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6788                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6789                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6790                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6791                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6792                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6793                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6794                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6795                      ) /* How's that for a conditional? */
6796             ) {
6797                 match = TRUE;
6798             }
6799         }
6800     }
6801
6802     /* If the bitmap didn't (or couldn't) match, and something outside the
6803      * bitmap could match, try that.  Locale nodes specifiy completely the
6804      * behavior of code points in the bit map (otherwise, a utf8 target would
6805      * cause them to be treated as Unicode and not locale), except in
6806      * the very unlikely event when this node is a synthetic start class, which
6807      * could be a combination of locale and non-locale nodes.  So allow locale
6808      * to match for the synthetic start class, which will give a false
6809      * positive that will be resolved when the match is done again as not part
6810      * of the synthetic start class */
6811     if (!match) {
6812         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6813             match = TRUE;       /* Everything above 255 matches */
6814         }
6815         else if (ANYOF_NONBITMAP(n)
6816                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6817                      || (utf8_target
6818                          && (c >=256
6819                              || (! (flags & ANYOF_LOCALE))
6820                              || (flags & ANYOF_IS_SYNTHETIC)))))
6821         {
6822             AV *av;
6823             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6824
6825             if (sw) {
6826                 U8 * utf8_p;
6827                 if (utf8_target) {
6828                     utf8_p = (U8 *) p;
6829                 } else {
6830
6831                     /* Not utf8.  Convert as much of the string as available up
6832                      * to the limit of how far the (single) character in the
6833                      * pattern can possibly match (no need to go further).  If
6834                      * the node is a straight ANYOF or not folding, it can't
6835                      * match more than one.  Otherwise, It can match up to how
6836                      * far a single char can fold to.  Since not utf8, each
6837                      * character is a single byte, so the max it can be in
6838                      * bytes is the same as the max it can be in characters */
6839                     STRLEN len = (OP(n) == ANYOF
6840                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6841                                   ? 1
6842                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6843                                     ? maxlen
6844                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6845                     utf8_p = bytes_to_utf8(p, &len);
6846                 }
6847
6848                 if (swash_fetch(sw, utf8_p, TRUE))
6849                     match = TRUE;
6850                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6851
6852                     /* Here, we need to test if the fold of the target string
6853                      * matches.  The non-multi char folds have all been moved to
6854                      * the compilation phase, and the multi-char folds have
6855                      * been stored by regcomp into 'av'; we linearly check to
6856                      * see if any match the target string (folded).   We know
6857                      * that the originals were each one character, but we don't
6858                      * currently know how many characters/bytes each folded to,
6859                      * except we do know that there are small limits imposed by
6860                      * Unicode.  XXX A performance enhancement would be to have
6861                      * regcomp.c store the max number of chars/bytes that are
6862                      * in an av entry, as, say the 0th element.  Even better
6863                      * would be to have a hash of the few characters that can
6864                      * start a multi-char fold to the max number of chars of
6865                      * those folds.
6866                      *
6867                      * If there is a match, we will need to advance (if lenp is
6868                      * specified) the match pointer in the target string.  But
6869                      * what we are comparing here isn't that string directly,
6870                      * but its fold, whose length may differ from the original.
6871                      * As we go along in constructing the fold, therefore, we
6872                      * create a map so that we know how many bytes in the
6873                      * source to advance given that we have matched a certain
6874                      * number of bytes in the fold.  This map is stored in
6875                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6876                      * the fold of the first character that we are folding.
6877                      * Then map_fold_len_back[n] is set to the number of bytes
6878                      * in that first character.  Similarly let m be the
6879                      * corresponding number for the second character to be
6880                      * folded.  Then map_fold_len_back[n+m] is set to the
6881                      * number of bytes occupied by the first two source
6882                      * characters. ... */
6883                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6884                     U8 folded[UTF8_MAXBYTES_CASE+1];
6885                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6886                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6887                                                   chars */
6888
6889                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6890
6891                         /* Here, only need to fold the first char of the target
6892                          * string.  It the source wasn't utf8, is 1 byte long */
6893                         to_utf8_fold(utf8_p, folded, &foldlen);
6894                         total_foldlen = foldlen;
6895                         map_fold_len_back[foldlen] = (utf8_target)
6896                                                      ? UTF8SKIP(utf8_p)
6897                                                      : 1;
6898                     }
6899                     else {
6900
6901                         /* Here, need to fold more than the first char.  Do so
6902                          * up to the limits */
6903                         U8* source_ptr = utf8_p;    /* The source for the fold
6904                                                        is the regex target
6905                                                        string */
6906                         U8* folded_ptr = folded;
6907                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6908                                                        available byte in the
6909                                                        target string */
6910                         U8 i;
6911                         for (i = 0;
6912                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6913                              i++)
6914                         {
6915
6916                             /* Fold the next character */
6917                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6918                             STRLEN this_char_foldlen;
6919                             to_utf8_fold(source_ptr,
6920                                          this_char_folded,
6921                                          &this_char_foldlen);
6922
6923                             /* Bail if it would exceed the byte limit for
6924                              * folding a single char. */
6925                             if (this_char_foldlen + folded_ptr - folded >
6926                                                             UTF8_MAXBYTES_CASE)
6927                             {
6928                                 break;
6929                             }
6930
6931                             /* Add the fold of this character */
6932                             Copy(this_char_folded,
6933                                  folded_ptr,
6934                                  this_char_foldlen,
6935                                  U8);
6936                             source_ptr += UTF8SKIP(source_ptr);
6937                             folded_ptr += this_char_foldlen;
6938                             total_foldlen = folded_ptr - folded;
6939
6940                             /* Create map from the number of bytes in the fold
6941                              * back to the number of bytes in the source.  If
6942                              * the source isn't utf8, the byte count is just
6943                              * the number of characters so far */
6944                             map_fold_len_back[total_foldlen]
6945                                                       = (utf8_target)
6946                                                         ? source_ptr - utf8_p
6947                                                         : i + 1;
6948                         }
6949                         *folded_ptr = '\0';
6950                     }
6951
6952
6953                     /* Do the linear search to see if the fold is in the list
6954                      * of multi-char folds. */
6955                     if (av) {
6956                         I32 i;
6957                         for (i = 0; i <= av_len(av); i++) {
6958                             SV* const sv = *av_fetch(av, i, FALSE);
6959                             STRLEN len;
6960                             const char * const s = SvPV_const(sv, len);
6961
6962                             if (len <= total_foldlen
6963                                 && memEQ(s, (char*)folded, len)
6964
6965                                    /* If 0, means matched a partial char. See
6966                                     * [perl #90536] */
6967                                 && map_fold_len_back[len])
6968                             {
6969
6970                                 /* Advance the target string ptr to account for
6971                                  * this fold, but have to translate from the
6972                                  * folded length to the corresponding source
6973                                  * length. */
6974                                 if (lenp) {
6975                                     *lenp = map_fold_len_back[len];
6976                                 }
6977                                 match = TRUE;
6978                                 break;
6979                             }
6980                         }
6981                     }
6982                 }
6983
6984                 /* If we allocated a string above, free it */
6985                 if (! utf8_target) Safefree(utf8_p);
6986             }
6987         }
6988     }
6989
6990     return (flags & ANYOF_INVERT) ? !match : match;
6991 }
6992
6993 STATIC U8 *
6994 S_reghop3(U8 *s, I32 off, const U8* lim)
6995 {
6996     /* return the position 'off' UTF-8 characters away from 's', forward if
6997      * 'off' >= 0, backwards if negative.  But don't go outside of position
6998      * 'lim', which better be < s  if off < 0 */
6999
7000     dVAR;
7001
7002     PERL_ARGS_ASSERT_REGHOP3;
7003
7004     if (off >= 0) {
7005         while (off-- && s < lim) {
7006             /* XXX could check well-formedness here */
7007             s += UTF8SKIP(s);
7008         }
7009     }
7010     else {
7011         while (off++ && s > lim) {
7012             s--;
7013             if (UTF8_IS_CONTINUED(*s)) {
7014                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7015                     s--;
7016             }
7017             /* XXX could check well-formedness here */
7018         }
7019     }
7020     return s;
7021 }
7022
7023 #ifdef XXX_dmq
7024 /* there are a bunch of places where we use two reghop3's that should
7025    be replaced with this routine. but since thats not done yet
7026    we ifdef it out - dmq
7027 */
7028 STATIC U8 *
7029 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7030 {
7031     dVAR;
7032
7033     PERL_ARGS_ASSERT_REGHOP4;
7034
7035     if (off >= 0) {
7036         while (off-- && s < rlim) {
7037             /* XXX could check well-formedness here */
7038             s += UTF8SKIP(s);
7039         }
7040     }
7041     else {
7042         while (off++ && s > llim) {
7043             s--;
7044             if (UTF8_IS_CONTINUED(*s)) {
7045                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7046                     s--;
7047             }
7048             /* XXX could check well-formedness here */
7049         }
7050     }
7051     return s;
7052 }
7053 #endif
7054
7055 STATIC U8 *
7056 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7057 {
7058     dVAR;
7059
7060     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7061
7062     if (off >= 0) {
7063         while (off-- && s < lim) {
7064             /* XXX could check well-formedness here */
7065             s += UTF8SKIP(s);
7066         }
7067         if (off >= 0)
7068             return NULL;
7069     }
7070     else {
7071         while (off++ && s > lim) {
7072             s--;
7073             if (UTF8_IS_CONTINUED(*s)) {
7074                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7075                     s--;
7076             }
7077             /* XXX could check well-formedness here */
7078         }
7079         if (off <= 0)
7080             return NULL;
7081     }
7082     return s;
7083 }
7084
7085 static void
7086 restore_pos(pTHX_ void *arg)
7087 {
7088     dVAR;
7089     regexp * const rex = (regexp *)arg;
7090     if (PL_reg_state.re_state_eval_setup_done) {
7091         if (PL_reg_oldsaved) {
7092             rex->subbeg = PL_reg_oldsaved;
7093             rex->sublen = PL_reg_oldsavedlen;
7094 #ifdef PERL_OLD_COPY_ON_WRITE
7095             rex->saved_copy = PL_nrs;
7096 #endif
7097             RXp_MATCH_COPIED_on(rex);
7098         }
7099         PL_reg_magic->mg_len = PL_reg_oldpos;
7100         PL_reg_state.re_state_eval_setup_done = FALSE;
7101         PL_curpm = PL_reg_oldcurpm;
7102     }
7103 }
7104
7105 STATIC void
7106 S_to_utf8_substr(pTHX_ register regexp *prog)
7107 {
7108     int i = 1;
7109
7110     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7111
7112     do {
7113         if (prog->substrs->data[i].substr
7114             && !prog->substrs->data[i].utf8_substr) {
7115             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7116             prog->substrs->data[i].utf8_substr = sv;
7117             sv_utf8_upgrade(sv);
7118             if (SvVALID(prog->substrs->data[i].substr)) {
7119                 if (SvTAIL(prog->substrs->data[i].substr)) {
7120                     /* Trim the trailing \n that fbm_compile added last
7121                        time.  */
7122                     SvCUR_set(sv, SvCUR(sv) - 1);
7123                     /* Whilst this makes the SV technically "invalid" (as its
7124                        buffer is no longer followed by "\0") when fbm_compile()
7125                        adds the "\n" back, a "\0" is restored.  */
7126                     fbm_compile(sv, FBMcf_TAIL);
7127                 } else
7128                     fbm_compile(sv, 0);
7129             }
7130             if (prog->substrs->data[i].substr == prog->check_substr)
7131                 prog->check_utf8 = sv;
7132         }
7133     } while (i--);
7134 }
7135
7136 STATIC void
7137 S_to_byte_substr(pTHX_ register regexp *prog)
7138 {
7139     dVAR;
7140     int i = 1;
7141
7142     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7143
7144     do {
7145         if (prog->substrs->data[i].utf8_substr
7146             && !prog->substrs->data[i].substr) {
7147             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7148             if (sv_utf8_downgrade(sv, TRUE)) {
7149                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7150                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7151                         /* Trim the trailing \n that fbm_compile added last
7152                            time.  */
7153                         SvCUR_set(sv, SvCUR(sv) - 1);
7154                         fbm_compile(sv, FBMcf_TAIL);
7155                     } else
7156                         fbm_compile(sv, 0);
7157                 }
7158             } else {
7159                 SvREFCNT_dec(sv);
7160                 sv = &PL_sv_undef;
7161             }
7162             prog->substrs->data[i].substr = sv;
7163             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7164                 prog->check_substr = sv;
7165         }
7166     } while (i--);
7167 }
7168
7169 /*
7170  * Local variables:
7171  * c-indentation-style: bsd
7172  * c-basic-offset: 4
7173  * indent-tabs-mode: nil
7174  * End:
7175  *
7176  * ex: set ts=8 sts=4 sw=4 et:
7177  */