regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
  98  * complications: i.e., if everything matchable is straight forward in the
  99  * bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core. as we are required to do if we change regcomp.h
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181
 182 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 183
 184 /* The actual code for CCC_TRY, which uses several variables from the routine
 185  * it's callable from.  It is designed to be the bulk of a case statement.
 186  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 187  *      nextchr matches the class.
 188  * UTF8_TEST is the whole test string to use for utf8 targets
 189  * LOAD is what to use to test, and if not present to load in the swash for the
 190  *      class
 191  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 192  *      UTF8_TEST test.
 193  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 194  * utf8 and a variant, load the swash if necessary and test using the utf8
 195  * test.  Advance to the next character if test is ok, otherwise fail; If not
 196  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 197  * fails, or advance to the next character */
 198
 199 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 200     if (locinput >= PL_regeol) {                                              \
 201         sayNO;                                                                \
 202     }                                                                         \
 203     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 204         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 205         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 206             sayNO;                                                            \
 207         }                                                                     \
 208         locinput += PL_utf8skip[nextchr];                                     \
 209         nextchr = UCHARAT(locinput);                                          \
 210         break;                                                                \
 211     }                                                                         \
 212     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 213         sayNO;                                                                \
 214     }                                                                         \
 215     nextchr = UCHARAT(++locinput);                                            \
 216     break;
 217
 218 /* Handle the non-locale cases for a character class and its complement.  It
 219  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 220  * This is because that code fails when the test succeeds, so we want to have
 221  * the test fail so that the code succeeds.  The swash is stored in a
 222  * predictable PL_ place */
 223 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 224                            CLASS, STR)                                        \
 225     case NAME:                                                                \
 226         _CCC_TRY_CODE( !, FUNC,                                               \
 227                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 228                                             (U8*)locinput, TRUE)),            \
 229                           CLASS, STR)                                         \
 230     case NNAME:                                                               \
 231         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 232                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 233                                             (U8*)locinput, TRUE)),            \
 234                           CLASS, STR)                                         \
 235
 236 /* Generate the case statements for both locale and non-locale character
 237  * classes in regmatch for classes that don't have special unicode semantics.
 238  * Locales don't use an immediate swash, but an intermediary special locale
 239  * function that is called on the pointer to the current place in the input
 240  * string.  That function will resolve to needing the same swash.  One might
 241  * think that because we don't know what the locale will match, we shouldn't
 242  * check with the swash loading function that it loaded properly; ie, that we
 243  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 244  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 245  * irrelevant here */
 246 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 247                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 248                 NAMEA, NNAMEA, FUNCA,                                         \
 249                 CLASS, STR)                                                   \
 250     case NAMEL:                                                               \
 251         PL_reg_flags |= RF_tainted;                                           \
 252         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 253     case NNAMEL:                                                              \
 254         PL_reg_flags |= RF_tainted;                                           \
 255         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 256                        CLASS, STR)                                            \
 257     case NAMEA:                                                               \
 258         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 259             sayNO;                                                            \
 260         }                                                                     \
 261         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 262         nextchr = UCHARAT(++locinput);                                        \
 263         break;                                                                \
 264     case NNAMEA:                                                              \
 265         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 266             sayNO;                                                            \
 267         }                                                                     \
 268         if (utf8_target) {                                                    \
 269             locinput += PL_utf8skip[nextchr];                                 \
 270             nextchr = UCHARAT(locinput);                                      \
 271         }                                                                     \
 272         else {                                                                \
 273             nextchr = UCHARAT(++locinput);                                    \
 274         }                                                                     \
 275         break;                                                                \
 276     /* Generate the non-locale cases */                                       \
 277     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 278
 279 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 280  * statements to handle separate Unicode semantics nodes */
 281 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 282                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 283                   NAMEU, NNAMEU, FUNCU,                                        \
 284                   NAMEA, NNAMEA, FUNCA,                                        \
 285                   CLASS, STR)                                                  \
 286     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 287             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 288             NAMEA, NNAMEA, FUNCA,                                              \
 289             CLASS, STR)                                                        \
 290     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 291
 292 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 293
 294 /* for use after a quantifier and before an EXACT-like node -- japhy */
 295 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 296  *
 297  * NOTE that *nothing* that affects backtracking should be in here, specifically
 298  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 299  * node that is in between two EXACT like nodes when ascertaining what the required
 300  * "follow" character is. This should probably be moved to regex compile time
 301  * although it may be done at run time beause of the REF possibility - more
 302  * investigation required. -- demerphq
 303 */
 304 #define JUMPABLE(rn) (      \
 305     OP(rn) == OPEN ||       \
 306     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 307     OP(rn) == EVAL ||   \
 308     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 309     OP(rn) == PLUS || OP(rn) == MINMOD || \
 310     OP(rn) == KEEPS || \
 311     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 312 )
 313 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 314
 315 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 316
 317 #if 0
 318 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 319    we don't need this definition. */
 320 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 321 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 322 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 323
 324 #else
 325 /* ... so we use this as its faster. */
 326 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 327 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU )
 328 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 329 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 330
 331 #endif
 332
 333 /*
 334   Search for mandatory following text node; for lookahead, the text must
 335   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 336 */
 337 #define FIND_NEXT_IMPT(rn) STMT_START { \
 338     while (JUMPABLE(rn)) { \
 339         const OPCODE type = OP(rn); \
 340         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 341             rn = NEXTOPER(NEXTOPER(rn)); \
 342         else if (type == PLUS) \
 343             rn = NEXTOPER(rn); \
 344         else if (type == IFMATCH) \
 345             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 346         else rn += NEXT_OFF(rn); \
 347     } \
 348 } STMT_END
 349
 350
 351 static void restore_pos(pTHX_ void *arg);
 352
 353 #define REGCP_PAREN_ELEMS 4
 354 #define REGCP_OTHER_ELEMS 5
 355 #define REGCP_FRAME_ELEMS 1
 356 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 357  * are needed for the regexp context stack bookkeeping. */
 358
 359 STATIC CHECKPOINT
 360 S_regcppush(pTHX_ I32 parenfloor)
 361 {
 362     dVAR;
 363     const int retval = PL_savestack_ix;
 364     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 365     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 366     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 367     int p;
 368     GET_RE_DEBUG_FLAGS_DECL;
 369
 370     if (paren_elems_to_push < 0)
 371         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 372
 373     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 374         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 375                    " out of range (%lu-%ld)",
 376                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 377
 378     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 379
 380     for (p = PL_regsize; p > parenfloor; p--) {
 381 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 382         SSPUSHINT(PL_regoffs[p].end);
 383         SSPUSHINT(PL_regoffs[p].start);
 384         SSPUSHPTR(PL_reg_start_tmp[p]);
 385         SSPUSHINT(p);
 386         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 387           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 388                       (UV)p, (IV)PL_regoffs[p].start,
 389                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 390                       (IV)PL_regoffs[p].end
 391         ));
 392     }
 393 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 394     SSPUSHPTR(PL_regoffs);
 395     SSPUSHINT(PL_regsize);
 396     SSPUSHINT(*PL_reglastparen);
 397     SSPUSHINT(*PL_reglastcloseparen);
 398     SSPUSHPTR(PL_reginput);
 399     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 400
 401     return retval;
 402 }
 403
 404 /* These are needed since we do not localize EVAL nodes: */
 405 #define REGCP_SET(cp)                                           \
 406     DEBUG_STATE_r(                                              \
 407             PerlIO_printf(Perl_debug_log,                       \
 408                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 409                 (IV)PL_savestack_ix));                          \
 410     cp = PL_savestack_ix
 411
 412 #define REGCP_UNWIND(cp)                                        \
 413     DEBUG_STATE_r(                                              \
 414         if (cp != PL_savestack_ix)                              \
 415             PerlIO_printf(Perl_debug_log,                       \
 416                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 417                 (IV)(cp), (IV)PL_savestack_ix));                \
 418     regcpblow(cp)
 419
 420 STATIC char *
 421 S_regcppop(pTHX_ const regexp *rex)
 422 {
 423     dVAR;
 424     UV i;
 425     char *input;
 426     GET_RE_DEBUG_FLAGS_DECL;
 427
 428     PERL_ARGS_ASSERT_REGCPPOP;
 429
 430     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 431     i = SSPOPUV;
 432     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 433     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 434     input = (char *) SSPOPPTR;
 435     *PL_reglastcloseparen = SSPOPINT;
 436     *PL_reglastparen = SSPOPINT;
 437     PL_regsize = SSPOPINT;
 438     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 439
 440     i -= REGCP_OTHER_ELEMS;
 441     /* Now restore the parentheses context. */
 442     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 443         I32 tmps;
 444         U32 paren = (U32)SSPOPINT;
 445         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 446         PL_regoffs[paren].start = SSPOPINT;
 447         tmps = SSPOPINT;
 448         if (paren <= *PL_reglastparen)
 449             PL_regoffs[paren].end = tmps;
 450         DEBUG_BUFFERS_r(
 451             PerlIO_printf(Perl_debug_log,
 452                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 453                           (UV)paren, (IV)PL_regoffs[paren].start,
 454                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 455                           (IV)PL_regoffs[paren].end,
 456                           (paren > *PL_reglastparen ? "(no)" : ""));
 457         );
 458     }
 459     DEBUG_BUFFERS_r(
 460         if (*PL_reglastparen + 1 <= rex->nparens) {
 461             PerlIO_printf(Perl_debug_log,
 462                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 463                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 464         }
 465     );
 466 #if 1
 467     /* It would seem that the similar code in regtry()
 468      * already takes care of this, and in fact it is in
 469      * a better location to since this code can #if 0-ed out
 470      * but the code in regtry() is needed or otherwise tests
 471      * requiring null fields (pat.t#187 and split.t#{13,14}
 472      * (as of patchlevel 7877)  will fail.  Then again,
 473      * this code seems to be necessary or otherwise
 474      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 475      * --jhi updated by dapm */
 476     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 477         if (i > PL_regsize)
 478             PL_regoffs[i].start = -1;
 479         PL_regoffs[i].end = -1;
 480     }
 481 #endif
 482     return input;
 483 }
 484
 485 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 486
 487 /*
 488  * pregexec and friends
 489  */
 490
 491 #ifndef PERL_IN_XSUB_RE
 492 /*
 493  - pregexec - match a regexp against a string
 494  */
 495 I32
 496 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 497          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 498 /* strend: pointer to null at end of string */
 499 /* strbeg: real beginning of string */
 500 /* minend: end of match must be >=minend after stringarg. */
 501 /* nosave: For optimizations. */
 502 {
 503     PERL_ARGS_ASSERT_PREGEXEC;
 504
 505     return
 506         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 507                       nosave ? 0 : REXEC_COPY_STR);
 508 }
 509 #endif
 510
 511 /*
 512  * Need to implement the following flags for reg_anch:
 513  *
 514  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 515  * USE_INTUIT_ML
 516  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 517  * INTUIT_AUTORITATIVE_ML
 518  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 519  * INTUIT_ONCE_ML
 520  *
 521  * Another flag for this function: SECOND_TIME (so that float substrs
 522  * with giant delta may be not rechecked).
 523  */
 524
 525 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 526
 527 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 528    Otherwise, only SvCUR(sv) is used to get strbeg. */
 529
 530 /* XXXX We assume that strpos is strbeg unless sv. */
 531
 532 /* XXXX Some places assume that there is a fixed substring.
 533         An update may be needed if optimizer marks as "INTUITable"
 534         RExen without fixed substrings.  Similarly, it is assumed that
 535         lengths of all the strings are no more than minlen, thus they
 536         cannot come from lookahead.
 537         (Or minlen should take into account lookahead.)
 538   NOTE: Some of this comment is not correct. minlen does now take account
 539   of lookahead/behind. Further research is required. -- demerphq
 540
 541 */
 542
 543 /* A failure to find a constant substring means that there is no need to make
 544    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 545    finding a substring too deep into the string means that less calls to
 546    regtry() should be needed.
 547
 548    REx compiler's optimizer found 4 possible hints:
 549         a) Anchored substring;
 550         b) Fixed substring;
 551         c) Whether we are anchored (beginning-of-line or \G);
 552         d) First node (of those at offset 0) which may distinguish positions;
 553    We use a)b)d) and multiline-part of c), and try to find a position in the
 554    string which does not contradict any of them.
 555  */
 556
 557 /* Most of decisions we do here should have been done at compile time.
 558    The nodes of the REx which we used for the search should have been
 559    deleted from the finite automaton. */
 560
 561 char *
 562 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 563                      char *strend, const U32 flags, re_scream_pos_data *data)
 564 {
 565     dVAR;
 566     struct regexp *const prog = (struct regexp *)SvANY(rx);
 567     register I32 start_shift = 0;
 568     /* Should be nonnegative! */
 569     register I32 end_shift   = 0;
 570     register char *s;
 571     register SV *check;
 572     char *strbeg;
 573     char *t;
 574     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 575     I32 ml_anch;
 576     register char *other_last = NULL;   /* other substr checked before this */
 577     char *check_at = NULL;              /* check substr found at this pos */
 578     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 579     RXi_GET_DECL(prog,progi);
 580 #ifdef DEBUGGING
 581     const char * const i_strpos = strpos;
 582 #endif
 583     GET_RE_DEBUG_FLAGS_DECL;
 584
 585     PERL_ARGS_ASSERT_RE_INTUIT_START;
 586
 587     RX_MATCH_UTF8_set(rx,utf8_target);
 588
 589     if (RX_UTF8(rx)) {
 590         PL_reg_flags |= RF_utf8;
 591     }
 592     DEBUG_EXECUTE_r(
 593         debug_start_match(rx, utf8_target, strpos, strend,
 594             sv ? "Guessing start of match in sv for"
 595                : "Guessing start of match in string for");
 596               );
 597
 598     /* CHR_DIST() would be more correct here but it makes things slow. */
 599     if (prog->minlen > strend - strpos) {
 600         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 601                               "String too short... [re_intuit_start]\n"));
 602         goto fail;
 603     }
 604
 605     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 606     PL_regeol = strend;
 607     if (utf8_target) {
 608         if (!prog->check_utf8 && prog->check_substr)
 609             to_utf8_substr(prog);
 610         check = prog->check_utf8;
 611     } else {
 612         if (!prog->check_substr && prog->check_utf8)
 613             to_byte_substr(prog);
 614         check = prog->check_substr;
 615     }
 616     if (check == &PL_sv_undef) {
 617         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 618                 "Non-utf8 string cannot match utf8 check string\n"));
 619         goto fail;
 620     }
 621     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 622         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 623                      || ( (prog->extflags & RXf_ANCH_BOL)
 624                           && !multiline ) );    /* Check after \n? */
 625
 626         if (!ml_anch) {
 627           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 628                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 629                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 630                && sv && !SvROK(sv)
 631                && (strpos != strbeg)) {
 632               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 633               goto fail;
 634           }
 635           if (prog->check_offset_min == prog->check_offset_max &&
 636               !(prog->extflags & RXf_CANY_SEEN)) {
 637             /* Substring at constant offset from beg-of-str... */
 638             I32 slen;
 639
 640             s = HOP3c(strpos, prog->check_offset_min, strend);
 641
 642             if (SvTAIL(check)) {
 643                 slen = SvCUR(check);    /* >= 1 */
 644
 645                 if ( strend - s > slen || strend - s < slen - 1
 646                      || (strend - s == slen && strend[-1] != '\n')) {
 647                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 648                     goto fail_finish;
 649                 }
 650                 /* Now should match s[0..slen-2] */
 651                 slen--;
 652                 if (slen && (*SvPVX_const(check) != *s
 653                              || (slen > 1
 654                                  && memNE(SvPVX_const(check), s, slen)))) {
 655                   report_neq:
 656                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 657                     goto fail_finish;
 658                 }
 659             }
 660             else if (*SvPVX_const(check) != *s
 661                      || ((slen = SvCUR(check)) > 1
 662                          && memNE(SvPVX_const(check), s, slen)))
 663                 goto report_neq;
 664             check_at = s;
 665             goto success_at_start;
 666           }
 667         }
 668         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 669         s = strpos;
 670         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 671         end_shift = prog->check_end_shift;
 672
 673         if (!ml_anch) {
 674             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 675                                          - (SvTAIL(check) != 0);
 676             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 677
 678             if (end_shift < eshift)
 679                 end_shift = eshift;
 680         }
 681     }
 682     else {                              /* Can match at random position */
 683         ml_anch = 0;
 684         s = strpos;
 685         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 686         end_shift = prog->check_end_shift;
 687
 688         /* end shift should be non negative here */
 689     }
 690
 691 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 692     if (end_shift < 0)
 693         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 694                    (IV)end_shift, RX_PRECOMP(prog));
 695 #endif
 696
 697   restart:
 698     /* Find a possible match in the region s..strend by looking for
 699        the "check" substring in the region corrected by start/end_shift. */
 700
 701     {
 702         I32 srch_start_shift = start_shift;
 703         I32 srch_end_shift = end_shift;
 704         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 705             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 706             srch_start_shift = strbeg - s;
 707         }
 708     DEBUG_OPTIMISE_MORE_r({
 709         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 710             (IV)prog->check_offset_min,
 711             (IV)srch_start_shift,
 712             (IV)srch_end_shift,
 713             (IV)prog->check_end_shift);
 714     });
 715
 716     if (flags & REXEC_SCREAM) {
 717         I32 p = -1;                     /* Internal iterator of scream. */
 718         I32 * const pp = data ? data->scream_pos : &p;
 719
 720         if (PL_screamfirst[BmRARE(check)] >= 0
 721             || ( BmRARE(check) == '\n'
 722                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 723                  && SvTAIL(check) ))
 724             s = screaminstr(sv, check,
 725                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 726         else
 727             goto fail_finish;
 728         /* we may be pointing at the wrong string */
 729         if (s && RXp_MATCH_COPIED(prog))
 730             s = strbeg + (s - SvPVX_const(sv));
 731         if (data)
 732             *data->scream_olds = s;
 733     }
 734     else {
 735         U8* start_point;
 736         U8* end_point;
 737         if (prog->extflags & RXf_CANY_SEEN) {
 738             start_point= (U8*)(s + srch_start_shift);
 739             end_point= (U8*)(strend - srch_end_shift);
 740         } else {
 741             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 742             end_point= HOP3(strend, -srch_end_shift, strbeg);
 743         }
 744         DEBUG_OPTIMISE_MORE_r({
 745             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 746                 (int)(end_point - start_point),
 747                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 748                 start_point);
 749         });
 750
 751         s = fbm_instr( start_point, end_point,
 752                       check, multiline ? FBMrf_MULTILINE : 0);
 753     }
 754     }
 755     /* Update the count-of-usability, remove useless subpatterns,
 756         unshift s.  */
 757
 758     DEBUG_EXECUTE_r({
 759         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 760             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 761         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 762                           (s ? "Found" : "Did not find"),
 763             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 764                 ? "anchored" : "floating"),
 765             quoted,
 766             RE_SV_TAIL(check),
 767             (s ? " at offset " : "...\n") );
 768     });
 769
 770     if (!s)
 771         goto fail_finish;
 772     /* Finish the diagnostic message */
 773     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 774
 775     /* XXX dmq: first branch is for positive lookbehind...
 776        Our check string is offset from the beginning of the pattern.
 777        So we need to do any stclass tests offset forward from that
 778        point. I think. :-(
 779      */
 780
 781
 782
 783     check_at=s;
 784
 785
 786     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 787        Start with the other substr.
 788        XXXX no SCREAM optimization yet - and a very coarse implementation
 789        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 790                 *always* match.  Probably should be marked during compile...
 791        Probably it is right to do no SCREAM here...
 792      */
 793
 794     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 795                 : (prog->float_substr && prog->anchored_substr))
 796     {
 797         /* Take into account the "other" substring. */
 798         /* XXXX May be hopelessly wrong for UTF... */
 799         if (!other_last)
 800             other_last = strpos;
 801         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 802           do_other_anchored:
 803             {
 804                 char * const last = HOP3c(s, -start_shift, strbeg);
 805                 char *last1, *last2;
 806                 char * const saved_s = s;
 807                 SV* must;
 808
 809                 t = s - prog->check_offset_max;
 810                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 811                     && (!utf8_target
 812                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 813                             && t > strpos)))
 814                     NOOP;
 815                 else
 816                     t = strpos;
 817                 t = HOP3c(t, prog->anchored_offset, strend);
 818                 if (t < other_last)     /* These positions already checked */
 819                     t = other_last;
 820                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 821                 if (last < last1)
 822                     last1 = last;
 823                 /* XXXX It is not documented what units *_offsets are in.
 824                    We assume bytes, but this is clearly wrong.
 825                    Meaning this code needs to be carefully reviewed for errors.
 826                    dmq.
 827                   */
 828
 829                 /* On end-of-str: see comment below. */
 830                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 831                 if (must == &PL_sv_undef) {
 832                     s = (char*)NULL;
 833                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 834                 }
 835                 else
 836                     s = fbm_instr(
 837                         (unsigned char*)t,
 838                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 839                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 840                         must,
 841                         multiline ? FBMrf_MULTILINE : 0
 842                     );
 843                 DEBUG_EXECUTE_r({
 844                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 845                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 846                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 847                         (s ? "Found" : "Contradicts"),
 848                         quoted, RE_SV_TAIL(must));
 849                 });
 850
 851
 852                 if (!s) {
 853                     if (last1 >= last2) {
 854                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 855                                                 ", giving up...\n"));
 856                         goto fail_finish;
 857                     }
 858                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 859                         ", trying floating at offset %ld...\n",
 860                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 861                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 862                     s = HOP3c(last, 1, strend);
 863                     goto restart;
 864                 }
 865                 else {
 866                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 867                           (long)(s - i_strpos)));
 868                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 869                     other_last = HOP3c(s, 1, strend);
 870                     s = saved_s;
 871                     if (t == strpos)
 872                         goto try_at_start;
 873                     goto try_at_offset;
 874                 }
 875             }
 876         }
 877         else {          /* Take into account the floating substring. */
 878             char *last, *last1;
 879             char * const saved_s = s;
 880             SV* must;
 881
 882             t = HOP3c(s, -start_shift, strbeg);
 883             last1 = last =
 884                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 885             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 886                 last = HOP3c(t, prog->float_max_offset, strend);
 887             s = HOP3c(t, prog->float_min_offset, strend);
 888             if (s < other_last)
 889                 s = other_last;
 890  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 891             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 892             /* fbm_instr() takes into account exact value of end-of-str
 893                if the check is SvTAIL(ed).  Since false positives are OK,
 894                and end-of-str is not later than strend we are OK. */
 895             if (must == &PL_sv_undef) {
 896                 s = (char*)NULL;
 897                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 898             }
 899             else
 900                 s = fbm_instr((unsigned char*)s,
 901                               (unsigned char*)last + SvCUR(must)
 902                                   - (SvTAIL(must)!=0),
 903                               must, multiline ? FBMrf_MULTILINE : 0);
 904             DEBUG_EXECUTE_r({
 905                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 906                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 907                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 908                     (s ? "Found" : "Contradicts"),
 909                     quoted, RE_SV_TAIL(must));
 910             });
 911             if (!s) {
 912                 if (last1 == last) {
 913                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 914                                             ", giving up...\n"));
 915                     goto fail_finish;
 916                 }
 917                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 918                     ", trying anchored starting at offset %ld...\n",
 919                     (long)(saved_s + 1 - i_strpos)));
 920                 other_last = last;
 921                 s = HOP3c(t, 1, strend);
 922                 goto restart;
 923             }
 924             else {
 925                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 926                       (long)(s - i_strpos)));
 927                 other_last = s; /* Fix this later. --Hugo */
 928                 s = saved_s;
 929                 if (t == strpos)
 930                     goto try_at_start;
 931                 goto try_at_offset;
 932             }
 933         }
 934     }
 935
 936
 937     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 938
 939     DEBUG_OPTIMISE_MORE_r(
 940         PerlIO_printf(Perl_debug_log,
 941             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 942             (IV)prog->check_offset_min,
 943             (IV)prog->check_offset_max,
 944             (IV)(s-strpos),
 945             (IV)(t-strpos),
 946             (IV)(t-s),
 947             (IV)(strend-strpos)
 948         )
 949     );
 950
 951     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 952         && (!utf8_target
 953             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 954                  && t > strpos)))
 955     {
 956         /* Fixed substring is found far enough so that the match
 957            cannot start at strpos. */
 958       try_at_offset:
 959         if (ml_anch && t[-1] != '\n') {
 960             /* Eventually fbm_*() should handle this, but often
 961                anchored_offset is not 0, so this check will not be wasted. */
 962             /* XXXX In the code below we prefer to look for "^" even in
 963                presence of anchored substrings.  And we search even
 964                beyond the found float position.  These pessimizations
 965                are historical artefacts only.  */
 966           find_anchor:
 967             while (t < strend - prog->minlen) {
 968                 if (*t == '\n') {
 969                     if (t < check_at - prog->check_offset_min) {
 970                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 971                             /* Since we moved from the found position,
 972                                we definitely contradict the found anchored
 973                                substr.  Due to the above check we do not
 974                                contradict "check" substr.
 975                                Thus we can arrive here only if check substr
 976                                is float.  Redo checking for "other"=="fixed".
 977                              */
 978                             strpos = t + 1;
 979                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 980                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 981                             goto do_other_anchored;
 982                         }
 983                         /* We don't contradict the found floating substring. */
 984                         /* XXXX Why not check for STCLASS? */
 985                         s = t + 1;
 986                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 987                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 988                         goto set_useful;
 989                     }
 990                     /* Position contradicts check-string */
 991                     /* XXXX probably better to look for check-string
 992                        than for "\n", so one should lower the limit for t? */
 993                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 994                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 995                     other_last = strpos = s = t + 1;
 996                     goto restart;
 997                 }
 998                 t++;
 999             }
1000             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1001                         PL_colors[0], PL_colors[1]));
1002             goto fail_finish;
1003         }
1004         else {
1005             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1006                         PL_colors[0], PL_colors[1]));
1007         }
1008         s = t;
1009       set_useful:
1010         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1011     }
1012     else {
1013         /* The found string does not prohibit matching at strpos,
1014            - no optimization of calling REx engine can be performed,
1015            unless it was an MBOL and we are not after MBOL,
1016            or a future STCLASS check will fail this. */
1017       try_at_start:
1018         /* Even in this situation we may use MBOL flag if strpos is offset
1019            wrt the start of the string. */
1020         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1021             && (strpos != strbeg) && strpos[-1] != '\n'
1022             /* May be due to an implicit anchor of m{.*foo}  */
1023             && !(prog->intflags & PREGf_IMPLICIT))
1024         {
1025             t = strpos;
1026             goto find_anchor;
1027         }
1028         DEBUG_EXECUTE_r( if (ml_anch)
1029             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1030                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1031         );
1032       success_at_start:
1033         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1034             && (utf8_target ? (
1035                 prog->check_utf8                /* Could be deleted already */
1036                 && --BmUSEFUL(prog->check_utf8) < 0
1037                 && (prog->check_utf8 == prog->float_utf8)
1038             ) : (
1039                 prog->check_substr              /* Could be deleted already */
1040                 && --BmUSEFUL(prog->check_substr) < 0
1041                 && (prog->check_substr == prog->float_substr)
1042             )))
1043         {
1044             /* If flags & SOMETHING - do not do it many times on the same match */
1045             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1046             /* XXX Does the destruction order has to change with utf8_target? */
1047             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1048             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1049             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1050             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1051             check = NULL;                       /* abort */
1052             s = strpos;
1053             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1054                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1055             if (prog->intflags & PREGf_IMPLICIT)
1056                 prog->extflags &= ~RXf_ANCH_MBOL;
1057             /* XXXX This is a remnant of the old implementation.  It
1058                     looks wasteful, since now INTUIT can use many
1059                     other heuristics. */
1060             prog->extflags &= ~RXf_USE_INTUIT;
1061             /* XXXX What other flags might need to be cleared in this branch? */
1062         }
1063         else
1064             s = strpos;
1065     }
1066
1067     /* Last resort... */
1068     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1069     /* trie stclasses are too expensive to use here, we are better off to
1070        leave it to regmatch itself */
1071     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1072         /* minlen == 0 is possible if regstclass is \b or \B,
1073            and the fixed substr is ''$.
1074            Since minlen is already taken into account, s+1 is before strend;
1075            accidentally, minlen >= 1 guaranties no false positives at s + 1
1076            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1077            regstclass does not come from lookahead...  */
1078         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1079            This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass().  */
1080         const U8* const str = (U8*)STRING(progi->regstclass);
1081         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1082                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1083                     : 1);
1084         char * endpos;
1085         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1086             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1087         else if (prog->float_substr || prog->float_utf8)
1088             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1089         else
1090             endpos= strend;
1091
1092         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1093                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1094
1095         t = s;
1096         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1097         if (!s) {
1098 #ifdef DEBUGGING
1099             const char *what = NULL;
1100 #endif
1101             if (endpos == strend) {
1102                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1103                                 "Could not match STCLASS...\n") );
1104                 goto fail;
1105             }
1106             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1107                                    "This position contradicts STCLASS...\n") );
1108             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1109                 goto fail;
1110             /* Contradict one of substrings */
1111             if (prog->anchored_substr || prog->anchored_utf8) {
1112                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1113                     DEBUG_EXECUTE_r( what = "anchored" );
1114                   hop_and_restart:
1115                     s = HOP3c(t, 1, strend);
1116                     if (s + start_shift + end_shift > strend) {
1117                         /* XXXX Should be taken into account earlier? */
1118                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                                                "Could not match STCLASS...\n") );
1120                         goto fail;
1121                     }
1122                     if (!check)
1123                         goto giveup;
1124                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125                                 "Looking for %s substr starting at offset %ld...\n",
1126                                  what, (long)(s + start_shift - i_strpos)) );
1127                     goto restart;
1128                 }
1129                 /* Have both, check_string is floating */
1130                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1131                     goto retry_floating_check;
1132                 /* Recheck anchored substring, but not floating... */
1133                 s = check_at;
1134                 if (!check)
1135                     goto giveup;
1136                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1137                           "Looking for anchored substr starting at offset %ld...\n",
1138                           (long)(other_last - i_strpos)) );
1139                 goto do_other_anchored;
1140             }
1141             /* Another way we could have checked stclass at the
1142                current position only: */
1143             if (ml_anch) {
1144                 s = t = t + 1;
1145                 if (!check)
1146                     goto giveup;
1147                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148                           "Looking for /%s^%s/m starting at offset %ld...\n",
1149                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1150                 goto try_at_offset;
1151             }
1152             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1153                 goto fail;
1154             /* Check is floating substring. */
1155           retry_floating_check:
1156             t = check_at - start_shift;
1157             DEBUG_EXECUTE_r( what = "floating" );
1158             goto hop_and_restart;
1159         }
1160         if (t != s) {
1161             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1162                         "By STCLASS: moving %ld --> %ld\n",
1163                                   (long)(t - i_strpos), (long)(s - i_strpos))
1164                    );
1165         }
1166         else {
1167             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1168                                   "Does not contradict STCLASS...\n");
1169                    );
1170         }
1171     }
1172   giveup:
1173     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1174                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1175                           PL_colors[5], (long)(s - i_strpos)) );
1176     return s;
1177
1178   fail_finish:                          /* Substring not found */
1179     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1180         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1181   fail:
1182     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1183                           PL_colors[4], PL_colors[5]));
1184     return NULL;
1185 }
1186
1187 #define DECL_TRIE_TYPE(scan) \
1188     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1189                     trie_type = (scan->flags != EXACT) \
1190                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1191                               : (utf8_target ? trie_utf8 : trie_plain)
1192
1193 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1194 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1195     switch (trie_type) {                                                    \
1196     case trie_utf8_fold:                                                    \
1197         if ( foldlen>0 ) {                                                  \
1198             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1199             foldlen -= len;                                                 \
1200             uscan += len;                                                   \
1201             len=0;                                                          \
1202         } else {                                                            \
1203             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1204             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1205             foldlen -= UNISKIP( uvc );                                      \
1206             uscan = foldbuf + UNISKIP( uvc );                               \
1207         }                                                                   \
1208         break;                                                              \
1209     case trie_latin_utf8_fold:                                              \
1210         if ( foldlen>0 ) {                                                  \
1211             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1212             foldlen -= len;                                                 \
1213             uscan += len;                                                   \
1214             len=0;                                                          \
1215         } else {                                                            \
1216             len = 1;                                                        \
1217             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1218             foldlen -= UNISKIP( uvc );                                      \
1219             uscan = foldbuf + UNISKIP( uvc );                               \
1220         }                                                                   \
1221         break;                                                              \
1222     case trie_utf8:                                                         \
1223         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1224         break;                                                              \
1225     case trie_plain:                                                        \
1226         uvc = (UV)*uc;                                                      \
1227         len = 1;                                                            \
1228     }                                                                       \
1229     if (uvc < 256) {                                                        \
1230         charid = trie->charmap[ uvc ];                                      \
1231     }                                                                       \
1232     else {                                                                  \
1233         charid = 0;                                                         \
1234         if (widecharmap) {                                                  \
1235             SV** const svpp = hv_fetch(widecharmap,                         \
1236                         (char*)&uvc, sizeof(UV), 0);                        \
1237             if (svpp)                                                       \
1238                 charid = (U16)SvIV(*svpp);                                  \
1239         }                                                                   \
1240     }                                                                       \
1241 } STMT_END
1242
1243 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1244 {                                                      \
1245     char *my_strend= (char *)strend;                   \
1246     if ( (CoNd)                                        \
1247          && (ln == len ||                              \
1248              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1249                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1250          && (!reginfo || regtry(reginfo, &s)) )        \
1251         goto got_it;                                   \
1252     else {                                             \
1253          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1254          uvchr_to_utf8(tmpbuf, c);                     \
1255          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1256          if ( f != c                                   \
1257               && (f == c1 || f == c2)                  \
1258               && (ln == len ||                         \
1259                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1260                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1261               && (!reginfo || regtry(reginfo, &s)) )   \
1262               goto got_it;                             \
1263     }                                                  \
1264 }                                                      \
1265 s += len
1266
1267 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1268 STMT_START {                                              \
1269     re_fold_t folder;                                   \
1270     switch (OP(c)) {                                      \
1271         case EXACTFU: folder = foldEQ_latin1; break;      \
1272         case EXACTFL: folder = foldEQ_locale; break;      \
1273         case EXACTF:  folder = foldEQ; break;             \
1274         default:                                          \
1275             Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
1276     }                                                     \
1277     while (s <= e) {                                      \
1278         if ( (CoNd)                                       \
1279              && (ln == 1 || folder(s, m, ln))             \
1280              && (!reginfo || regtry(reginfo, &s)) )       \
1281             goto got_it;                                  \
1282         s++;                                              \
1283     }                                                     \
1284 } STMT_END
1285
1286 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1287 STMT_START {                                          \
1288     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1289         CoDe                                          \
1290         s += uskip;                                   \
1291     }                                                 \
1292 } STMT_END
1293
1294 #define REXEC_FBC_SCAN(CoDe)                          \
1295 STMT_START {                                          \
1296     while (s < strend) {                              \
1297         CoDe                                          \
1298         s++;                                          \
1299     }                                                 \
1300 } STMT_END
1301
1302 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1303 REXEC_FBC_UTF8_SCAN(                                  \
1304     if (CoNd) {                                       \
1305         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1306             goto got_it;                              \
1307         else                                          \
1308             tmp = doevery;                            \
1309     }                                                 \
1310     else                                              \
1311         tmp = 1;                                      \
1312 )
1313
1314 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1315 REXEC_FBC_SCAN(                                       \
1316     if (CoNd) {                                       \
1317         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1318             goto got_it;                              \
1319         else                                          \
1320             tmp = doevery;                            \
1321     }                                                 \
1322     else                                              \
1323         tmp = 1;                                      \
1324 )
1325
1326 #define REXEC_FBC_TRYIT               \
1327 if ((!reginfo || regtry(reginfo, &s))) \
1328     goto got_it
1329
1330 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1331     if (utf8_target) {                                             \
1332         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1333     }                                                          \
1334     else {                                                     \
1335         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1336     }
1337
1338 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1339     if (utf8_target) {                                             \
1340         UtFpReLoAd;                                            \
1341         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1342     }                                                          \
1343     else {                                                     \
1344         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1345     }
1346
1347 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1348     PL_reg_flags |= RF_tainted;                                \
1349     if (utf8_target) {                                             \
1350         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1351     }                                                          \
1352     else {                                                     \
1353         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1354     }
1355
1356 #define DUMP_EXEC_POS(li,s,doutf8) \
1357     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1358
1359
1360 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1361         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1362         tmp = TEST_NON_UTF8(tmp);                                              \
1363         REXEC_FBC_UTF8_SCAN(                                                   \
1364             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1365                 tmp = !tmp;                                                    \
1366                 IF_SUCCESS;                                                    \
1367             }                                                                  \
1368             else {                                                             \
1369                 IF_FAIL;                                                       \
1370             }                                                                  \
1371         );                                                                     \
1372
1373 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1374         if (s == PL_bostr) {                                                   \
1375             tmp = '\n';                                                        \
1376         }                                                                      \
1377         else {                                                                 \
1378             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1379             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1380         }                                                                      \
1381         tmp = TeSt1_UtF8;                                                      \
1382         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1383         REXEC_FBC_UTF8_SCAN(                                                   \
1384             if (tmp == ! (TeSt2_UtF8)) { \
1385                 tmp = !tmp;                                                    \
1386                 IF_SUCCESS;                                                    \
1387             }                                                                  \
1388             else {                                                             \
1389                 IF_FAIL;                                                       \
1390             }                                                                  \
1391         );                                                                     \
1392
1393 /* The only difference between the BOUND and NBOUND cases is that
1394  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1395  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1396  * with the other one being empty */
1397 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1398     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1399
1400 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1401     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1402
1403 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1405
1406 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1407     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1408
1409
1410 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1411  * be passed in completely with the variable name being tested, which isn't
1412  * such a clean interface, but this is easier to read than it was before.  We
1413  * are looking for the boundary (or non-boundary between a word and non-word
1414  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1415  * must be different.  Find the "wordness" of the character just prior to this
1416  * one, and compare it with the wordness of this one.  If they differ, we have
1417  * a boundary.  At the beginning of the string, pretend that the previous
1418  * character was a new-line */
1419 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1420     if (utf8_target) {                                                         \
1421                 UTF8_CODE \
1422     }                                                                          \
1423     else {  /* Not utf8 */                                                     \
1424         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1425         tmp = TEST_NON_UTF8(tmp);                                              \
1426         REXEC_FBC_SCAN(                                                        \
1427             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1428                 tmp = !tmp;                                                    \
1429                 IF_SUCCESS;                                                    \
1430             }                                                                  \
1431             else {                                                             \
1432                 IF_FAIL;                                                       \
1433             }                                                                  \
1434         );                                                                     \
1435     }                                                                          \
1436     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1437         goto got_it;
1438
1439 /* We know what class REx starts with.  Try to find this position... */
1440 /* if reginfo is NULL, its a dryrun */
1441 /* annoyingly all the vars in this routine have different names from their counterparts
1442    in regmatch. /grrr */
1443
1444 STATIC char *
1445 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1446     const char *strend, regmatch_info *reginfo)
1447 {
1448         dVAR;
1449         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1450         char *m;
1451         STRLEN ln;
1452         STRLEN lnc;
1453         register STRLEN uskip;
1454         unsigned int c1;
1455         unsigned int c2;
1456         char *e;
1457         register I32 tmp = 1;   /* Scratch variable? */
1458         register const bool utf8_target = PL_reg_match_utf8;
1459         RXi_GET_DECL(prog,progi);
1460
1461         PERL_ARGS_ASSERT_FIND_BYCLASS;
1462
1463         /* We know what class it must start with. */
1464         switch (OP(c)) {
1465         case ANYOFV:
1466         case ANYOF:
1467             if (utf8_target || OP(c) == ANYOFV) {
1468                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
1469                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1470                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1471                           REGINCLASS(prog, c, (U8*)s));
1472             }
1473             else {
1474                  while (s < strend) {
1475                       STRLEN skip = 1;
1476
1477                       if (REGINCLASS(prog, c, (U8*)s) ||
1478                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1479                            /* The assignment of 2 is intentional:
1480                             * for the folded sharp s, the skip is 2. */
1481                            (skip = SHARP_S_SKIP))) {
1482                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1483                                 goto got_it;
1484                            else
1485                                 tmp = doevery;
1486                       }
1487                       else
1488                            tmp = 1;
1489                       s += skip;
1490                  }
1491             }
1492             break;
1493         case CANY:
1494             REXEC_FBC_SCAN(
1495                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1496                     goto got_it;
1497                 else
1498                     tmp = doevery;
1499             );
1500             break;
1501         case EXACTFU:
1502         case EXACTF:
1503             m   = STRING(c);
1504             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1505             lnc = (I32) ln;     /* length to match in characters */
1506             if (UTF_PATTERN) {
1507                 STRLEN ulen1, ulen2;
1508                 U8 *sm = (U8 *) m;
1509                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1510                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1511                 /* used by commented-out code below */
1512                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1513
1514                 /* XXX: Since the node will be case folded at compile
1515                    time this logic is a little odd, although im not
1516                    sure that its actually wrong. --dmq */
1517
1518                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1519                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1520
1521                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1522                    codepoint of the first character in the converted
1523                    form, yet originally we did the extra step.
1524                    No tests fail by commenting this code out however
1525                    so Ive left it out. -- dmq.
1526
1527                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1528                                     0, uniflags);
1529                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1530                                     0, uniflags);
1531                 */
1532
1533                 lnc = 0;
1534                 while (sm < ((U8 *) m + ln)) {
1535                     lnc++;
1536                     sm += UTF8SKIP(sm);
1537                 }
1538             }
1539             else {
1540                 c1 = *(U8*)m;
1541                 if (utf8_target || OP(c) == EXACTFU) {
1542
1543                     /* Micro sign folds to GREEK SMALL LETTER MU;
1544                        LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1545                        c2 to the first 's' of the pair, and the code below will
1546                        look for others */
1547                     c2 = (c1 == MICRO_SIGN)
1548                         ? GREEK_SMALL_LETTER_MU
1549                         : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1550                            ? 's'
1551                            : PL_fold_latin1[c1];
1552                 } else c2 = PL_fold[c1];
1553             }
1554             goto do_exactf;
1555         case EXACTFL:
1556             m   = STRING(c);
1557             ln  = STR_LEN(c);
1558             lnc = (I32) ln;
1559             c1 = *(U8*)m;
1560             c2 = PL_fold_locale[c1];
1561           do_exactf:
1562             e = HOP3c(strend, -((I32)lnc), s);
1563
1564             if (!reginfo && e < s)
1565                 e = s;                  /* Due to minlen logic of intuit() */
1566
1567             /* The idea in the EXACTF* cases is to first find the
1568              * first character of the EXACTF* node and then, if
1569              * necessary, case-insensitively compare the full
1570              * text of the node.  The c1 and c2 are the first
1571              * characters (though in Unicode it gets a bit
1572              * more complicated because there are more cases
1573              * than just upper and lower: one needs to use
1574              * the so-called folding case for case-insensitive
1575              * matching (called "loose matching" in Unicode).
1576              * foldEQ_utf8() will do just that. */
1577
1578             if (utf8_target || UTF_PATTERN) {
1579                 UV c, f;
1580                 U8 tmpbuf [UTF8_MAXBYTES+1];
1581                 STRLEN len = 1;
1582                 STRLEN foldlen;
1583                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1584                 if (c1 == c2) {
1585                     /* Upper and lower of 1st char are equal -
1586                      * probably not a "letter". */
1587                     while (s <= e) {
1588                         if (utf8_target) {
1589                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1590                                            uniflags);
1591                         } else {
1592                             c = *((U8*)s);
1593                         }
1594                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1595                     }
1596                 }
1597                 else {
1598                     while (s <= e) {
1599                         if (utf8_target) {
1600                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1601                                            uniflags);
1602                         } else {
1603                             c = *((U8*)s);
1604                         }
1605
1606                         /* Handle some of the three Greek sigmas cases.
1607                          * Note that not all the possible combinations
1608                          * are handled here: some of them are handled
1609                          * by the standard folding rules, and some of
1610                          * them (the character class or ANYOF cases)
1611                          * are handled during compiletime in
1612                          * regexec.c:S_regclass(). */
1613                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1614                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1615                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1616
1617                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1618                     }
1619                 }
1620             }
1621             else {
1622                 /* Neither pattern nor string are UTF8 */
1623                 if (c1 == c2)
1624                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1625                 else
1626                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1627             }
1628             break;
1629         case BOUNDL:
1630             PL_reg_flags |= RF_tainted;
1631             FBC_BOUND(isALNUM_LC,
1632                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1633                       isALNUM_LC_utf8((U8*)s));
1634             break;
1635         case NBOUNDL:
1636             PL_reg_flags |= RF_tainted;
1637             FBC_NBOUND(isALNUM_LC,
1638                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1639                        isALNUM_LC_utf8((U8*)s));
1640             break;
1641         case BOUND:
1642             FBC_BOUND(isWORDCHAR,
1643                       isALNUM_uni(tmp),
1644                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1645             break;
1646         case BOUNDA:
1647             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1648                              isWORDCHAR_A(tmp),
1649                              isWORDCHAR_A((U8*)s));
1650             break;
1651         case NBOUND:
1652             FBC_NBOUND(isWORDCHAR,
1653                        isALNUM_uni(tmp),
1654                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1655             break;
1656         case NBOUNDA:
1657             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1658                               isWORDCHAR_A(tmp),
1659                               isWORDCHAR_A((U8*)s));
1660             break;
1661         case BOUNDU:
1662             FBC_BOUND(isWORDCHAR_L1,
1663                       isALNUM_uni(tmp),
1664                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1665             break;
1666         case NBOUNDU:
1667             FBC_NBOUND(isWORDCHAR_L1,
1668                        isALNUM_uni(tmp),
1669                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1670             break;
1671         case ALNUML:
1672             REXEC_FBC_CSCAN_TAINT(
1673                 isALNUM_LC_utf8((U8*)s),
1674                 isALNUM_LC(*s)
1675             );
1676             break;
1677         case ALNUMU:
1678             REXEC_FBC_CSCAN_PRELOAD(
1679                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1680                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1681                 isWORDCHAR_L1((U8) *s)
1682             );
1683             break;
1684         case ALNUM:
1685             REXEC_FBC_CSCAN_PRELOAD(
1686                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1687                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1688                 isWORDCHAR((U8) *s)
1689             );
1690             break;
1691         case ALNUMA:
1692             /* Don't need to worry about utf8, as it can match only a single
1693              * byte invariant character */
1694             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1695             break;
1696         case NALNUMU:
1697             REXEC_FBC_CSCAN_PRELOAD(
1698                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1699                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1700                 ! isWORDCHAR_L1((U8) *s)
1701             );
1702             break;
1703         case NALNUM:
1704             REXEC_FBC_CSCAN_PRELOAD(
1705                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1706                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1707                 ! isALNUM(*s)
1708             );
1709             break;
1710         case NALNUMA:
1711             REXEC_FBC_CSCAN(
1712                 !isWORDCHAR_A(*s),
1713                 !isWORDCHAR_A(*s)
1714             );
1715             break;
1716         case NALNUML:
1717             REXEC_FBC_CSCAN_TAINT(
1718                 !isALNUM_LC_utf8((U8*)s),
1719                 !isALNUM_LC(*s)
1720             );
1721             break;
1722         case SPACEU:
1723             REXEC_FBC_CSCAN_PRELOAD(
1724                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1725                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1726                 isSPACE_L1((U8) *s)
1727             );
1728             break;
1729         case SPACE:
1730             REXEC_FBC_CSCAN_PRELOAD(
1731                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1732                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1733                 isSPACE((U8) *s)
1734             );
1735             break;
1736         case SPACEA:
1737             /* Don't need to worry about utf8, as it can match only a single
1738              * byte invariant character */
1739             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1740             break;
1741         case SPACEL:
1742             REXEC_FBC_CSCAN_TAINT(
1743                 isSPACE_LC_utf8((U8*)s),
1744                 isSPACE_LC(*s)
1745             );
1746             break;
1747         case NSPACEU:
1748             REXEC_FBC_CSCAN_PRELOAD(
1749                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1750                 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1751                 ! isSPACE_L1((U8) *s)
1752             );
1753             break;
1754         case NSPACE:
1755             REXEC_FBC_CSCAN_PRELOAD(
1756                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1757                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1758                 ! isSPACE((U8) *s)
1759             );
1760             break;
1761         case NSPACEA:
1762             REXEC_FBC_CSCAN(
1763                 !isSPACE_A(*s),
1764                 !isSPACE_A(*s)
1765             );
1766             break;
1767         case NSPACEL:
1768             REXEC_FBC_CSCAN_TAINT(
1769                 !isSPACE_LC_utf8((U8*)s),
1770                 !isSPACE_LC(*s)
1771             );
1772             break;
1773         case DIGIT:
1774             REXEC_FBC_CSCAN_PRELOAD(
1775                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1776                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1777                 isDIGIT(*s)
1778             );
1779             break;
1780         case DIGITA:
1781             /* Don't need to worry about utf8, as it can match only a single
1782              * byte invariant character */
1783             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1784             break;
1785         case DIGITL:
1786             REXEC_FBC_CSCAN_TAINT(
1787                 isDIGIT_LC_utf8((U8*)s),
1788                 isDIGIT_LC(*s)
1789             );
1790             break;
1791         case NDIGIT:
1792             REXEC_FBC_CSCAN_PRELOAD(
1793                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1794                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1795                 !isDIGIT(*s)
1796             );
1797             break;
1798         case NDIGITA:
1799             REXEC_FBC_CSCAN(
1800                 !isDIGIT_A(*s),
1801                 !isDIGIT_A(*s)
1802             );
1803             break;
1804         case NDIGITL:
1805             REXEC_FBC_CSCAN_TAINT(
1806                 !isDIGIT_LC_utf8((U8*)s),
1807                 !isDIGIT_LC(*s)
1808             );
1809             break;
1810         case LNBREAK:
1811             REXEC_FBC_CSCAN(
1812                 is_LNBREAK_utf8(s),
1813                 is_LNBREAK_latin1(s)
1814             );
1815             break;
1816         case VERTWS:
1817             REXEC_FBC_CSCAN(
1818                 is_VERTWS_utf8(s),
1819                 is_VERTWS_latin1(s)
1820             );
1821             break;
1822         case NVERTWS:
1823             REXEC_FBC_CSCAN(
1824                 !is_VERTWS_utf8(s),
1825                 !is_VERTWS_latin1(s)
1826             );
1827             break;
1828         case HORIZWS:
1829             REXEC_FBC_CSCAN(
1830                 is_HORIZWS_utf8(s),
1831                 is_HORIZWS_latin1(s)
1832             );
1833             break;
1834         case NHORIZWS:
1835             REXEC_FBC_CSCAN(
1836                 !is_HORIZWS_utf8(s),
1837                 !is_HORIZWS_latin1(s)
1838             );
1839             break;
1840         case AHOCORASICKC:
1841         case AHOCORASICK:
1842             {
1843                 DECL_TRIE_TYPE(c);
1844                 /* what trie are we using right now */
1845                 reg_ac_data *aho
1846                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1847                 reg_trie_data *trie
1848                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1849                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1850
1851                 const char *last_start = strend - trie->minlen;
1852 #ifdef DEBUGGING
1853                 const char *real_start = s;
1854 #endif
1855                 STRLEN maxlen = trie->maxlen;
1856                 SV *sv_points;
1857                 U8 **points; /* map of where we were in the input string
1858                                 when reading a given char. For ASCII this
1859                                 is unnecessary overhead as the relationship
1860                                 is always 1:1, but for Unicode, especially
1861                                 case folded Unicode this is not true. */
1862                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1863                 U8 *bitmap=NULL;
1864
1865
1866                 GET_RE_DEBUG_FLAGS_DECL;
1867
1868                 /* We can't just allocate points here. We need to wrap it in
1869                  * an SV so it gets freed properly if there is a croak while
1870                  * running the match */
1871                 ENTER;
1872                 SAVETMPS;
1873                 sv_points=newSV(maxlen * sizeof(U8 *));
1874                 SvCUR_set(sv_points,
1875                     maxlen * sizeof(U8 *));
1876                 SvPOK_on(sv_points);
1877                 sv_2mortal(sv_points);
1878                 points=(U8**)SvPV_nolen(sv_points );
1879                 if ( trie_type != trie_utf8_fold
1880                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1881                 {
1882                     if (trie->bitmap)
1883                         bitmap=(U8*)trie->bitmap;
1884                     else
1885                         bitmap=(U8*)ANYOF_BITMAP(c);
1886                 }
1887                 /* this is the Aho-Corasick algorithm modified a touch
1888                    to include special handling for long "unknown char"
1889                    sequences. The basic idea being that we use AC as long
1890                    as we are dealing with a possible matching char, when
1891                    we encounter an unknown char (and we have not encountered
1892                    an accepting state) we scan forward until we find a legal
1893                    starting char.
1894                    AC matching is basically that of trie matching, except
1895                    that when we encounter a failing transition, we fall back
1896                    to the current states "fail state", and try the current char
1897                    again, a process we repeat until we reach the root state,
1898                    state 1, or a legal transition. If we fail on the root state
1899                    then we can either terminate if we have reached an accepting
1900                    state previously, or restart the entire process from the beginning
1901                    if we have not.
1902
1903                  */
1904                 while (s <= last_start) {
1905                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1906                     U8 *uc = (U8*)s;
1907                     U16 charid = 0;
1908                     U32 base = 1;
1909                     U32 state = 1;
1910                     UV uvc = 0;
1911                     STRLEN len = 0;
1912                     STRLEN foldlen = 0;
1913                     U8 *uscan = (U8*)NULL;
1914                     U8 *leftmost = NULL;
1915 #ifdef DEBUGGING
1916                     U32 accepted_word= 0;
1917 #endif
1918                     U32 pointpos = 0;
1919
1920                     while ( state && uc <= (U8*)strend ) {
1921                         int failed=0;
1922                         U32 word = aho->states[ state ].wordnum;
1923
1924                         if( state==1 ) {
1925                             if ( bitmap ) {
1926                                 DEBUG_TRIE_EXECUTE_r(
1927                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1928                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1929                                             (char *)uc, utf8_target );
1930                                         PerlIO_printf( Perl_debug_log,
1931                                             " Scanning for legal start char...\n");
1932                                     }
1933                                 );
1934                                 if (utf8_target) {
1935                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1936                                         uc += UTF8SKIP(uc);
1937                                     }
1938                                 } else {
1939                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1940                                         uc++;
1941                                     }
1942                                 }
1943                                 s= (char *)uc;
1944                             }
1945                             if (uc >(U8*)last_start) break;
1946                         }
1947
1948                         if ( word ) {
1949                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1950                             if (!leftmost || lpos < leftmost) {
1951                                 DEBUG_r(accepted_word=word);
1952                                 leftmost= lpos;
1953                             }
1954                             if (base==0) break;
1955
1956                         }
1957                         points[pointpos++ % maxlen]= uc;
1958                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1959                                              uscan, len, uvc, charid, foldlen,
1960                                              foldbuf, uniflags);
1961                         DEBUG_TRIE_EXECUTE_r({
1962                             dump_exec_pos( (char *)uc, c, strend, real_start,
1963                                 s,   utf8_target );
1964                             PerlIO_printf(Perl_debug_log,
1965                                 " Charid:%3u CP:%4"UVxf" ",
1966                                  charid, uvc);
1967                         });
1968
1969                         do {
1970 #ifdef DEBUGGING
1971                             word = aho->states[ state ].wordnum;
1972 #endif
1973                             base = aho->states[ state ].trans.base;
1974
1975                             DEBUG_TRIE_EXECUTE_r({
1976                                 if (failed)
1977                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1978                                         s,   utf8_target );
1979                                 PerlIO_printf( Perl_debug_log,
1980                                     "%sState: %4"UVxf", word=%"UVxf,
1981                                     failed ? " Fail transition to " : "",
1982                                     (UV)state, (UV)word);
1983                             });
1984                             if ( base ) {
1985                                 U32 tmp;
1986                                 I32 offset;
1987                                 if (charid &&
1988                                      ( ((offset = base + charid
1989                                         - 1 - trie->uniquecharcount)) >= 0)
1990                                      && ((U32)offset < trie->lasttrans)
1991                                      && trie->trans[offset].check == state
1992                                      && (tmp=trie->trans[offset].next))
1993                                 {
1994                                     DEBUG_TRIE_EXECUTE_r(
1995                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1996                                     state = tmp;
1997                                     break;
1998                                 }
1999                                 else {
2000                                     DEBUG_TRIE_EXECUTE_r(
2001                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2002                                     failed = 1;
2003                                     state = aho->fail[state];
2004                                 }
2005                             }
2006                             else {
2007                                 /* we must be accepting here */
2008                                 DEBUG_TRIE_EXECUTE_r(
2009                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2010                                 failed = 1;
2011                                 break;
2012                             }
2013                         } while(state);
2014                         uc += len;
2015                         if (failed) {
2016                             if (leftmost)
2017                                 break;
2018                             if (!state) state = 1;
2019                         }
2020                     }
2021                     if ( aho->states[ state ].wordnum ) {
2022                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2023                         if (!leftmost || lpos < leftmost) {
2024                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2025                             leftmost = lpos;
2026                         }
2027                     }
2028                     if (leftmost) {
2029                         s = (char*)leftmost;
2030                         DEBUG_TRIE_EXECUTE_r({
2031                             PerlIO_printf(
2032                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2033                                 (UV)accepted_word, (IV)(s - real_start)
2034                             );
2035                         });
2036                         if (!reginfo || regtry(reginfo, &s)) {
2037                             FREETMPS;
2038                             LEAVE;
2039                             goto got_it;
2040                         }
2041                         s = HOPc(s,1);
2042                         DEBUG_TRIE_EXECUTE_r({
2043                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2044                         });
2045                     } else {
2046                         DEBUG_TRIE_EXECUTE_r(
2047                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2048                         break;
2049                     }
2050                 }
2051                 FREETMPS;
2052                 LEAVE;
2053             }
2054             break;
2055         default:
2056             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2057             break;
2058         }
2059         return 0;
2060       got_it:
2061         return s;
2062 }
2063
2064
2065 /*
2066  - regexec_flags - match a regexp against a string
2067  */
2068 I32
2069 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2070               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2071 /* strend: pointer to null at end of string */
2072 /* strbeg: real beginning of string */
2073 /* minend: end of match must be >=minend after stringarg. */
2074 /* data: May be used for some additional optimizations.
2075          Currently its only used, with a U32 cast, for transmitting
2076          the ganch offset when doing a /g match. This will change */
2077 /* nosave: For optimizations. */
2078 {
2079     dVAR;
2080     struct regexp *const prog = (struct regexp *)SvANY(rx);
2081     /*register*/ char *s;
2082     register regnode *c;
2083     /*register*/ char *startpos = stringarg;
2084     I32 minlen;         /* must match at least this many chars */
2085     I32 dontbother = 0; /* how many characters not to try at end */
2086     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2087     I32 scream_pos = -1;                /* Internal iterator of scream. */
2088     char *scream_olds = NULL;
2089     const bool utf8_target = cBOOL(DO_UTF8(sv));
2090     I32 multiline;
2091     RXi_GET_DECL(prog,progi);
2092     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2093     regexp_paren_pair *swap = NULL;
2094     GET_RE_DEBUG_FLAGS_DECL;
2095
2096     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2097     PERL_UNUSED_ARG(data);
2098
2099     /* Be paranoid... */
2100     if (prog == NULL || startpos == NULL) {
2101         Perl_croak(aTHX_ "NULL regexp parameter");
2102         return 0;
2103     }
2104
2105     multiline = prog->extflags & RXf_PMf_MULTILINE;
2106     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2107
2108     RX_MATCH_UTF8_set(rx, utf8_target);
2109     DEBUG_EXECUTE_r(
2110         debug_start_match(rx, utf8_target, startpos, strend,
2111         "Matching");
2112     );
2113
2114     minlen = prog->minlen;
2115
2116     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2117         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2118                               "String too short [regexec_flags]...\n"));
2119         goto phooey;
2120     }
2121
2122
2123     /* Check validity of program. */
2124     if (UCHARAT(progi->program) != REG_MAGIC) {
2125         Perl_croak(aTHX_ "corrupted regexp program");
2126     }
2127
2128     PL_reg_flags = 0;
2129     PL_reg_eval_set = 0;
2130     PL_reg_maxiter = 0;
2131
2132     if (RX_UTF8(rx))
2133         PL_reg_flags |= RF_utf8;
2134
2135     /* Mark beginning of line for ^ and lookbehind. */
2136     reginfo.bol = startpos; /* XXX not used ??? */
2137     PL_bostr  = strbeg;
2138     reginfo.sv = sv;
2139
2140     /* Mark end of line for $ (and such) */
2141     PL_regeol = strend;
2142
2143     /* see how far we have to get to not match where we matched before */
2144     reginfo.till = startpos+minend;
2145
2146     /* If there is a "must appear" string, look for it. */
2147     s = startpos;
2148
2149     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2150         MAGIC *mg;
2151         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2152             reginfo.ganch = startpos + prog->gofs;
2153             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2155         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2156                   && SvMAGIC(sv)
2157                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2158                   && mg->mg_len >= 0) {
2159             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2160             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2161                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2162
2163             if (prog->extflags & RXf_ANCH_GPOS) {
2164                 if (s > reginfo.ganch)
2165                     goto phooey;
2166                 s = reginfo.ganch - prog->gofs;
2167                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2168                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2169                 if (s < strbeg)
2170                     goto phooey;
2171             }
2172         }
2173         else if (data) {
2174             reginfo.ganch = strbeg + PTR2UV(data);
2175             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2176                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2177
2178         } else {                                /* pos() not defined */
2179             reginfo.ganch = strbeg;
2180             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2181                  "GPOS: reginfo.ganch = strbeg\n"));
2182         }
2183     }
2184     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2185         /* We have to be careful. If the previous successful match
2186            was from this regex we don't want a subsequent partially
2187            successful match to clobber the old results.
2188            So when we detect this possibility we add a swap buffer
2189            to the re, and switch the buffer each match. If we fail
2190            we switch it back, otherwise we leave it swapped.
2191         */
2192         swap = prog->offs;
2193         /* do we need a save destructor here for eval dies? */
2194         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2195     }
2196     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2197         re_scream_pos_data d;
2198
2199         d.scream_olds = &scream_olds;
2200         d.scream_pos = &scream_pos;
2201         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2202         if (!s) {
2203             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2204             goto phooey;        /* not present */
2205         }
2206     }
2207
2208
2209
2210     /* Simplest case:  anchored match need be tried only once. */
2211     /*  [unless only anchor is BOL and multiline is set] */
2212     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2213         if (s == startpos && regtry(&reginfo, &startpos))
2214             goto got_it;
2215         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2216                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2217         {
2218             char *end;
2219
2220             if (minlen)
2221                 dontbother = minlen - 1;
2222             end = HOP3c(strend, -dontbother, strbeg) - 1;
2223             /* for multiline we only have to try after newlines */
2224             if (prog->check_substr || prog->check_utf8) {
2225                 /* because of the goto we can not easily reuse the macros for bifurcating the
2226                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2227                 if (utf8_target) {
2228                     if (s == startpos)
2229                         goto after_try_utf8;
2230                     while (1) {
2231                         if (regtry(&reginfo, &s)) {
2232                             goto got_it;
2233                         }
2234                       after_try_utf8:
2235                         if (s > end) {
2236                             goto phooey;
2237                         }
2238                         if (prog->extflags & RXf_USE_INTUIT) {
2239                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2240                             if (!s) {
2241                                 goto phooey;
2242                             }
2243                         }
2244                         else {
2245                             s += UTF8SKIP(s);
2246                         }
2247                     }
2248                 } /* end search for check string in unicode */
2249                 else {
2250                     if (s == startpos) {
2251                         goto after_try_latin;
2252                     }
2253                     while (1) {
2254                         if (regtry(&reginfo, &s)) {
2255                             goto got_it;
2256                         }
2257                       after_try_latin:
2258                         if (s > end) {
2259                             goto phooey;
2260                         }
2261                         if (prog->extflags & RXf_USE_INTUIT) {
2262                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2263                             if (!s) {
2264                                 goto phooey;
2265                             }
2266                         }
2267                         else {
2268                             s++;
2269                         }
2270                     }
2271                 } /* end search for check string in latin*/
2272             } /* end search for check string */
2273             else { /* search for newline */
2274                 if (s > startpos) {
2275                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2276                     s--;
2277                 }
2278                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2279                 while (s < end) {
2280                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2281                         if (regtry(&reginfo, &s))
2282                             goto got_it;
2283                     }
2284                 }
2285             } /* end search for newline */
2286         } /* end anchored/multiline check string search */
2287         goto phooey;
2288     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2289     {
2290         /* the warning about reginfo.ganch being used without initialization
2291            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2292            and we only enter this block when the same bit is set. */
2293         char *tmp_s = reginfo.ganch - prog->gofs;
2294
2295         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2296             goto got_it;
2297         goto phooey;
2298     }
2299
2300     /* Messy cases:  unanchored match. */
2301     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2302         /* we have /x+whatever/ */
2303         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2304         char ch;
2305 #ifdef DEBUGGING
2306         int did_match = 0;
2307 #endif
2308         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2309             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2310         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2311
2312         if (utf8_target) {
2313             REXEC_FBC_SCAN(
2314                 if (*s == ch) {
2315                     DEBUG_EXECUTE_r( did_match = 1 );
2316                     if (regtry(&reginfo, &s)) goto got_it;
2317                     s += UTF8SKIP(s);
2318                     while (s < strend && *s == ch)
2319                         s += UTF8SKIP(s);
2320                 }
2321             );
2322         }
2323         else {
2324             REXEC_FBC_SCAN(
2325                 if (*s == ch) {
2326                     DEBUG_EXECUTE_r( did_match = 1 );
2327                     if (regtry(&reginfo, &s)) goto got_it;
2328                     s++;
2329                     while (s < strend && *s == ch)
2330                         s++;
2331                 }
2332             );
2333         }
2334         DEBUG_EXECUTE_r(if (!did_match)
2335                 PerlIO_printf(Perl_debug_log,
2336                                   "Did not find anchored character...\n")
2337                );
2338     }
2339     else if (prog->anchored_substr != NULL
2340               || prog->anchored_utf8 != NULL
2341               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2342                   && prog->float_max_offset < strend - s)) {
2343         SV *must;
2344         I32 back_max;
2345         I32 back_min;
2346         char *last;
2347         char *last1;            /* Last position checked before */
2348 #ifdef DEBUGGING
2349         int did_match = 0;
2350 #endif
2351         if (prog->anchored_substr || prog->anchored_utf8) {
2352             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2353                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2354             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2355             back_max = back_min = prog->anchored_offset;
2356         } else {
2357             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2358                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2359             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2360             back_max = prog->float_max_offset;
2361             back_min = prog->float_min_offset;
2362         }
2363
2364
2365         if (must == &PL_sv_undef)
2366             /* could not downgrade utf8 check substring, so must fail */
2367             goto phooey;
2368
2369         if (back_min<0) {
2370             last = strend;
2371         } else {
2372             last = HOP3c(strend,        /* Cannot start after this */
2373                   -(I32)(CHR_SVLEN(must)
2374                          - (SvTAIL(must) != 0) + back_min), strbeg);
2375         }
2376         if (s > PL_bostr)
2377             last1 = HOPc(s, -1);
2378         else
2379             last1 = s - 1;      /* bogus */
2380
2381         /* XXXX check_substr already used to find "s", can optimize if
2382            check_substr==must. */
2383         scream_pos = -1;
2384         dontbother = end_shift;
2385         strend = HOPc(strend, -dontbother);
2386         while ( (s <= last) &&
2387                 ((flags & REXEC_SCREAM)
2388                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2389                                     end_shift, &scream_pos, 0))
2390                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2391                                   (unsigned char*)strend, must,
2392                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2393             /* we may be pointing at the wrong string */
2394             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2395                 s = strbeg + (s - SvPVX_const(sv));
2396             DEBUG_EXECUTE_r( did_match = 1 );
2397             if (HOPc(s, -back_max) > last1) {
2398                 last1 = HOPc(s, -back_min);
2399                 s = HOPc(s, -back_max);
2400             }
2401             else {
2402                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2403
2404                 last1 = HOPc(s, -back_min);
2405                 s = t;
2406             }
2407             if (utf8_target) {
2408                 while (s <= last1) {
2409                     if (regtry(&reginfo, &s))
2410                         goto got_it;
2411                     s += UTF8SKIP(s);
2412                 }
2413             }
2414             else {
2415                 while (s <= last1) {
2416                     if (regtry(&reginfo, &s))
2417                         goto got_it;
2418                     s++;
2419                 }
2420             }
2421         }
2422         DEBUG_EXECUTE_r(if (!did_match) {
2423             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2424                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2425             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2426                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2427                                ? "anchored" : "floating"),
2428                 quoted, RE_SV_TAIL(must));
2429         });
2430         goto phooey;
2431     }
2432     else if ( (c = progi->regstclass) ) {
2433         if (minlen) {
2434             const OPCODE op = OP(progi->regstclass);
2435             /* don't bother with what can't match */
2436             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2437                 strend = HOPc(strend, -(minlen - 1));
2438         }
2439         DEBUG_EXECUTE_r({
2440             SV * const prop = sv_newmortal();
2441             regprop(prog, prop, c);
2442             {
2443                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2444                     s,strend-s,60);
2445                 PerlIO_printf(Perl_debug_log,
2446                     "Matching stclass %.*s against %s (%d bytes)\n",
2447                     (int)SvCUR(prop), SvPVX_const(prop),
2448                      quoted, (int)(strend - s));
2449             }
2450         });
2451         if (find_byclass(prog, c, s, strend, &reginfo))
2452             goto got_it;
2453         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2454     }
2455     else {
2456         dontbother = 0;
2457         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2458             /* Trim the end. */
2459             char *last;
2460             SV* float_real;
2461
2462             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2463                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2464             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2465
2466             if (flags & REXEC_SCREAM) {
2467                 last = screaminstr(sv, float_real, s - strbeg,
2468                                    end_shift, &scream_pos, 1); /* last one */
2469                 if (!last)
2470                     last = scream_olds; /* Only one occurrence. */
2471                 /* we may be pointing at the wrong string */
2472                 else if (RXp_MATCH_COPIED(prog))
2473                     s = strbeg + (s - SvPVX_const(sv));
2474             }
2475             else {
2476                 STRLEN len;
2477                 const char * const little = SvPV_const(float_real, len);
2478
2479                 if (SvTAIL(float_real)) {
2480                     if (memEQ(strend - len + 1, little, len - 1))
2481                         last = strend - len + 1;
2482                     else if (!multiline)
2483                         last = memEQ(strend - len, little, len)
2484                             ? strend - len : NULL;
2485                     else
2486                         goto find_last;
2487                 } else {
2488                   find_last:
2489                     if (len)
2490                         last = rninstr(s, strend, little, little + len);
2491                     else
2492                         last = strend;  /* matching "$" */
2493                 }
2494             }
2495             if (last == NULL) {
2496                 DEBUG_EXECUTE_r(
2497                     PerlIO_printf(Perl_debug_log,
2498                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2499                         PL_colors[4], PL_colors[5]));
2500                 goto phooey; /* Should not happen! */
2501             }
2502             dontbother = strend - last + prog->float_min_offset;
2503         }
2504         if (minlen && (dontbother < minlen))
2505             dontbother = minlen - 1;
2506         strend -= dontbother;              /* this one's always in bytes! */
2507         /* We don't know much -- general case. */
2508         if (utf8_target) {
2509             for (;;) {
2510                 if (regtry(&reginfo, &s))
2511                     goto got_it;
2512                 if (s >= strend)
2513                     break;
2514                 s += UTF8SKIP(s);
2515             };
2516         }
2517         else {
2518             do {
2519                 if (regtry(&reginfo, &s))
2520                     goto got_it;
2521             } while (s++ < strend);
2522         }
2523     }
2524
2525     /* Failure. */
2526     goto phooey;
2527
2528 got_it:
2529     Safefree(swap);
2530     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2531
2532     if (PL_reg_eval_set)
2533         restore_pos(aTHX_ prog);
2534     if (RXp_PAREN_NAMES(prog))
2535         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2536
2537     /* make sure $`, $&, $', and $digit will work later */
2538     if ( !(flags & REXEC_NOT_FIRST) ) {
2539         RX_MATCH_COPY_FREE(rx);
2540         if (flags & REXEC_COPY_STR) {
2541             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2542 #ifdef PERL_OLD_COPY_ON_WRITE
2543             if ((SvIsCOW(sv)
2544                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2545                 if (DEBUG_C_TEST) {
2546                     PerlIO_printf(Perl_debug_log,
2547                                   "Copy on write: regexp capture, type %d\n",
2548                                   (int) SvTYPE(sv));
2549                 }
2550                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2551                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2552                 assert (SvPOKp(prog->saved_copy));
2553             } else
2554 #endif
2555             {
2556                 RX_MATCH_COPIED_on(rx);
2557                 s = savepvn(strbeg, i);
2558                 prog->subbeg = s;
2559             }
2560             prog->sublen = i;
2561         }
2562         else {
2563             prog->subbeg = strbeg;
2564             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2565         }
2566     }
2567
2568     return 1;
2569
2570 phooey:
2571     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2572                           PL_colors[4], PL_colors[5]));
2573     if (PL_reg_eval_set)
2574         restore_pos(aTHX_ prog);
2575     if (swap) {
2576         /* we failed :-( roll it back */
2577         Safefree(prog->offs);
2578         prog->offs = swap;
2579     }
2580
2581     return 0;
2582 }
2583
2584
2585 /*
2586  - regtry - try match at specific point
2587  */
2588 STATIC I32                      /* 0 failure, 1 success */
2589 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2590 {
2591     dVAR;
2592     CHECKPOINT lastcp;
2593     REGEXP *const rx = reginfo->prog;
2594     regexp *const prog = (struct regexp *)SvANY(rx);
2595     RXi_GET_DECL(prog,progi);
2596     GET_RE_DEBUG_FLAGS_DECL;
2597
2598     PERL_ARGS_ASSERT_REGTRY;
2599
2600     reginfo->cutpoint=NULL;
2601
2602     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2603         MAGIC *mg;
2604
2605         PL_reg_eval_set = RS_init;
2606         DEBUG_EXECUTE_r(DEBUG_s(
2607             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2608                           (IV)(PL_stack_sp - PL_stack_base));
2609             ));
2610         SAVESTACK_CXPOS();
2611         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2612         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2613         SAVETMPS;
2614         /* Apparently this is not needed, judging by wantarray. */
2615         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2616            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2617
2618         if (reginfo->sv) {
2619             /* Make $_ available to executed code. */
2620             if (reginfo->sv != DEFSV) {
2621                 SAVE_DEFSV;
2622                 DEFSV_set(reginfo->sv);
2623             }
2624
2625             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2626                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2627                 /* prepare for quick setting of pos */
2628 #ifdef PERL_OLD_COPY_ON_WRITE
2629                 if (SvIsCOW(reginfo->sv))
2630                     sv_force_normal_flags(reginfo->sv, 0);
2631 #endif
2632                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2633                                  &PL_vtbl_mglob, NULL, 0);
2634                 mg->mg_len = -1;
2635             }
2636             PL_reg_magic    = mg;
2637             PL_reg_oldpos   = mg->mg_len;
2638             SAVEDESTRUCTOR_X(restore_pos, prog);
2639         }
2640         if (!PL_reg_curpm) {
2641             Newxz(PL_reg_curpm, 1, PMOP);
2642 #ifdef USE_ITHREADS
2643             {
2644                 SV* const repointer = &PL_sv_undef;
2645                 /* this regexp is also owned by the new PL_reg_curpm, which
2646                    will try to free it.  */
2647                 av_push(PL_regex_padav, repointer);
2648                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2649                 PL_regex_pad = AvARRAY(PL_regex_padav);
2650             }
2651 #endif
2652         }
2653 #ifdef USE_ITHREADS
2654         /* It seems that non-ithreads works both with and without this code.
2655            So for efficiency reasons it seems best not to have the code
2656            compiled when it is not needed.  */
2657         /* This is safe against NULLs: */
2658         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2659         /* PM_reg_curpm owns a reference to this regexp.  */
2660         ReREFCNT_inc(rx);
2661 #endif
2662         PM_SETRE(PL_reg_curpm, rx);
2663         PL_reg_oldcurpm = PL_curpm;
2664         PL_curpm = PL_reg_curpm;
2665         if (RXp_MATCH_COPIED(prog)) {
2666             /*  Here is a serious problem: we cannot rewrite subbeg,
2667                 since it may be needed if this match fails.  Thus
2668                 $` inside (?{}) could fail... */
2669             PL_reg_oldsaved = prog->subbeg;
2670             PL_reg_oldsavedlen = prog->sublen;
2671 #ifdef PERL_OLD_COPY_ON_WRITE
2672             PL_nrs = prog->saved_copy;
2673 #endif
2674             RXp_MATCH_COPIED_off(prog);
2675         }
2676         else
2677             PL_reg_oldsaved = NULL;
2678         prog->subbeg = PL_bostr;
2679         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2680     }
2681     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2682     prog->offs[0].start = *startpos - PL_bostr;
2683     PL_reginput = *startpos;
2684     PL_reglastparen = &prog->lastparen;
2685     PL_reglastcloseparen = &prog->lastcloseparen;
2686     prog->lastparen = 0;
2687     prog->lastcloseparen = 0;
2688     PL_regsize = 0;
2689     PL_regoffs = prog->offs;
2690     if (PL_reg_start_tmpl <= prog->nparens) {
2691         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2692         if(PL_reg_start_tmp)
2693             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2694         else
2695             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2696     }
2697
2698     /* XXXX What this code is doing here?!!!  There should be no need
2699        to do this again and again, PL_reglastparen should take care of
2700        this!  --ilya*/
2701
2702     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2703      * Actually, the code in regcppop() (which Ilya may be meaning by
2704      * PL_reglastparen), is not needed at all by the test suite
2705      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2706      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2707      * Meanwhile, this code *is* needed for the
2708      * above-mentioned test suite tests to succeed.  The common theme
2709      * on those tests seems to be returning null fields from matches.
2710      * --jhi updated by dapm */
2711 #if 1
2712     if (prog->nparens) {
2713         regexp_paren_pair *pp = PL_regoffs;
2714         register I32 i;
2715         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2716             ++pp;
2717             pp->start = -1;
2718             pp->end = -1;
2719         }
2720     }
2721 #endif
2722     REGCP_SET(lastcp);
2723     if (regmatch(reginfo, progi->program + 1)) {
2724         PL_regoffs[0].end = PL_reginput - PL_bostr;
2725         return 1;
2726     }
2727     if (reginfo->cutpoint)
2728         *startpos= reginfo->cutpoint;
2729     REGCP_UNWIND(lastcp);
2730     return 0;
2731 }
2732
2733
2734 #define sayYES goto yes
2735 #define sayNO goto no
2736 #define sayNO_SILENT goto no_silent
2737
2738 /* we dont use STMT_START/END here because it leads to
2739    "unreachable code" warnings, which are bogus, but distracting. */
2740 #define CACHEsayNO \
2741     if (ST.cache_mask) \
2742        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2743     sayNO
2744
2745 /* this is used to determine how far from the left messages like
2746    'failed...' are printed. It should be set such that messages
2747    are inline with the regop output that created them.
2748 */
2749 #define REPORT_CODE_OFF 32
2750
2751
2752 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2753 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2754
2755 #define SLAB_FIRST(s) (&(s)->states[0])
2756 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2757
2758 /* grab a new slab and return the first slot in it */
2759
2760 STATIC regmatch_state *
2761 S_push_slab(pTHX)
2762 {
2763 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2764     dMY_CXT;
2765 #endif
2766     regmatch_slab *s = PL_regmatch_slab->next;
2767     if (!s) {
2768         Newx(s, 1, regmatch_slab);
2769         s->prev = PL_regmatch_slab;
2770         s->next = NULL;
2771         PL_regmatch_slab->next = s;
2772     }
2773     PL_regmatch_slab = s;
2774     return SLAB_FIRST(s);
2775 }
2776
2777
2778 /* push a new state then goto it */
2779
2780 #define PUSH_STATE_GOTO(state, node) \
2781     scan = node; \
2782     st->resume_state = state; \
2783     goto push_state;
2784
2785 /* push a new state with success backtracking, then goto it */
2786
2787 #define PUSH_YES_STATE_GOTO(state, node) \
2788     scan = node; \
2789     st->resume_state = state; \
2790     goto push_yes_state;
2791
2792
2793
2794 /*
2795
2796 regmatch() - main matching routine
2797
2798 This is basically one big switch statement in a loop. We execute an op,
2799 set 'next' to point the next op, and continue. If we come to a point which
2800 we may need to backtrack to on failure such as (A|B|C), we push a
2801 backtrack state onto the backtrack stack. On failure, we pop the top
2802 state, and re-enter the loop at the state indicated. If there are no more
2803 states to pop, we return failure.
2804
2805 Sometimes we also need to backtrack on success; for example /A+/, where
2806 after successfully matching one A, we need to go back and try to
2807 match another one; similarly for lookahead assertions: if the assertion
2808 completes successfully, we backtrack to the state just before the assertion
2809 and then carry on.  In these cases, the pushed state is marked as
2810 'backtrack on success too'. This marking is in fact done by a chain of
2811 pointers, each pointing to the previous 'yes' state. On success, we pop to
2812 the nearest yes state, discarding any intermediate failure-only states.
2813 Sometimes a yes state is pushed just to force some cleanup code to be
2814 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2815 it to free the inner regex.
2816
2817 Note that failure backtracking rewinds the cursor position, while
2818 success backtracking leaves it alone.
2819
2820 A pattern is complete when the END op is executed, while a subpattern
2821 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2822 ops trigger the "pop to last yes state if any, otherwise return true"
2823 behaviour.
2824
2825 A common convention in this function is to use A and B to refer to the two
2826 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2827 the subpattern to be matched possibly multiple times, while B is the entire
2828 rest of the pattern. Variable and state names reflect this convention.
2829
2830 The states in the main switch are the union of ops and failure/success of
2831 substates associated with with that op.  For example, IFMATCH is the op
2832 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2833 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2834 successfully matched A and IFMATCH_A_fail is a state saying that we have
2835 just failed to match A. Resume states always come in pairs. The backtrack
2836 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2837 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2838 on success or failure.
2839
2840 The struct that holds a backtracking state is actually a big union, with
2841 one variant for each major type of op. The variable st points to the
2842 top-most backtrack struct. To make the code clearer, within each
2843 block of code we #define ST to alias the relevant union.
2844
2845 Here's a concrete example of a (vastly oversimplified) IFMATCH
2846 implementation:
2847
2848     switch (state) {
2849     ....
2850
2851 #define ST st->u.ifmatch
2852
2853     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2854         ST.foo = ...; // some state we wish to save
2855         ...
2856         // push a yes backtrack state with a resume value of
2857         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2858         // first node of A:
2859         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2860         // NOTREACHED
2861
2862     case IFMATCH_A: // we have successfully executed A; now continue with B
2863         next = B;
2864         bar = ST.foo; // do something with the preserved value
2865         break;
2866
2867     case IFMATCH_A_fail: // A failed, so the assertion failed
2868         ...;   // do some housekeeping, then ...
2869         sayNO; // propagate the failure
2870
2871 #undef ST
2872
2873     ...
2874     }
2875
2876 For any old-timers reading this who are familiar with the old recursive
2877 approach, the code above is equivalent to:
2878
2879     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2880     {
2881         int foo = ...
2882         ...
2883         if (regmatch(A)) {
2884             next = B;
2885             bar = foo;
2886             break;
2887         }
2888         ...;   // do some housekeeping, then ...
2889         sayNO; // propagate the failure
2890     }
2891
2892 The topmost backtrack state, pointed to by st, is usually free. If you
2893 want to claim it, populate any ST.foo fields in it with values you wish to
2894 save, then do one of
2895
2896         PUSH_STATE_GOTO(resume_state, node);
2897         PUSH_YES_STATE_GOTO(resume_state, node);
2898
2899 which sets that backtrack state's resume value to 'resume_state', pushes a
2900 new free entry to the top of the backtrack stack, then goes to 'node'.
2901 On backtracking, the free slot is popped, and the saved state becomes the
2902 new free state. An ST.foo field in this new top state can be temporarily
2903 accessed to retrieve values, but once the main loop is re-entered, it
2904 becomes available for reuse.
2905
2906 Note that the depth of the backtrack stack constantly increases during the
2907 left-to-right execution of the pattern, rather than going up and down with
2908 the pattern nesting. For example the stack is at its maximum at Z at the
2909 end of the pattern, rather than at X in the following:
2910
2911     /(((X)+)+)+....(Y)+....Z/
2912
2913 The only exceptions to this are lookahead/behind assertions and the cut,
2914 (?>A), which pop all the backtrack states associated with A before
2915 continuing.
2916
2917 Backtrack state structs are allocated in slabs of about 4K in size.
2918 PL_regmatch_state and st always point to the currently active state,
2919 and PL_regmatch_slab points to the slab currently containing
2920 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2921 allocated, and is never freed until interpreter destruction. When the slab
2922 is full, a new one is allocated and chained to the end. At exit from
2923 regmatch(), slabs allocated since entry are freed.
2924
2925 */
2926
2927
2928 #define DEBUG_STATE_pp(pp)                                  \
2929     DEBUG_STATE_r({                                         \
2930         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2931         PerlIO_printf(Perl_debug_log,                       \
2932             "    %*s"pp" %s%s%s%s%s\n",                     \
2933             depth*2, "",                                    \
2934             PL_reg_name[st->resume_state],                     \
2935             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2936             ((st==yes_state) ? "Y" : ""),                   \
2937             ((st==mark_state) ? "M" : ""),                  \
2938             ((st==yes_state||st==mark_state) ? "]" : "")    \
2939         );                                                  \
2940     });
2941
2942
2943 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2944
2945 #ifdef DEBUGGING
2946
2947 STATIC void
2948 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2949     const char *start, const char *end, const char *blurb)
2950 {
2951     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2952
2953     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2954
2955     if (!PL_colorset)
2956             reginitcolors();
2957     {
2958         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2959             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2960
2961         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2962             start, end - start, 60);
2963
2964         PerlIO_printf(Perl_debug_log,
2965             "%s%s REx%s %s against %s\n",
2966                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2967
2968         if (utf8_target||utf8_pat)
2969             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2970                 utf8_pat ? "pattern" : "",
2971                 utf8_pat && utf8_target ? " and " : "",
2972                 utf8_target ? "string" : ""
2973             );
2974     }
2975 }
2976
2977 STATIC void
2978 S_dump_exec_pos(pTHX_ const char *locinput,
2979                       const regnode *scan,
2980                       const char *loc_regeol,
2981                       const char *loc_bostr,
2982                       const char *loc_reg_starttry,
2983                       const bool utf8_target)
2984 {
2985     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2986     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2987     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2988     /* The part of the string before starttry has one color
2989        (pref0_len chars), between starttry and current
2990        position another one (pref_len - pref0_len chars),
2991        after the current position the third one.
2992        We assume that pref0_len <= pref_len, otherwise we
2993        decrease pref0_len.  */
2994     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2995         ? (5 + taill) - l : locinput - loc_bostr;
2996     int pref0_len;
2997
2998     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2999
3000     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3001         pref_len++;
3002     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3003     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3004         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3005               ? (5 + taill) - pref_len : loc_regeol - locinput);
3006     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3007         l--;
3008     if (pref0_len < 0)
3009         pref0_len = 0;
3010     if (pref0_len > pref_len)
3011         pref0_len = pref_len;
3012     {
3013         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3014
3015         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3016             (locinput - pref_len),pref0_len, 60, 4, 5);
3017
3018         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3019                     (locinput - pref_len + pref0_len),
3020                     pref_len - pref0_len, 60, 2, 3);
3021
3022         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3023                     locinput, loc_regeol - locinput, 10, 0, 1);
3024
3025         const STRLEN tlen=len0+len1+len2;
3026         PerlIO_printf(Perl_debug_log,
3027                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3028                     (IV)(locinput - loc_bostr),
3029                     len0, s0,
3030                     len1, s1,
3031                     (docolor ? "" : "> <"),
3032                     len2, s2,
3033                     (int)(tlen > 19 ? 0 :  19 - tlen),
3034                     "");
3035     }
3036 }
3037
3038 #endif
3039
3040 /* reg_check_named_buff_matched()
3041  * Checks to see if a named buffer has matched. The data array of
3042  * buffer numbers corresponding to the buffer is expected to reside
3043  * in the regexp->data->data array in the slot stored in the ARG() of
3044  * node involved. Note that this routine doesn't actually care about the
3045  * name, that information is not preserved from compilation to execution.
3046  * Returns the index of the leftmost defined buffer with the given name
3047  * or 0 if non of the buffers matched.
3048  */
3049 STATIC I32
3050 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3051 {
3052     I32 n;
3053     RXi_GET_DECL(rex,rexi);
3054     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3055     I32 *nums=(I32*)SvPVX(sv_dat);
3056
3057     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3058
3059     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3060         if ((I32)*PL_reglastparen >= nums[n] &&
3061             PL_regoffs[nums[n]].end != -1)
3062         {
3063             return nums[n];
3064         }
3065     }
3066     return 0;
3067 }
3068
3069
3070 /* free all slabs above current one  - called during LEAVE_SCOPE */
3071
3072 STATIC void
3073 S_clear_backtrack_stack(pTHX_ void *p)
3074 {
3075     regmatch_slab *s = PL_regmatch_slab->next;
3076     PERL_UNUSED_ARG(p);
3077
3078     if (!s)
3079         return;
3080     PL_regmatch_slab->next = NULL;
3081     while (s) {
3082         regmatch_slab * const osl = s;
3083         s = s->next;
3084         Safefree(osl);
3085     }
3086 }
3087
3088
3089 #define SETREX(Re1,Re2) \
3090     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3091     Re1 = (Re2)
3092
3093 STATIC I32                      /* 0 failure, 1 success */
3094 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3095 {
3096 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3097     dMY_CXT;
3098 #endif
3099     dVAR;
3100     register const bool utf8_target = PL_reg_match_utf8;
3101     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3102     REGEXP *rex_sv = reginfo->prog;
3103     regexp *rex = (struct regexp *)SvANY(rex_sv);
3104     RXi_GET_DECL(rex,rexi);
3105     I32 oldsave;
3106     /* the current state. This is a cached copy of PL_regmatch_state */
3107     register regmatch_state *st;
3108     /* cache heavy used fields of st in registers */
3109     register regnode *scan;
3110     register regnode *next;
3111     register U32 n = 0; /* general value; init to avoid compiler warning */
3112     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3113     register char *locinput = PL_reginput;
3114     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3115
3116     bool result = 0;        /* return value of S_regmatch */
3117     int depth = 0;          /* depth of backtrack stack */
3118     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3119     const U32 max_nochange_depth =
3120         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3121         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3122     regmatch_state *yes_state = NULL; /* state to pop to on success of
3123                                                             subpattern */
3124     /* mark_state piggy backs on the yes_state logic so that when we unwind
3125        the stack on success we can update the mark_state as we go */
3126     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3127     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3128     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3129     U32 state_num;
3130     bool no_final = 0;      /* prevent failure from backtracking? */
3131     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3132     char *startpoint = PL_reginput;
3133     SV *popmark = NULL;     /* are we looking for a mark? */
3134     SV *sv_commit = NULL;   /* last mark name seen in failure */
3135     SV *sv_yes_mark = NULL; /* last mark name we have seen
3136                                during a successful match */
3137     U32 lastopen = 0;       /* last open we saw */
3138     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3139     SV* const oreplsv = GvSV(PL_replgv);
3140     /* these three flags are set by various ops to signal information to
3141      * the very next op. They have a useful lifetime of exactly one loop
3142      * iteration, and are not preserved or restored by state pushes/pops
3143      */
3144     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3145     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3146     int logical = 0;        /* the following EVAL is:
3147                                 0: (?{...})
3148                                 1: (?(?{...})X|Y)
3149                                 2: (??{...})
3150                                or the following IFMATCH/UNLESSM is:
3151                                 false: plain (?=foo)
3152                                 true:  used as a condition: (?(?=foo))
3153                             */
3154 #ifdef DEBUGGING
3155     GET_RE_DEBUG_FLAGS_DECL;
3156 #endif
3157
3158     PERL_ARGS_ASSERT_REGMATCH;
3159
3160     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3161             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3162     }));
3163     /* on first ever call to regmatch, allocate first slab */
3164     if (!PL_regmatch_slab) {
3165         Newx(PL_regmatch_slab, 1, regmatch_slab);
3166         PL_regmatch_slab->prev = NULL;
3167         PL_regmatch_slab->next = NULL;
3168         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3169     }
3170
3171     oldsave = PL_savestack_ix;
3172     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3173     SAVEVPTR(PL_regmatch_slab);
3174     SAVEVPTR(PL_regmatch_state);
3175
3176     /* grab next free state slot */
3177     st = ++PL_regmatch_state;
3178     if (st >  SLAB_LAST(PL_regmatch_slab))
3179         st = PL_regmatch_state = S_push_slab(aTHX);
3180
3181     /* Note that nextchr is a byte even in UTF */
3182     nextchr = UCHARAT(locinput);
3183     scan = prog;
3184     while (scan != NULL) {
3185
3186         DEBUG_EXECUTE_r( {
3187             SV * const prop = sv_newmortal();
3188             regnode *rnext=regnext(scan);
3189             DUMP_EXEC_POS( locinput, scan, utf8_target );
3190             regprop(rex, prop, scan);
3191
3192             PerlIO_printf(Perl_debug_log,
3193                     "%3"IVdf":%*s%s(%"IVdf")\n",
3194                     (IV)(scan - rexi->program), depth*2, "",
3195                     SvPVX_const(prop),
3196                     (PL_regkind[OP(scan)] == END || !rnext) ?
3197                         0 : (IV)(rnext - rexi->program));
3198         });
3199
3200         next = scan + NEXT_OFF(scan);
3201         if (next == scan)
3202             next = NULL;
3203         state_num = OP(scan);
3204
3205       reenter_switch:
3206
3207         assert(PL_reglastparen == &rex->lastparen);
3208         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3209         assert(PL_regoffs == rex->offs);
3210
3211         switch (state_num) {
3212         case BOL:
3213             if (locinput == PL_bostr)
3214             {
3215                 /* reginfo->till = reginfo->bol; */
3216                 break;
3217             }
3218             sayNO;
3219         case MBOL:
3220             if (locinput == PL_bostr ||
3221                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3222             {
3223                 break;
3224             }
3225             sayNO;
3226         case SBOL:
3227             if (locinput == PL_bostr)
3228                 break;
3229             sayNO;
3230         case GPOS:
3231             if (locinput == reginfo->ganch)
3232                 break;
3233             sayNO;
3234
3235         case KEEPS:
3236             /* update the startpoint */
3237             st->u.keeper.val = PL_regoffs[0].start;
3238             PL_reginput = locinput;
3239             PL_regoffs[0].start = locinput - PL_bostr;
3240             PUSH_STATE_GOTO(KEEPS_next, next);
3241             /*NOT-REACHED*/
3242         case KEEPS_next_fail:
3243             /* rollback the start point change */
3244             PL_regoffs[0].start = st->u.keeper.val;
3245             sayNO_SILENT;
3246             /*NOT-REACHED*/
3247         case EOL:
3248                 goto seol;
3249         case MEOL:
3250             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3251                 sayNO;
3252             break;
3253         case SEOL:
3254           seol:
3255             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3256                 sayNO;
3257             if (PL_regeol - locinput > 1)
3258                 sayNO;
3259             break;
3260         case EOS:
3261             if (PL_regeol != locinput)
3262                 sayNO;
3263             break;
3264         case SANY:
3265             if (!nextchr && locinput >= PL_regeol)
3266                 sayNO;
3267             if (utf8_target) {
3268                 locinput += PL_utf8skip[nextchr];
3269                 if (locinput > PL_regeol)
3270                     sayNO;
3271                 nextchr = UCHARAT(locinput);
3272             }
3273             else
3274                 nextchr = UCHARAT(++locinput);
3275             break;
3276         case CANY:
3277             if (!nextchr && locinput >= PL_regeol)
3278                 sayNO;
3279             nextchr = UCHARAT(++locinput);
3280             break;
3281         case REG_ANY:
3282             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3283                 sayNO;
3284             if (utf8_target) {
3285                 locinput += PL_utf8skip[nextchr];
3286                 if (locinput > PL_regeol)
3287                     sayNO;
3288                 nextchr = UCHARAT(locinput);
3289             }
3290             else
3291                 nextchr = UCHARAT(++locinput);
3292             break;
3293
3294 #undef  ST
3295 #define ST st->u.trie
3296         case TRIEC:
3297             /* In this case the charclass data is available inline so
3298                we can fail fast without a lot of extra overhead.
3299              */
3300             if (scan->flags == EXACT || !utf8_target) {
3301                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3302                     DEBUG_EXECUTE_r(
3303                         PerlIO_printf(Perl_debug_log,
3304                                   "%*s  %sfailed to match trie start class...%s\n",
3305                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3306                     );
3307                     sayNO_SILENT;
3308                     /* NOTREACHED */
3309                 }
3310             }
3311             /* FALL THROUGH */
3312         case TRIE:
3313             /* the basic plan of execution of the trie is:
3314              * At the beginning, run though all the states, and
3315              * find the longest-matching word. Also remember the position
3316              * of the shortest matching word. For example, this pattern:
3317              *    1  2 3 4    5
3318              *    ab|a|x|abcd|abc
3319              * when matched against the string "abcde", will generate
3320              * accept states for all words except 3, with the longest
3321              * matching word being 4, and the shortest being 1 (with
3322              * the position being after char 1 of the string).
3323              *
3324              * Then for each matching word, in word order (i.e. 1,2,4,5),
3325              * we run the remainder of the pattern; on each try setting
3326              * the current position to the character following the word,
3327              * returning to try the next word on failure.
3328              *
3329              * We avoid having to build a list of words at runtime by
3330              * using a compile-time structure, wordinfo[].prev, which
3331              * gives, for each word, the previous accepting word (if any).
3332              * In the case above it would contain the mappings 1->2, 2->0,
3333              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3334              * the longest word (4 above), a list of all words, by
3335              * following the list of prev pointers; this gives us the
3336              * unordered list 4,5,1,2. Then given the current word we have
3337              * just tried, we can go through the list and find the
3338              * next-biggest word to try (so if we just failed on word 2,
3339              * the next in the list is 4).
3340              *
3341              * Since at runtime we don't record the matching position in
3342              * the string for each word, we have to work that out for
3343              * each word we're about to process. The wordinfo table holds
3344              * the character length of each word; given that we recorded
3345              * at the start: the position of the shortest word and its
3346              * length in chars, we just need to move the pointer the
3347              * difference between the two char lengths. Depending on
3348              * Unicode status and folding, that's cheap or expensive.
3349              *
3350              * This algorithm is optimised for the case where are only a
3351              * small number of accept states, i.e. 0,1, or maybe 2.
3352              * With lots of accepts states, and having to try all of them,
3353              * it becomes quadratic on number of accept states to find all
3354              * the next words.
3355              */
3356
3357             {
3358                 /* what type of TRIE am I? (utf8 makes this contextual) */
3359                 DECL_TRIE_TYPE(scan);
3360
3361                 /* what trie are we using right now */
3362                 reg_trie_data * const trie
3363                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3364                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3365                 U32 state = trie->startstate;
3366
3367                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3368                     !TRIE_BITMAP_TEST(trie,*locinput)
3369                 ) {
3370                     if (trie->states[ state ].wordnum) {
3371                          DEBUG_EXECUTE_r(
3372                             PerlIO_printf(Perl_debug_log,
3373                                           "%*s  %smatched empty string...%s\n",
3374                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3375                         );
3376                         if (!trie->jump)
3377                             break;
3378                     } else {
3379                         DEBUG_EXECUTE_r(
3380                             PerlIO_printf(Perl_debug_log,
3381                                           "%*s  %sfailed to match trie start class...%s\n",
3382                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3383                         );
3384                         sayNO_SILENT;
3385                    }
3386                 }
3387
3388             {
3389                 U8 *uc = ( U8* )locinput;
3390
3391                 STRLEN len = 0;
3392                 STRLEN foldlen = 0;
3393                 U8 *uscan = (U8*)NULL;
3394                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3395                 U32 charcount = 0; /* how many input chars we have matched */
3396                 U32 accepted = 0; /* have we seen any accepting states? */
3397
3398                 ST.B = next;
3399                 ST.jump = trie->jump;
3400                 ST.me = scan;
3401                 ST.firstpos = NULL;
3402                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3403                 ST.nextword = 0;
3404
3405                 /* fully traverse the TRIE; note the position of the
3406                    shortest accept state and the wordnum of the longest
3407                    accept state */
3408
3409                 while ( state && uc <= (U8*)PL_regeol ) {
3410                     U32 base = trie->states[ state ].trans.base;
3411                     UV uvc = 0;
3412                     U16 charid = 0;
3413                     U16 wordnum;
3414                     wordnum = trie->states[ state ].wordnum;
3415
3416                     if (wordnum) { /* it's an accept state */
3417                         if (!accepted) {
3418                             accepted = 1;
3419                             /* record first match position */
3420                             if (ST.longfold) {
3421                                 ST.firstpos = (U8*)locinput;
3422                                 ST.firstchars = 0;
3423                             }
3424                             else {
3425                                 ST.firstpos = uc;
3426                                 ST.firstchars = charcount;
3427                             }
3428                         }
3429                         if (!ST.nextword || wordnum < ST.nextword)
3430                             ST.nextword = wordnum;
3431                         ST.topword = wordnum;
3432                     }
3433
3434                     DEBUG_TRIE_EXECUTE_r({
3435                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3436                                 PerlIO_printf( Perl_debug_log,
3437                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3438                                     2+depth * 2, "", PL_colors[4],
3439                                     (UV)state, (accepted ? 'Y' : 'N'));
3440                     });
3441
3442                     /* read a char and goto next state */
3443                     if ( base ) {
3444                         I32 offset;
3445                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3446                                              uscan, len, uvc, charid, foldlen,
3447                                              foldbuf, uniflags);
3448                         charcount++;
3449                         if (foldlen>0)
3450                             ST.longfold = TRUE;
3451                         if (charid &&
3452                              ( ((offset =
3453                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3454
3455                              && ((U32)offset < trie->lasttrans)
3456                              && trie->trans[offset].check == state)
3457                         {
3458                             state = trie->trans[offset].next;
3459                         }
3460                         else {
3461                             state = 0;
3462                         }
3463                         uc += len;
3464
3465                     }
3466                     else {
3467                         state = 0;
3468                     }
3469                     DEBUG_TRIE_EXECUTE_r(
3470                         PerlIO_printf( Perl_debug_log,
3471                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3472                             charid, uvc, (UV)state, PL_colors[5] );
3473                     );
3474                 }
3475                 if (!accepted)
3476                    sayNO;
3477
3478                 /* calculate total number of accept states */
3479                 {
3480                     U16 w = ST.topword;
3481                     accepted = 0;
3482                     while (w) {
3483                         w = trie->wordinfo[w].prev;
3484                         accepted++;
3485                     }
3486                     ST.accepted = accepted;
3487                 }
3488
3489                 DEBUG_EXECUTE_r(
3490                     PerlIO_printf( Perl_debug_log,
3491                         "%*s  %sgot %"IVdf" possible matches%s\n",
3492                         REPORT_CODE_OFF + depth * 2, "",
3493                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3494                 );
3495                 goto trie_first_try; /* jump into the fail handler */
3496             }}
3497             /* NOTREACHED */
3498
3499         case TRIE_next_fail: /* we failed - try next alternative */
3500             if ( ST.jump) {
3501                 REGCP_UNWIND(ST.cp);
3502                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3503                     PL_regoffs[n].end = -1;
3504                 *PL_reglastparen = n;
3505             }
3506             if (!--ST.accepted) {
3507                 DEBUG_EXECUTE_r({
3508                     PerlIO_printf( Perl_debug_log,
3509                         "%*s  %sTRIE failed...%s\n",
3510                         REPORT_CODE_OFF+depth*2, "",
3511                         PL_colors[4],
3512                         PL_colors[5] );
3513                 });
3514                 sayNO_SILENT;
3515             }
3516             {
3517                 /* Find next-highest word to process.  Note that this code
3518                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3519                 register U16 min = 0;
3520                 register U16 word;
3521                 register U16 const nextword = ST.nextword;
3522                 register reg_trie_wordinfo * const wordinfo
3523                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3524                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3525                     if (word > nextword && (!min || word < min))
3526                         min = word;
3527                 }
3528                 ST.nextword = min;
3529             }
3530
3531           trie_first_try:
3532             if (do_cutgroup) {
3533                 do_cutgroup = 0;
3534                 no_final = 0;
3535             }
3536
3537             if ( ST.jump) {
3538                 ST.lastparen = *PL_reglastparen;
3539                 REGCP_SET(ST.cp);
3540             }
3541
3542             /* find start char of end of current word */
3543             {
3544                 U32 chars; /* how many chars to skip */
3545                 U8 *uc = ST.firstpos;
3546                 reg_trie_data * const trie
3547                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3548
3549                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3550                             >=  ST.firstchars);
3551                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3552                             - ST.firstchars;
3553
3554                 if (ST.longfold) {
3555                     /* the hard option - fold each char in turn and find
3556                      * its folded length (which may be different */
3557                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3558                     STRLEN foldlen;
3559                     STRLEN len;
3560                     UV uvc;
3561                     U8 *uscan;
3562
3563                     while (chars) {
3564                         if (utf8_target) {
3565                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3566                                                     uniflags);
3567                             uc += len;
3568                         }
3569                         else {
3570                             uvc = *uc;
3571                             uc++;
3572                         }
3573                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3574                         uscan = foldbuf;
3575                         while (foldlen) {
3576                             if (!--chars)
3577                                 break;
3578                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3579                                             uniflags);
3580                             uscan += len;
3581                             foldlen -= len;
3582                         }
3583                     }
3584                 }
3585                 else {
3586                     if (utf8_target)
3587                         while (chars--)
3588                             uc += UTF8SKIP(uc);
3589                     else
3590                         uc += chars;
3591                 }
3592                 PL_reginput = (char *)uc;
3593             }
3594
3595             scan = (ST.jump && ST.jump[ST.nextword])
3596                         ? ST.me + ST.jump[ST.nextword]
3597                         : ST.B;
3598
3599             DEBUG_EXECUTE_r({
3600                 PerlIO_printf( Perl_debug_log,
3601                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3602                     REPORT_CODE_OFF+depth*2, "",
3603                     PL_colors[4],
3604                     ST.nextword,
3605                     PL_colors[5]
3606                     );
3607             });
3608
3609             if (ST.accepted > 1 || has_cutgroup) {
3610                 PUSH_STATE_GOTO(TRIE_next, scan);
3611                 /* NOTREACHED */
3612             }
3613             /* only one choice left - just continue */
3614             DEBUG_EXECUTE_r({
3615                 AV *const trie_words
3616                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3617                 SV ** const tmp = av_fetch( trie_words,
3618                     ST.nextword-1, 0 );
3619                 SV *sv= tmp ? sv_newmortal() : NULL;
3620
3621                 PerlIO_printf( Perl_debug_log,
3622                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3623                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3624                     ST.nextword,
3625                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3626                             PL_colors[0], PL_colors[1],
3627                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3628                         )
3629                     : "not compiled under -Dr",
3630                     PL_colors[5] );
3631             });
3632
3633             locinput = PL_reginput;
3634             nextchr = UCHARAT(locinput);
3635             continue; /* execute rest of RE */
3636             /* NOTREACHED */
3637 #undef  ST
3638
3639         case EXACT: {
3640             char *s = STRING(scan);
3641             ln = STR_LEN(scan);
3642             if (utf8_target != UTF_PATTERN) {
3643                 /* The target and the pattern have differing utf8ness. */
3644                 char *l = locinput;
3645                 const char * const e = s + ln;
3646
3647                 if (utf8_target) {
3648                     /* The target is utf8, the pattern is not utf8. */
3649                     while (s < e) {
3650                         STRLEN ulen;
3651                         if (l >= PL_regeol)
3652                              sayNO;
3653                         if (NATIVE_TO_UNI(*(U8*)s) !=
3654                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3655                                             uniflags))
3656                              sayNO;
3657                         l += ulen;
3658                         s ++;
3659                     }
3660                 }
3661                 else {
3662                     /* The target is not utf8, the pattern is utf8. */
3663                     while (s < e) {
3664                         STRLEN ulen;
3665                         if (l >= PL_regeol)
3666                             sayNO;
3667                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3668                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3669                                            uniflags))
3670                             sayNO;
3671                         s += ulen;
3672                         l ++;
3673                     }
3674                 }
3675                 locinput = l;
3676                 nextchr = UCHARAT(locinput);
3677                 break;
3678             }
3679             /* The target and the pattern have the same utf8ness. */
3680             /* Inline the first character, for speed. */
3681             if (UCHARAT(s) != nextchr)
3682                 sayNO;
3683             if (PL_regeol - locinput < ln)
3684                 sayNO;
3685             if (ln > 1 && memNE(s, locinput, ln))
3686                 sayNO;
3687             locinput += ln;
3688             nextchr = UCHARAT(locinput);
3689             break;
3690             }
3691         case EXACTFL: {
3692             re_fold_t folder;
3693             const U8 * fold_array;
3694             const char * s;
3695
3696             PL_reg_flags |= RF_tainted;
3697             folder = foldEQ_locale;
3698             fold_array = PL_fold_locale;
3699             goto do_exactf;
3700
3701         case EXACTFU:
3702             folder = foldEQ_latin1;
3703             fold_array = PL_fold_latin1;
3704             goto do_exactf;
3705
3706         case EXACTF:
3707             folder = foldEQ;
3708             fold_array = PL_fold;
3709
3710           do_exactf:
3711             s = STRING(scan);
3712             ln = STR_LEN(scan);
3713
3714             if (utf8_target || UTF_PATTERN) {
3715               /* Either target or the pattern are utf8. */
3716                 const char * const l = locinput;
3717                 char *e = PL_regeol;
3718
3719                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3720                                l, &e, 0,  utf8_target)) {
3721                      /* One more case for the sharp s:
3722                       * pack("U0U*", 0xDF) =~ /ss/i,
3723                       * the 0xC3 0x9F are the UTF-8
3724                       * byte sequence for the U+00DF. */
3725
3726                      if (!(utf8_target &&
3727                            toLOWER(s[0]) == 's' &&
3728                            ln >= 2 &&
3729                            toLOWER(s[1]) == 's' &&
3730                            (U8)l[0] == 0xC3 &&
3731                            e - l >= 2 &&
3732                            (U8)l[1] == 0x9F))
3733                           sayNO;
3734                 }
3735                 locinput = e;
3736                 nextchr = UCHARAT(locinput);
3737                 break;
3738             }
3739
3740             /* Neither the target and the pattern are utf8. */
3741
3742             /* Inline the first character, for speed. */
3743             if (UCHARAT(s) != nextchr &&
3744                 UCHARAT(s) != fold_array[nextchr])
3745             {
3746                 sayNO;
3747             }
3748             if (PL_regeol - locinput < ln)
3749                 sayNO;
3750             if (ln > 1 && ! folder(s, locinput, ln))
3751                 sayNO;
3752             locinput += ln;
3753             nextchr = UCHARAT(locinput);
3754             break;
3755         }
3756
3757         /* XXX Could improve efficiency by separating these all out using a
3758          * macro or in-line function.  At that point regcomp.c would no longer
3759          * have to set the FLAGS fields of these */
3760         case BOUNDL:
3761         case NBOUNDL:
3762             PL_reg_flags |= RF_tainted;
3763             /* FALL THROUGH */
3764         case BOUND:
3765         case BOUNDU:
3766         case BOUNDA:
3767         case NBOUND:
3768         case NBOUNDU:
3769         case NBOUNDA:
3770             /* was last char in word? */
3771             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3772                 if (locinput == PL_bostr)
3773                     ln = '\n';
3774                 else {
3775                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3776
3777                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3778                 }
3779                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3780                     ln = isALNUM_uni(ln);
3781                     LOAD_UTF8_CHARCLASS_ALNUM();
3782                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3783                 }
3784                 else {
3785                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3786                     n = isALNUM_LC_utf8((U8*)locinput);
3787                 }
3788             }
3789             else {
3790
3791                 /* Here the string isn't utf8, or is utf8 and only ascii
3792                  * characters are to match \w.  In the latter case looking at
3793                  * the byte just prior to the current one may be just the final
3794                  * byte of a multi-byte character.  This is ok.  There are two
3795                  * cases:
3796                  * 1) it is a single byte character, and then the test is doing
3797                  *      just what it's supposed to.
3798                  * 2) it is a multi-byte character, in which case the final
3799                  *      byte is never mistakable for ASCII, and so the test
3800                  *      will say it is not a word character, which is the
3801                  *      correct answer. */
3802                 ln = (locinput != PL_bostr) ?
3803                     UCHARAT(locinput - 1) : '\n';
3804                 switch (FLAGS(scan)) {
3805                     case REGEX_UNICODE_CHARSET:
3806                         ln = isWORDCHAR_L1(ln);
3807                         n = isWORDCHAR_L1(nextchr);
3808                         break;
3809                     case REGEX_LOCALE_CHARSET:
3810                         ln = isALNUM_LC(ln);
3811                         n = isALNUM_LC(nextchr);
3812                         break;
3813                     case REGEX_DEPENDS_CHARSET:
3814                         ln = isALNUM(ln);
3815                         n = isALNUM(nextchr);
3816                         break;
3817                     case REGEX_ASCII_RESTRICTED_CHARSET:
3818                         ln = isWORDCHAR_A(ln);
3819                         n = isWORDCHAR_A(nextchr);
3820                         break;
3821                     default:
3822                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3823                         break;
3824                 }
3825             }
3826             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3827              * regcomp.sym */
3828             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3829                     sayNO;
3830             break;
3831         case ANYOFV:
3832         case ANYOF:
3833             if (utf8_target || state_num == ANYOFV) {
3834                 STRLEN inclasslen = PL_regeol - locinput;
3835                 if (locinput >= PL_regeol)
3836                     sayNO;
3837
3838                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3839                     sayNO;
3840                 locinput += inclasslen;
3841                 nextchr = UCHARAT(locinput);
3842                 break;
3843             }
3844             else {
3845                 if (nextchr < 0)
3846                     nextchr = UCHARAT(locinput);
3847                 if (!nextchr && locinput >= PL_regeol)
3848                     sayNO;
3849                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3850                     sayNO;
3851                 nextchr = UCHARAT(++locinput);
3852                 break;
3853             }
3854             break;
3855         /* Special char classes - The defines start on line 129 or so */
3856         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3857                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3858                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3859                   ALNUMA, NALNUMA, isWORDCHAR_A,
3860                   perl_word, "a");
3861
3862         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3863                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3864                   SPACEU, NSPACEU, isSPACE_L1,
3865                   SPACEA, NSPACEA, isSPACE_A,
3866                   perl_space, " ");
3867
3868         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3869                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3870                 DIGITA, NDIGITA, isDIGIT_A,
3871                 posix_digit, "0");
3872
3873         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3874                        a Unicode extended Grapheme Cluster */
3875             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3876               extended Grapheme Cluster is:
3877
3878                CR LF
3879                | Prepend* Begin Extend*
3880                | .
3881
3882                Begin is (Hangul-syllable | ! Control)
3883                Extend is (Grapheme_Extend | Spacing_Mark)
3884                Control is [ GCB_Control CR LF ]
3885
3886                The discussion below shows how the code for CLUMP is derived
3887                from this regex.  Note that most of these concepts are from
3888                property values of the Grapheme Cluster Boundary (GCB) property.
3889                No code point can have multiple property values for a given
3890                property.  Thus a code point in Prepend can't be in Control, but
3891                it must be in !Control.  This is why Control above includes
3892                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3893                property separately, and so can't be in GCB_Control, even though
3894                they logically are controls.  Control is not the same as gc=cc,
3895                but includes format and other characters as well.
3896
3897                The Unicode definition of Hangul-syllable is:
3898                    L+
3899                    | (L* ( ( V | LV ) V* | LVT ) T*)
3900                    | T+
3901                   )
3902                Each of these is a value for the GCB property, and hence must be
3903                disjoint, so the order they are tested is immaterial, so the
3904                above can safely be changed to
3905                    T+
3906                    | L+
3907                    | (L* ( LVT | ( V | LV ) V*) T*)
3908
3909                The last two terms can be combined like this:
3910                    L* ( L
3911                         | (( LVT | ( V | LV ) V*) T*))
3912
3913                And refactored into this:
3914                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3915
3916                That means that if we have seen any L's at all we can quit
3917                there, but if the next character is a LVT, a V or and LV we
3918                should keep going.
3919
3920                There is a subtlety with Prepend* which showed up in testing.
3921                Note that the Begin, and only the Begin is required in:
3922                 | Prepend* Begin Extend*
3923                Also, Begin contains '! Control'.  A Prepend must be a '!
3924                Control', which means it must be a Begin.  What it comes down to
3925                is that if we match Prepend* and then find no suitable Begin
3926                afterwards, that if we backtrack the last Prepend, that one will
3927                be a suitable Begin.
3928             */
3929
3930             if (locinput >= PL_regeol)
3931                 sayNO;
3932             if  (! utf8_target) {
3933
3934                 /* Match either CR LF  or '.', as all the other possibilities
3935                  * require utf8 */
3936                 locinput++;         /* Match the . or CR */
3937                 if (nextchr == '\r'
3938                     && locinput < PL_regeol
3939                     && UCHARAT(locinput) == '\n') locinput++;
3940             }
3941             else {
3942
3943                 /* Utf8: See if is ( CR LF ); already know that locinput <
3944                  * PL_regeol, so locinput+1 is in bounds */
3945                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3946                     locinput += 2;
3947                 }
3948                 else {
3949                     /* In case have to backtrack to beginning, then match '.' */
3950                     char *starting = locinput;
3951
3952                     /* In case have to backtrack the last prepend */
3953                     char *previous_prepend = 0;
3954
3955                     LOAD_UTF8_CHARCLASS_GCB();
3956
3957                     /* Match (prepend)* */
3958                     while (locinput < PL_regeol
3959                            && swash_fetch(PL_utf8_X_prepend,
3960                                           (U8*)locinput, utf8_target))
3961                     {
3962                         previous_prepend = locinput;
3963                         locinput += UTF8SKIP(locinput);
3964                     }
3965
3966                     /* As noted above, if we matched a prepend character, but
3967                      * the next thing won't match, back off the last prepend we
3968                      * matched, as it is guaranteed to match the begin */
3969                     if (previous_prepend
3970                         && (locinput >=  PL_regeol
3971                             || ! swash_fetch(PL_utf8_X_begin,
3972                                              (U8*)locinput, utf8_target)))
3973                     {
3974                         locinput = previous_prepend;
3975                     }
3976
3977                     /* Note that here we know PL_regeol > locinput, as we
3978                      * tested that upon input to this switch case, and if we
3979                      * moved locinput forward, we tested the result just above
3980                      * and it either passed, or we backed off so that it will
3981                      * now pass */
3982                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3983
3984                         /* Here did not match the required 'Begin' in the
3985                          * second term.  So just match the very first
3986                          * character, the '.' of the final term of the regex */
3987                         locinput = starting + UTF8SKIP(starting);
3988                     } else {
3989
3990                         /* Here is the beginning of a character that can have
3991                          * an extender.  It is either a hangul syllable, or a
3992                          * non-control */
3993                         if (swash_fetch(PL_utf8_X_non_hangul,
3994                                         (U8*)locinput, utf8_target))
3995                         {
3996
3997                             /* Here not a Hangul syllable, must be a
3998                              * ('!  * Control') */
3999                             locinput += UTF8SKIP(locinput);
4000                         } else {
4001
4002                             /* Here is a Hangul syllable.  It can be composed
4003                              * of several individual characters.  One
4004                              * possibility is T+ */
4005                             if (swash_fetch(PL_utf8_X_T,
4006                                             (U8*)locinput, utf8_target))
4007                             {
4008                                 while (locinput < PL_regeol
4009                                         && swash_fetch(PL_utf8_X_T,
4010                                                         (U8*)locinput, utf8_target))
4011                                 {
4012                                     locinput += UTF8SKIP(locinput);
4013                                 }
4014                             } else {
4015
4016                                 /* Here, not T+, but is a Hangul.  That means
4017                                  * it is one of the others: L, LV, LVT or V,
4018                                  * and matches:
4019                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4020
4021                                 /* Match L*           */
4022                                 while (locinput < PL_regeol
4023                                         && swash_fetch(PL_utf8_X_L,
4024                                                         (U8*)locinput, utf8_target))
4025                                 {
4026                                     locinput += UTF8SKIP(locinput);
4027                                 }
4028
4029                                 /* Here, have exhausted L*.  If the next
4030                                  * character is not an LV, LVT nor V, it means
4031                                  * we had to have at least one L, so matches L+
4032                                  * in the original equation, we have a complete
4033                                  * hangul syllable.  Are done. */
4034
4035                                 if (locinput < PL_regeol
4036                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4037                                                     (U8*)locinput, utf8_target))
4038                                 {
4039
4040                                     /* Otherwise keep going.  Must be LV, LVT
4041                                      * or V.  See if LVT */
4042                                     if (swash_fetch(PL_utf8_X_LVT,
4043                                                     (U8*)locinput, utf8_target))
4044                                     {
4045                                         locinput += UTF8SKIP(locinput);
4046                                     } else {
4047
4048                                         /* Must be  V or LV.  Take it, then
4049                                          * match V*     */
4050                                         locinput += UTF8SKIP(locinput);
4051                                         while (locinput < PL_regeol
4052                                                 && swash_fetch(PL_utf8_X_V,
4053                                                          (U8*)locinput, utf8_target))
4054                                         {
4055                                             locinput += UTF8SKIP(locinput);
4056                                         }
4057                                     }
4058
4059                                     /* And any of LV, LVT, or V can be followed
4060                                      * by T*            */
4061                                     while (locinput < PL_regeol
4062                                            && swash_fetch(PL_utf8_X_T,
4063                                                            (U8*)locinput,
4064                                                            utf8_target))
4065                                     {
4066                                         locinput += UTF8SKIP(locinput);
4067                                     }
4068                                 }
4069                             }
4070                         }
4071
4072                         /* Match any extender */
4073                         while (locinput < PL_regeol
4074                                 && swash_fetch(PL_utf8_X_extend,
4075                                                 (U8*)locinput, utf8_target))
4076                         {
4077                             locinput += UTF8SKIP(locinput);
4078                         }
4079                     }
4080                 }
4081                 if (locinput > PL_regeol) sayNO;
4082             }
4083             nextchr = UCHARAT(locinput);
4084             break;
4085
4086         case NREFFL:
4087         {   /* The capture buffer cases.  The ones beginning with N for the
4088                named buffers just convert to the equivalent numbered and
4089                pretend they were called as the corresponding numbered buffer
4090                op.  */
4091             /* don't initialize these, it makes C++ unhappy */
4092             char *s;
4093             char type;
4094             re_fold_t folder;
4095             const U8 *fold_array;
4096
4097             PL_reg_flags |= RF_tainted;
4098             folder = foldEQ_locale;
4099             fold_array = PL_fold_locale;
4100             type = REFFL;
4101             goto do_nref;
4102
4103         case NREFFU:
4104             folder = foldEQ_latin1;
4105             fold_array = PL_fold_latin1;
4106             type = REFFU;
4107             goto do_nref;
4108
4109         case NREFF:
4110             folder = foldEQ;
4111             fold_array = PL_fold;
4112             type = REFF;
4113             goto do_nref;
4114
4115         case NREF:
4116             type = REF;
4117             folder = NULL;
4118             fold_array = NULL;
4119           do_nref:
4120
4121             /* For the named back references, find the corresponding buffer
4122              * number */
4123             n = reg_check_named_buff_matched(rex,scan);
4124
4125             if ( ! n ) {
4126                 sayNO;
4127             }
4128             goto do_nref_ref_common;
4129
4130         case REFFL:
4131             PL_reg_flags |= RF_tainted;
4132             folder = foldEQ_locale;
4133             fold_array = PL_fold_locale;
4134             goto do_ref;
4135
4136         case REFFU:
4137             folder = foldEQ_latin1;
4138             fold_array = PL_fold_latin1;
4139             goto do_ref;
4140
4141         case REFF:
4142             folder = foldEQ;
4143             fold_array = PL_fold;
4144             goto do_ref;
4145
4146         case REF:
4147             folder = NULL;
4148             fold_array = NULL;
4149
4150           do_ref:
4151             type = OP(scan);
4152             n = ARG(scan);  /* which paren pair */
4153
4154           do_nref_ref_common:
4155             ln = PL_regoffs[n].start;
4156             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4157             if (*PL_reglastparen < n || ln == -1)
4158                 sayNO;                  /* Do not match unless seen CLOSEn. */
4159             if (ln == PL_regoffs[n].end)
4160                 break;
4161
4162             s = PL_bostr + ln;
4163             if (type != REF     /* REF can do byte comparison */
4164                 && (utf8_target
4165                     || (type == REFFU
4166                         && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
4167                             || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
4168             { /* XXX handle REFFL better */
4169                 char * limit = PL_regeol;
4170
4171                 /* This call case insensitively compares the entire buffer
4172                     * at s, with the current input starting at locinput, but
4173                     * not going off the end given by PL_regeol, and returns in
4174                     * limit upon success, how much of the current input was
4175                     * matched */
4176                 if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4177                                     locinput, &limit, 0, utf8_target))
4178                 {
4179                     sayNO;
4180                 }
4181                 locinput = limit;
4182                 nextchr = UCHARAT(locinput);
4183                 break;
4184             }
4185
4186             /* Not utf8:  Inline the first character, for speed. */
4187             if (UCHARAT(s) != nextchr &&
4188                 (type == REF ||
4189                  UCHARAT(s) != fold_array[nextchr]))
4190                 sayNO;
4191             ln = PL_regoffs[n].end - ln;
4192             if (locinput + ln > PL_regeol)
4193                 sayNO;
4194             if (ln > 1 && (type == REF
4195                            ? memNE(s, locinput, ln)
4196                            : ! folder(s, locinput, ln)))
4197                 sayNO;
4198             locinput += ln;
4199             nextchr = UCHARAT(locinput);
4200             break;
4201         }
4202         case NOTHING:
4203         case TAIL:
4204             break;
4205         case BACK:
4206             break;
4207
4208 #undef  ST
4209 #define ST st->u.eval
4210         {
4211             SV *ret;
4212             REGEXP *re_sv;
4213             regexp *re;
4214             regexp_internal *rei;
4215             regnode *startpoint;
4216
4217         case GOSTART:
4218         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4219             if (cur_eval && cur_eval->locinput==locinput) {
4220                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4221                     Perl_croak(aTHX_ "Infinite recursion in regex");
4222                 if ( ++nochange_depth > max_nochange_depth )
4223                     Perl_croak(aTHX_
4224                         "Pattern subroutine nesting without pos change"
4225                         " exceeded limit in regex");
4226             } else {
4227                 nochange_depth = 0;
4228             }
4229             re_sv = rex_sv;
4230             re = rex;
4231             rei = rexi;
4232             (void)ReREFCNT_inc(rex_sv);
4233             if (OP(scan)==GOSUB) {
4234                 startpoint = scan + ARG2L(scan);
4235                 ST.close_paren = ARG(scan);
4236             } else {
4237                 startpoint = rei->program+1;
4238                 ST.close_paren = 0;
4239             }
4240             goto eval_recurse_doit;
4241             /* NOTREACHED */
4242         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4243             if (cur_eval && cur_eval->locinput==locinput) {
4244                 if ( ++nochange_depth > max_nochange_depth )
4245                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4246             } else {
4247                 nochange_depth = 0;
4248             }
4249             {
4250                 /* execute the code in the {...} */
4251                 dSP;
4252                 SV ** const before = SP;
4253                 OP_4tree * const oop = PL_op;
4254                 COP * const ocurcop = PL_curcop;
4255                 PAD *old_comppad;
4256                 char *saved_regeol = PL_regeol;
4257                 struct re_save_state saved_state;
4258
4259                 /* To not corrupt the existing regex state while executing the
4260                  * eval we would normally put it on the save stack, like with
4261                  * save_re_context. However, re-evals have a weird scoping so we
4262                  * can't just add ENTER/LEAVE here. With that, things like
4263                  *
4264                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4265                  *
4266                  * would break, as they expect the localisation to be unwound
4267                  * only when the re-engine backtracks through the bit that
4268                  * localised it.
4269                  *
4270                  * What we do instead is just saving the state in a local c
4271                  * variable.
4272                  */
4273                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4274
4275                 n = ARG(scan);
4276                 PL_op = (OP_4tree*)rexi->data->data[n];
4277                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4278                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4279                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4280                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4281
4282                 if (sv_yes_mark) {
4283                     SV *sv_mrk = get_sv("REGMARK", 1);
4284                     sv_setsv(sv_mrk, sv_yes_mark);
4285                 }
4286
4287                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4288                 SPAGAIN;
4289                 if (SP == before)
4290                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4291                 else {
4292                     ret = POPs;
4293                     PUTBACK;
4294                 }
4295
4296                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4297
4298                 PL_op = oop;
4299                 PAD_RESTORE_LOCAL(old_comppad);
4300                 PL_curcop = ocurcop;
4301                 PL_regeol = saved_regeol;
4302                 if (!logical) {
4303                     /* /(?{...})/ */
4304                     sv_setsv(save_scalar(PL_replgv), ret);
4305                     break;
4306                 }
4307             }
4308             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4309                 logical = 0;
4310                 {
4311                     /* extract RE object from returned value; compiling if
4312                      * necessary */
4313                     MAGIC *mg = NULL;
4314                     REGEXP *rx = NULL;
4315
4316                     if (SvROK(ret)) {
4317                         SV *const sv = SvRV(ret);
4318
4319                         if (SvTYPE(sv) == SVt_REGEXP) {
4320                             rx = (REGEXP*) sv;
4321                         } else if (SvSMAGICAL(sv)) {
4322                             mg = mg_find(sv, PERL_MAGIC_qr);
4323                             assert(mg);
4324                         }
4325                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4326                         rx = (REGEXP*) ret;
4327                     } else if (SvSMAGICAL(ret)) {
4328                         if (SvGMAGICAL(ret)) {
4329                             /* I don't believe that there is ever qr magic
4330                                here.  */
4331                             assert(!mg_find(ret, PERL_MAGIC_qr));
4332                             sv_unmagic(ret, PERL_MAGIC_qr);
4333                         }
4334                         else {
4335                             mg = mg_find(ret, PERL_MAGIC_qr);
4336                             /* testing suggests mg only ends up non-NULL for
4337                                scalars who were upgraded and compiled in the
4338                                else block below. In turn, this is only
4339                                triggered in the "postponed utf8 string" tests
4340                                in t/op/pat.t  */
4341                         }
4342                     }
4343
4344                     if (mg) {
4345                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4346                         assert(rx);
4347                     }
4348                     if (rx) {
4349                         rx = reg_temp_copy(NULL, rx);
4350                     }
4351                     else {
4352                         U32 pm_flags = 0;
4353                         const I32 osize = PL_regsize;
4354
4355                         if (DO_UTF8(ret)) {
4356                             assert (SvUTF8(ret));
4357                         } else if (SvUTF8(ret)) {
4358                             /* Not doing UTF-8, despite what the SV says. Is
4359                                this only if we're trapped in use 'bytes'?  */
4360                             /* Make a copy of the octet sequence, but without
4361                                the flag on, as the compiler now honours the
4362                                SvUTF8 flag on ret.  */
4363                             STRLEN len;
4364                             const char *const p = SvPV(ret, len);
4365                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4366                         }
4367                         rx = CALLREGCOMP(ret, pm_flags);
4368                         if (!(SvFLAGS(ret)
4369                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4370                                  | SVs_GMG))) {
4371                             /* This isn't a first class regexp. Instead, it's
4372                                caching a regexp onto an existing, Perl visible
4373                                scalar.  */
4374                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4375                         }
4376                         PL_regsize = osize;
4377                     }
4378                     re_sv = rx;
4379                     re = (struct regexp *)SvANY(rx);
4380                 }
4381                 RXp_MATCH_COPIED_off(re);
4382                 re->subbeg = rex->subbeg;
4383                 re->sublen = rex->sublen;
4384                 rei = RXi_GET(re);
4385                 DEBUG_EXECUTE_r(
4386                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4387                         "Matching embedded");
4388                 );
4389                 startpoint = rei->program + 1;
4390                 ST.close_paren = 0; /* only used for GOSUB */
4391                 /* borrowed from regtry */
4392                 if (PL_reg_start_tmpl <= re->nparens) {
4393                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4394                     if(PL_reg_start_tmp)
4395                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4396                     else
4397                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4398                 }
4399
4400         eval_recurse_doit: /* Share code with GOSUB below this line */
4401                 /* run the pattern returned from (??{...}) */
4402                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4403                 REGCP_SET(ST.lastcp);
4404
4405                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4406
4407                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4408                 PL_reglastparen = &re->lastparen;
4409                 PL_reglastcloseparen = &re->lastcloseparen;
4410                 re->lastparen = 0;
4411                 re->lastcloseparen = 0;
4412
4413                 PL_reginput = locinput;
4414                 PL_regsize = 0;
4415
4416                 /* XXXX This is too dramatic a measure... */
4417                 PL_reg_maxiter = 0;
4418
4419                 ST.toggle_reg_flags = PL_reg_flags;
4420                 if (RX_UTF8(re_sv))
4421                     PL_reg_flags |= RF_utf8;
4422                 else
4423                     PL_reg_flags &= ~RF_utf8;
4424                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4425
4426                 ST.prev_rex = rex_sv;
4427                 ST.prev_curlyx = cur_curlyx;
4428                 SETREX(rex_sv,re_sv);
4429                 rex = re;
4430                 rexi = rei;
4431                 cur_curlyx = NULL;
4432                 ST.B = next;
4433                 ST.prev_eval = cur_eval;
4434                 cur_eval = st;
4435                 /* now continue from first node in postoned RE */
4436                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4437                 /* NOTREACHED */
4438             }
4439             /* logical is 1,   /(?(?{...})X|Y)/ */
4440             sw = cBOOL(SvTRUE(ret));
4441             logical = 0;
4442             break;
4443         }
4444
4445         case EVAL_AB: /* cleanup after a successful (??{A})B */
4446             /* note: this is called twice; first after popping B, then A */
4447             PL_reg_flags ^= ST.toggle_reg_flags;
4448             ReREFCNT_dec(rex_sv);
4449             SETREX(rex_sv,ST.prev_rex);
4450             rex = (struct regexp *)SvANY(rex_sv);
4451             rexi = RXi_GET(rex);
4452             regcpblow(ST.cp);
4453             cur_eval = ST.prev_eval;
4454             cur_curlyx = ST.prev_curlyx;
4455
4456             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4457             PL_reglastparen = &rex->lastparen;
4458             PL_reglastcloseparen = &rex->lastcloseparen;
4459             /* also update PL_regoffs */
4460             PL_regoffs = rex->offs;
4461
4462             /* XXXX This is too dramatic a measure... */
4463             PL_reg_maxiter = 0;
4464             if ( nochange_depth )
4465                 nochange_depth--;
4466             sayYES;
4467
4468
4469         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4470             /* note: this is called twice; first after popping B, then A */
4471             PL_reg_flags ^= ST.toggle_reg_flags;
4472             ReREFCNT_dec(rex_sv);
4473             SETREX(rex_sv,ST.prev_rex);
4474             rex = (struct regexp *)SvANY(rex_sv);
4475             rexi = RXi_GET(rex);
4476             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4477             PL_reglastparen = &rex->lastparen;
4478             PL_reglastcloseparen = &rex->lastcloseparen;
4479
4480             PL_reginput = locinput;
4481             REGCP_UNWIND(ST.lastcp);
4482             regcppop(rex);
4483             cur_eval = ST.prev_eval;
4484             cur_curlyx = ST.prev_curlyx;
4485             /* XXXX This is too dramatic a measure... */
4486             PL_reg_maxiter = 0;
4487             if ( nochange_depth )
4488                 nochange_depth--;
4489             sayNO_SILENT;
4490 #undef ST
4491
4492         case OPEN:
4493             n = ARG(scan);  /* which paren pair */
4494             PL_reg_start_tmp[n] = locinput;
4495             if (n > PL_regsize)
4496                 PL_regsize = n;
4497             lastopen = n;
4498             break;
4499         case CLOSE:
4500             n = ARG(scan);  /* which paren pair */
4501             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4502             PL_regoffs[n].end = locinput - PL_bostr;
4503             /*if (n > PL_regsize)
4504                 PL_regsize = n;*/
4505             if (n > *PL_reglastparen)
4506                 *PL_reglastparen = n;
4507             *PL_reglastcloseparen = n;
4508             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4509                 goto fake_end;
4510             }
4511             break;
4512         case ACCEPT:
4513             if (ARG(scan)){
4514                 regnode *cursor;
4515                 for (cursor=scan;
4516                      cursor && OP(cursor)!=END;
4517                      cursor=regnext(cursor))
4518                 {
4519                     if ( OP(cursor)==CLOSE ){
4520                         n = ARG(cursor);
4521                         if ( n <= lastopen ) {
4522                             PL_regoffs[n].start
4523                                 = PL_reg_start_tmp[n] - PL_bostr;
4524                             PL_regoffs[n].end = locinput - PL_bostr;
4525                             /*if (n > PL_regsize)
4526                             PL_regsize = n;*/
4527                             if (n > *PL_reglastparen)
4528                                 *PL_reglastparen = n;
4529                             *PL_reglastcloseparen = n;
4530                             if ( n == ARG(scan) || (cur_eval &&
4531                                 cur_eval->u.eval.close_paren == n))
4532                                 break;
4533                         }
4534                     }
4535                 }
4536             }
4537             goto fake_end;
4538             /*NOTREACHED*/
4539         case GROUPP:
4540             n = ARG(scan);  /* which paren pair */
4541             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4542             break;
4543         case NGROUPP:
4544             /* reg_check_named_buff_matched returns 0 for no match */
4545             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4546             break;
4547         case INSUBP:
4548             n = ARG(scan);
4549             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4550             break;
4551         case DEFINEP:
4552             sw = 0;
4553             break;
4554         case IFTHEN:
4555             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4556             if (sw)
4557                 next = NEXTOPER(NEXTOPER(scan));
4558             else {
4559                 next = scan + ARG(scan);
4560                 if (OP(next) == IFTHEN) /* Fake one. */
4561                     next = NEXTOPER(NEXTOPER(next));
4562             }
4563             break;
4564         case LOGICAL:
4565             logical = scan->flags;
4566             break;
4567
4568 /*******************************************************************
4569
4570 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4571 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4572 STAR/PLUS/CURLY/CURLYN are used instead.)
4573
4574 A*B is compiled as <CURLYX><A><WHILEM><B>
4575
4576 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4577 state, which contains the current count, initialised to -1. It also sets
4578 cur_curlyx to point to this state, with any previous value saved in the
4579 state block.
4580
4581 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4582 since the pattern may possibly match zero times (i.e. it's a while {} loop
4583 rather than a do {} while loop).
4584
4585 Each entry to WHILEM represents a successful match of A. The count in the
4586 CURLYX block is incremented, another WHILEM state is pushed, and execution
4587 passes to A or B depending on greediness and the current count.
4588
4589 For example, if matching against the string a1a2a3b (where the aN are
4590 substrings that match /A/), then the match progresses as follows: (the
4591 pushed states are interspersed with the bits of strings matched so far):
4592
4593     <CURLYX cnt=-1>
4594     <CURLYX cnt=0><WHILEM>
4595     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4596     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4597     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4598     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4599
4600 (Contrast this with something like CURLYM, which maintains only a single
4601 backtrack state:
4602
4603     <CURLYM cnt=0> a1
4604     a1 <CURLYM cnt=1> a2
4605     a1 a2 <CURLYM cnt=2> a3
4606     a1 a2 a3 <CURLYM cnt=3> b
4607 )
4608
4609 Each WHILEM state block marks a point to backtrack to upon partial failure
4610 of A or B, and also contains some minor state data related to that
4611 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4612 overall state, such as the count, and pointers to the A and B ops.
4613
4614 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4615 must always point to the *current* CURLYX block, the rules are:
4616
4617 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4618 and set cur_curlyx to point the new block.
4619
4620 When popping the CURLYX block after a successful or unsuccessful match,
4621 restore the previous cur_curlyx.
4622
4623 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4624 to the outer one saved in the CURLYX block.
4625
4626 When popping the WHILEM block after a successful or unsuccessful B match,
4627 restore the previous cur_curlyx.
4628
4629 Here's an example for the pattern (AI* BI)*BO
4630 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4631
4632 cur_
4633 curlyx backtrack stack
4634 ------ ---------------
4635 NULL
4636 CO     <CO prev=NULL> <WO>
4637 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4638 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4639 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4640
4641 At this point the pattern succeeds, and we work back down the stack to
4642 clean up, restoring as we go:
4643
4644 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4645 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4646 CO     <CO prev=NULL> <WO>
4647 NULL
4648
4649 *******************************************************************/
4650
4651 #define ST st->u.curlyx
4652
4653         case CURLYX:    /* start of /A*B/  (for complex A) */
4654         {
4655             /* No need to save/restore up to this paren */
4656             I32 parenfloor = scan->flags;
4657
4658             assert(next); /* keep Coverity happy */
4659             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4660                 next += ARG(next);
4661
4662             /* XXXX Probably it is better to teach regpush to support
4663                parenfloor > PL_regsize... */
4664             if (parenfloor > (I32)*PL_reglastparen)
4665                 parenfloor = *PL_reglastparen; /* Pessimization... */
4666
4667             ST.prev_curlyx= cur_curlyx;
4668             cur_curlyx = st;
4669             ST.cp = PL_savestack_ix;
4670
4671             /* these fields contain the state of the current curly.
4672              * they are accessed by subsequent WHILEMs */
4673             ST.parenfloor = parenfloor;
4674             ST.me = scan;
4675             ST.B = next;
4676             ST.minmod = minmod;
4677             minmod = 0;
4678             ST.count = -1;      /* this will be updated by WHILEM */
4679             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4680
4681             PL_reginput = locinput;
4682             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4683             /* NOTREACHED */
4684         }
4685
4686         case CURLYX_end: /* just finished matching all of A*B */
4687             cur_curlyx = ST.prev_curlyx;
4688             sayYES;
4689             /* NOTREACHED */
4690
4691         case CURLYX_end_fail: /* just failed to match all of A*B */
4692             regcpblow(ST.cp);
4693             cur_curlyx = ST.prev_curlyx;
4694             sayNO;
4695             /* NOTREACHED */
4696
4697
4698 #undef ST
4699 #define ST st->u.whilem
4700
4701         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4702         {
4703             /* see the discussion above about CURLYX/WHILEM */
4704             I32 n;
4705             int min = ARG1(cur_curlyx->u.curlyx.me);
4706             int max = ARG2(cur_curlyx->u.curlyx.me);
4707             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4708
4709             assert(cur_curlyx); /* keep Coverity happy */
4710             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4711             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4712             ST.cache_offset = 0;
4713             ST.cache_mask = 0;
4714
4715             PL_reginput = locinput;
4716
4717             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4718                   "%*s  whilem: matched %ld out of %d..%d\n",
4719                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4720             );
4721
4722             /* First just match a string of min A's. */
4723
4724             if (n < min) {
4725                 cur_curlyx->u.curlyx.lastloc = locinput;
4726                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4727                 /* NOTREACHED */
4728             }
4729
4730             /* If degenerate A matches "", assume A done. */
4731
4732             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4733                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4734                    "%*s  whilem: empty match detected, trying continuation...\n",
4735                    REPORT_CODE_OFF+depth*2, "")
4736                 );
4737                 goto do_whilem_B_max;
4738             }
4739
4740             /* super-linear cache processing */
4741
4742             if (scan->flags) {
4743
4744                 if (!PL_reg_maxiter) {
4745                     /* start the countdown: Postpone detection until we
4746                      * know the match is not *that* much linear. */
4747                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4748                     /* possible overflow for long strings and many CURLYX's */
4749                     if (PL_reg_maxiter < 0)
4750                         PL_reg_maxiter = I32_MAX;
4751                     PL_reg_leftiter = PL_reg_maxiter;
4752                 }
4753
4754                 if (PL_reg_leftiter-- == 0) {
4755                     /* initialise cache */
4756                     const I32 size = (PL_reg_maxiter + 7)/8;
4757                     if (PL_reg_poscache) {
4758                         if ((I32)PL_reg_poscache_size < size) {
4759                             Renew(PL_reg_poscache, size, char);
4760                             PL_reg_poscache_size = size;
4761                         }
4762                         Zero(PL_reg_poscache, size, char);
4763                     }
4764                     else {
4765                         PL_reg_poscache_size = size;
4766                         Newxz(PL_reg_poscache, size, char);
4767                     }
4768                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4769       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4770                               PL_colors[4], PL_colors[5])
4771                     );
4772                 }
4773
4774                 if (PL_reg_leftiter < 0) {
4775                     /* have we already failed at this position? */
4776                     I32 offset, mask;
4777                     offset  = (scan->flags & 0xf) - 1
4778                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4779                     mask    = 1 << (offset % 8);
4780                     offset /= 8;
4781                     if (PL_reg_poscache[offset] & mask) {
4782                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4783                             "%*s  whilem: (cache) already tried at this position...\n",
4784                             REPORT_CODE_OFF+depth*2, "")
4785                         );
4786                         sayNO; /* cache records failure */
4787                     }
4788                     ST.cache_offset = offset;
4789                     ST.cache_mask   = mask;
4790                 }
4791             }
4792
4793             /* Prefer B over A for minimal matching. */
4794
4795             if (cur_curlyx->u.curlyx.minmod) {
4796                 ST.save_curlyx = cur_curlyx;
4797                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4798                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4799                 REGCP_SET(ST.lastcp);
4800                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4801                 /* NOTREACHED */
4802             }
4803
4804             /* Prefer A over B for maximal matching. */
4805
4806             if (n < max) { /* More greed allowed? */
4807                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4808                 cur_curlyx->u.curlyx.lastloc = locinput;
4809                 REGCP_SET(ST.lastcp);
4810                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4811                 /* NOTREACHED */
4812             }
4813             goto do_whilem_B_max;
4814         }
4815         /* NOTREACHED */
4816
4817         case WHILEM_B_min: /* just matched B in a minimal match */
4818         case WHILEM_B_max: /* just matched B in a maximal match */
4819             cur_curlyx = ST.save_curlyx;
4820             sayYES;
4821             /* NOTREACHED */
4822
4823         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4824             cur_curlyx = ST.save_curlyx;
4825             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4826             cur_curlyx->u.curlyx.count--;
4827             CACHEsayNO;
4828             /* NOTREACHED */
4829
4830         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4831             REGCP_UNWIND(ST.lastcp);
4832             regcppop(rex);
4833             /* FALL THROUGH */
4834         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4835             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4836             cur_curlyx->u.curlyx.count--;
4837             CACHEsayNO;
4838             /* NOTREACHED */
4839
4840         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4841             REGCP_UNWIND(ST.lastcp);
4842             regcppop(rex);      /* Restore some previous $<digit>s? */
4843             PL_reginput = locinput;
4844             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4845                 "%*s  whilem: failed, trying continuation...\n",
4846                 REPORT_CODE_OFF+depth*2, "")
4847             );
4848           do_whilem_B_max:
4849             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4850                 && ckWARN(WARN_REGEXP)
4851                 && !(PL_reg_flags & RF_warned))
4852             {
4853                 PL_reg_flags |= RF_warned;
4854                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4855                      "Complex regular subexpression recursion",
4856                      REG_INFTY - 1);
4857             }
4858
4859             /* now try B */
4860             ST.save_curlyx = cur_curlyx;
4861             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4862             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4863             /* NOTREACHED */
4864
4865         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4866             cur_curlyx = ST.save_curlyx;
4867             REGCP_UNWIND(ST.lastcp);
4868             regcppop(rex);
4869
4870             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4871                 /* Maximum greed exceeded */
4872                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4873                     && ckWARN(WARN_REGEXP)
4874                     && !(PL_reg_flags & RF_warned))
4875                 {
4876                     PL_reg_flags |= RF_warned;
4877                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4878                         "%s limit (%d) exceeded",
4879                         "Complex regular subexpression recursion",
4880                         REG_INFTY - 1);
4881                 }
4882                 cur_curlyx->u.curlyx.count--;
4883                 CACHEsayNO;
4884             }
4885
4886             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4887                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4888             );
4889             /* Try grabbing another A and see if it helps. */
4890             PL_reginput = locinput;
4891             cur_curlyx->u.curlyx.lastloc = locinput;
4892             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4893             REGCP_SET(ST.lastcp);
4894             PUSH_STATE_GOTO(WHILEM_A_min,
4895                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4896             /* NOTREACHED */
4897
4898 #undef  ST
4899 #define ST st->u.branch
4900
4901         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4902             next = scan + ARG(scan);
4903             if (next == scan)
4904                 next = NULL;
4905             scan = NEXTOPER(scan);
4906             /* FALL THROUGH */
4907
4908         case BRANCH:        /*  /(...|A|...)/ */
4909             scan = NEXTOPER(scan); /* scan now points to inner node */
4910             ST.lastparen = *PL_reglastparen;
4911             ST.next_branch = next;
4912             REGCP_SET(ST.cp);
4913             PL_reginput = locinput;
4914
4915             /* Now go into the branch */
4916             if (has_cutgroup) {
4917                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4918             } else {
4919                 PUSH_STATE_GOTO(BRANCH_next, scan);
4920             }
4921             /* NOTREACHED */
4922         case CUTGROUP:
4923             PL_reginput = locinput;
4924             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4925                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4926             PUSH_STATE_GOTO(CUTGROUP_next,next);
4927             /* NOTREACHED */
4928         case CUTGROUP_next_fail:
4929             do_cutgroup = 1;
4930             no_final = 1;
4931             if (st->u.mark.mark_name)
4932                 sv_commit = st->u.mark.mark_name;
4933             sayNO;
4934             /* NOTREACHED */
4935         case BRANCH_next:
4936             sayYES;
4937             /* NOTREACHED */
4938         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4939             if (do_cutgroup) {
4940                 do_cutgroup = 0;
4941                 no_final = 0;
4942             }
4943             REGCP_UNWIND(ST.cp);
4944             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4945                 PL_regoffs[n].end = -1;
4946             *PL_reglastparen = n;
4947             /*dmq: *PL_reglastcloseparen = n; */
4948             scan = ST.next_branch;
4949             /* no more branches? */
4950             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4951                 DEBUG_EXECUTE_r({
4952                     PerlIO_printf( Perl_debug_log,
4953                         "%*s  %sBRANCH failed...%s\n",
4954                         REPORT_CODE_OFF+depth*2, "",
4955                         PL_colors[4],
4956                         PL_colors[5] );
4957                 });
4958                 sayNO_SILENT;
4959             }
4960             continue; /* execute next BRANCH[J] op */
4961             /* NOTREACHED */
4962
4963         case MINMOD:
4964             minmod = 1;
4965             break;
4966
4967 #undef  ST
4968 #define ST st->u.curlym
4969
4970         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4971
4972             /* This is an optimisation of CURLYX that enables us to push
4973              * only a single backtracking state, no matter how many matches
4974              * there are in {m,n}. It relies on the pattern being constant
4975              * length, with no parens to influence future backrefs
4976              */
4977
4978             ST.me = scan;
4979             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4980
4981             /* if paren positive, emulate an OPEN/CLOSE around A */
4982             if (ST.me->flags) {
4983                 U32 paren = ST.me->flags;
4984                 if (paren > PL_regsize)
4985                     PL_regsize = paren;
4986                 if (paren > *PL_reglastparen)
4987                     *PL_reglastparen = paren;
4988                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4989             }
4990             ST.A = scan;
4991             ST.B = next;
4992             ST.alen = 0;
4993             ST.count = 0;
4994             ST.minmod = minmod;
4995             minmod = 0;
4996             ST.c1 = CHRTEST_UNINIT;
4997             REGCP_SET(ST.cp);
4998
4999             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5000                 goto curlym_do_B;
5001
5002           curlym_do_A: /* execute the A in /A{m,n}B/  */
5003             PL_reginput = locinput;
5004             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5005             /* NOTREACHED */
5006
5007         case CURLYM_A: /* we've just matched an A */
5008             locinput = st->locinput;
5009             nextchr = UCHARAT(locinput);
5010
5011             ST.count++;
5012             /* after first match, determine A's length: u.curlym.alen */
5013             if (ST.count == 1) {
5014                 if (PL_reg_match_utf8) {
5015                     char *s = locinput;
5016                     while (s < PL_reginput) {
5017                         ST.alen++;
5018                         s += UTF8SKIP(s);
5019                     }
5020                 }
5021                 else {
5022                     ST.alen = PL_reginput - locinput;
5023                 }
5024                 if (ST.alen == 0)
5025                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5026             }
5027             DEBUG_EXECUTE_r(
5028                 PerlIO_printf(Perl_debug_log,
5029                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5030                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5031                           (IV) ST.count, (IV)ST.alen)
5032             );
5033
5034             locinput = PL_reginput;
5035
5036             if (cur_eval && cur_eval->u.eval.close_paren &&
5037                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5038                 goto fake_end;
5039
5040             {
5041                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5042                 if ( max == REG_INFTY || ST.count < max )
5043                     goto curlym_do_A; /* try to match another A */
5044             }
5045             goto curlym_do_B; /* try to match B */
5046
5047         case CURLYM_A_fail: /* just failed to match an A */
5048             REGCP_UNWIND(ST.cp);
5049
5050             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5051                 || (cur_eval && cur_eval->u.eval.close_paren &&
5052                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5053                 sayNO;
5054
5055           curlym_do_B: /* execute the B in /A{m,n}B/  */
5056             PL_reginput = locinput;
5057             if (ST.c1 == CHRTEST_UNINIT) {
5058                 /* calculate c1 and c2 for possible match of 1st char
5059                  * following curly */
5060                 ST.c1 = ST.c2 = CHRTEST_VOID;
5061                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5062                     regnode *text_node = ST.B;
5063                     if (! HAS_TEXT(text_node))
5064                         FIND_NEXT_IMPT(text_node);
5065                     /* this used to be
5066
5067                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5068
5069                         But the former is redundant in light of the latter.
5070
5071                         if this changes back then the macro for
5072                         IS_TEXT and friends need to change.
5073                      */
5074                     if (PL_regkind[OP(text_node)] == EXACT)
5075                     {
5076
5077                         ST.c1 = (U8)*STRING(text_node);
5078                         switch (OP(text_node)) {
5079                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5080                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5081                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5082                             default: ST.c2 = ST.c1;
5083                         }
5084                     }
5085                 }
5086             }
5087
5088             DEBUG_EXECUTE_r(
5089                 PerlIO_printf(Perl_debug_log,
5090                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5091                     (int)(REPORT_CODE_OFF+(depth*2)),
5092                     "", (IV)ST.count)
5093                 );
5094             if (ST.c1 != CHRTEST_VOID
5095                     && UCHARAT(PL_reginput) != ST.c1
5096                     && UCHARAT(PL_reginput) != ST.c2)
5097             {
5098                 /* simulate B failing */
5099                 DEBUG_OPTIMISE_r(
5100                     PerlIO_printf(Perl_debug_log,
5101                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5102                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5103                         (IV)ST.c1,(IV)ST.c2
5104                 ));
5105                 state_num = CURLYM_B_fail;
5106                 goto reenter_switch;
5107             }
5108
5109             if (ST.me->flags) {
5110                 /* mark current A as captured */
5111                 I32 paren = ST.me->flags;
5112                 if (ST.count) {
5113                     PL_regoffs[paren].start
5114                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5115                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5116                     /*dmq: *PL_reglastcloseparen = paren; */
5117                 }
5118                 else
5119                     PL_regoffs[paren].end = -1;
5120                 if (cur_eval && cur_eval->u.eval.close_paren &&
5121                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5122                 {
5123                     if (ST.count)
5124                         goto fake_end;
5125                     else
5126                         sayNO;
5127                 }
5128             }
5129
5130             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5131             /* NOTREACHED */
5132
5133         case CURLYM_B_fail: /* just failed to match a B */
5134             REGCP_UNWIND(ST.cp);
5135             if (ST.minmod) {
5136                 I32 max = ARG2(ST.me);
5137                 if (max != REG_INFTY && ST.count == max)
5138                     sayNO;
5139                 goto curlym_do_A; /* try to match a further A */
5140             }
5141             /* backtrack one A */
5142             if (ST.count == ARG1(ST.me) /* min */)
5143                 sayNO;
5144             ST.count--;
5145             locinput = HOPc(locinput, -ST.alen);
5146             goto curlym_do_B; /* try to match B */
5147
5148 #undef ST
5149 #define ST st->u.curly
5150
5151 #define CURLY_SETPAREN(paren, success) \
5152     if (paren) { \
5153         if (success) { \
5154             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5155             PL_regoffs[paren].end = locinput - PL_bostr; \
5156             *PL_reglastcloseparen = paren; \
5157         } \
5158         else \
5159             PL_regoffs[paren].end = -1; \
5160     }
5161
5162         case STAR:              /*  /A*B/ where A is width 1 */
5163             ST.paren = 0;
5164             ST.min = 0;
5165             ST.max = REG_INFTY;
5166             scan = NEXTOPER(scan);
5167             goto repeat;
5168         case PLUS:              /*  /A+B/ where A is width 1 */
5169             ST.paren = 0;
5170             ST.min = 1;
5171             ST.max = REG_INFTY;
5172             scan = NEXTOPER(scan);
5173             goto repeat;
5174         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5175             ST.paren = scan->flags;     /* Which paren to set */
5176             if (ST.paren > PL_regsize)
5177                 PL_regsize = ST.paren;
5178             if (ST.paren > *PL_reglastparen)
5179                 *PL_reglastparen = ST.paren;
5180             ST.min = ARG1(scan);  /* min to match */
5181             ST.max = ARG2(scan);  /* max to match */
5182             if (cur_eval && cur_eval->u.eval.close_paren &&
5183                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5184                 ST.min=1;
5185                 ST.max=1;
5186             }
5187             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5188             goto repeat;
5189         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5190             ST.paren = 0;
5191             ST.min = ARG1(scan);  /* min to match */
5192             ST.max = ARG2(scan);  /* max to match */
5193             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5194           repeat:
5195             /*
5196             * Lookahead to avoid useless match attempts
5197             * when we know what character comes next.
5198             *
5199             * Used to only do .*x and .*?x, but now it allows
5200             * for )'s, ('s and (?{ ... })'s to be in the way
5201             * of the quantifier and the EXACT-like node.  -- japhy
5202             */
5203
5204             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5205                 sayNO;
5206             if (HAS_TEXT(next) || JUMPABLE(next)) {
5207                 U8 *s;
5208                 regnode *text_node = next;
5209
5210                 if (! HAS_TEXT(text_node))
5211                     FIND_NEXT_IMPT(text_node);
5212
5213                 if (! HAS_TEXT(text_node))
5214                     ST.c1 = ST.c2 = CHRTEST_VOID;
5215                 else {
5216                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5217                         ST.c1 = ST.c2 = CHRTEST_VOID;
5218                         goto assume_ok_easy;
5219                     }
5220                     else
5221                         s = (U8*)STRING(text_node);
5222
5223                     /*  Currently we only get here when
5224
5225                         PL_rekind[OP(text_node)] == EXACT
5226
5227                         if this changes back then the macro for IS_TEXT and
5228                         friends need to change. */
5229                     if (!UTF_PATTERN) {
5230                         ST.c1 = *s;
5231                         switch (OP(text_node)) {
5232                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5233                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5234                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5235                             default: ST.c2 = ST.c1; break;
5236                         }
5237                     }
5238                     else { /* UTF_PATTERN */
5239                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5240                              STRLEN ulen1, ulen2;
5241                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5242                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5243
5244                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5245                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5246 #ifdef EBCDIC
5247                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5248                                                     ckWARN(WARN_UTF8) ?
5249                                                     0 : UTF8_ALLOW_ANY);
5250                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5251                                                     ckWARN(WARN_UTF8) ?
5252                                                     0 : UTF8_ALLOW_ANY);
5253 #else
5254                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5255                                                     uniflags);
5256                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5257                                                     uniflags);
5258 #endif
5259                         }
5260                         else {
5261                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5262                                                      uniflags);
5263                         }
5264                     }
5265                 }
5266             }
5267             else
5268                 ST.c1 = ST.c2 = CHRTEST_VOID;
5269         assume_ok_easy:
5270
5271             ST.A = scan;
5272             ST.B = next;
5273             PL_reginput = locinput;
5274             if (minmod) {
5275                 minmod = 0;
5276                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5277                     sayNO;
5278                 ST.count = ST.min;
5279                 locinput = PL_reginput;
5280                 REGCP_SET(ST.cp);
5281                 if (ST.c1 == CHRTEST_VOID)
5282                     goto curly_try_B_min;
5283
5284                 ST.oldloc = locinput;
5285
5286                 /* set ST.maxpos to the furthest point along the
5287                  * string that could possibly match */
5288                 if  (ST.max == REG_INFTY) {
5289                     ST.maxpos = PL_regeol - 1;
5290                     if (utf8_target)
5291                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5292                             ST.maxpos--;
5293                 }
5294                 else if (utf8_target) {
5295                     int m = ST.max - ST.min;
5296                     for (ST.maxpos = locinput;
5297                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5298                         ST.maxpos += UTF8SKIP(ST.maxpos);
5299                 }
5300                 else {
5301                     ST.maxpos = locinput + ST.max - ST.min;
5302                     if (ST.maxpos >= PL_regeol)
5303                         ST.maxpos = PL_regeol - 1;
5304                 }
5305                 goto curly_try_B_min_known;
5306
5307             }
5308             else {
5309                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5310                 locinput = PL_reginput;
5311                 if (ST.count < ST.min)
5312                     sayNO;
5313                 if ((ST.count > ST.min)
5314                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5315                 {
5316                     /* A{m,n} must come at the end of the string, there's
5317                      * no point in backing off ... */
5318                     ST.min = ST.count;
5319                     /* ...except that $ and \Z can match before *and* after
5320                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5321                        We may back off by one in this case. */
5322                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5323                         ST.min--;
5324                 }
5325                 REGCP_SET(ST.cp);
5326                 goto curly_try_B_max;
5327             }
5328             /* NOTREACHED */
5329
5330
5331         case CURLY_B_min_known_fail:
5332             /* failed to find B in a non-greedy match where c1,c2 valid */
5333             if (ST.paren && ST.count)
5334                 PL_regoffs[ST.paren].end = -1;
5335
5336             PL_reginput = locinput;     /* Could be reset... */
5337             REGCP_UNWIND(ST.cp);
5338             /* Couldn't or didn't -- move forward. */
5339             ST.oldloc = locinput;
5340             if (utf8_target)
5341                 locinput += UTF8SKIP(locinput);
5342             else
5343                 locinput++;
5344             ST.count++;
5345           curly_try_B_min_known:
5346              /* find the next place where 'B' could work, then call B */
5347             {
5348                 int n;
5349                 if (utf8_target) {
5350                     n = (ST.oldloc == locinput) ? 0 : 1;
5351                     if (ST.c1 == ST.c2) {
5352                         STRLEN len;
5353                         /* set n to utf8_distance(oldloc, locinput) */
5354                         while (locinput <= ST.maxpos &&
5355                                utf8n_to_uvchr((U8*)locinput,
5356                                               UTF8_MAXBYTES, &len,
5357                                               uniflags) != (UV)ST.c1) {
5358                             locinput += len;
5359                             n++;
5360                         }
5361                     }
5362                     else {
5363                         /* set n to utf8_distance(oldloc, locinput) */
5364                         while (locinput <= ST.maxpos) {
5365                             STRLEN len;
5366                             const UV c = utf8n_to_uvchr((U8*)locinput,
5367                                                   UTF8_MAXBYTES, &len,
5368                                                   uniflags);
5369                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5370                                 break;
5371                             locinput += len;
5372                             n++;
5373                         }
5374                     }
5375                 }
5376                 else {
5377                     if (ST.c1 == ST.c2) {
5378                         while (locinput <= ST.maxpos &&
5379                                UCHARAT(locinput) != ST.c1)
5380                             locinput++;
5381                     }
5382                     else {
5383                         while (locinput <= ST.maxpos
5384                                && UCHARAT(locinput) != ST.c1
5385                                && UCHARAT(locinput) != ST.c2)
5386                             locinput++;
5387                     }
5388                     n = locinput - ST.oldloc;
5389                 }
5390                 if (locinput > ST.maxpos)
5391                     sayNO;
5392                 /* PL_reginput == oldloc now */
5393                 if (n) {
5394                     ST.count += n;
5395                     if (regrepeat(rex, ST.A, n, depth) < n)
5396                         sayNO;
5397                 }
5398                 PL_reginput = locinput;
5399                 CURLY_SETPAREN(ST.paren, ST.count);
5400                 if (cur_eval && cur_eval->u.eval.close_paren &&
5401                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5402                     goto fake_end;
5403                 }
5404                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5405             }
5406             /* NOTREACHED */
5407
5408
5409         case CURLY_B_min_fail:
5410             /* failed to find B in a non-greedy match where c1,c2 invalid */
5411             if (ST.paren && ST.count)
5412                 PL_regoffs[ST.paren].end = -1;
5413
5414             REGCP_UNWIND(ST.cp);
5415             /* failed -- move forward one */
5416             PL_reginput = locinput;
5417             if (regrepeat(rex, ST.A, 1, depth)) {
5418                 ST.count++;
5419                 locinput = PL_reginput;
5420                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5421                         ST.count > 0)) /* count overflow ? */
5422                 {
5423                   curly_try_B_min:
5424                     CURLY_SETPAREN(ST.paren, ST.count);
5425                     if (cur_eval && cur_eval->u.eval.close_paren &&
5426                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5427                         goto fake_end;
5428                     }
5429                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5430                 }
5431             }
5432             sayNO;
5433             /* NOTREACHED */
5434
5435
5436         curly_try_B_max:
5437             /* a successful greedy match: now try to match B */
5438             if (cur_eval && cur_eval->u.eval.close_paren &&
5439                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5440                 goto fake_end;
5441             }
5442             {
5443                 UV c = 0;
5444                 if (ST.c1 != CHRTEST_VOID)
5445                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5446                                            UTF8_MAXBYTES, 0, uniflags)
5447                                 : (UV) UCHARAT(PL_reginput);
5448                 /* If it could work, try it. */
5449                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5450                     CURLY_SETPAREN(ST.paren, ST.count);
5451                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5452                     /* NOTREACHED */
5453                 }
5454             }
5455             /* FALL THROUGH */
5456         case CURLY_B_max_fail:
5457             /* failed to find B in a greedy match */
5458             if (ST.paren && ST.count)
5459                 PL_regoffs[ST.paren].end = -1;
5460
5461             REGCP_UNWIND(ST.cp);
5462             /*  back up. */
5463             if (--ST.count < ST.min)
5464                 sayNO;
5465             PL_reginput = locinput = HOPc(locinput, -1);
5466             goto curly_try_B_max;
5467
5468 #undef ST
5469
5470         case END:
5471             fake_end:
5472             if (cur_eval) {
5473                 /* we've just finished A in /(??{A})B/; now continue with B */
5474                 I32 tmpix;
5475                 st->u.eval.toggle_reg_flags
5476                             = cur_eval->u.eval.toggle_reg_flags;
5477                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5478
5479                 st->u.eval.prev_rex = rex_sv;           /* inner */
5480                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5481                 rex = (struct regexp *)SvANY(rex_sv);
5482                 rexi = RXi_GET(rex);
5483                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5484                 ReREFCNT_inc(rex_sv);
5485                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5486
5487                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5488                 PL_reglastparen = &rex->lastparen;
5489                 PL_reglastcloseparen = &rex->lastcloseparen;
5490
5491                 REGCP_SET(st->u.eval.lastcp);
5492                 PL_reginput = locinput;
5493
5494                 /* Restore parens of the outer rex without popping the
5495                  * savestack */
5496                 tmpix = PL_savestack_ix;
5497                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5498                 regcppop(rex);
5499                 PL_savestack_ix = tmpix;
5500
5501                 st->u.eval.prev_eval = cur_eval;
5502                 cur_eval = cur_eval->u.eval.prev_eval;
5503                 DEBUG_EXECUTE_r(
5504                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5505                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5506                 if ( nochange_depth )
5507                     nochange_depth--;
5508
5509                 PUSH_YES_STATE_GOTO(EVAL_AB,
5510                         st->u.eval.prev_eval->u.eval.B); /* match B */
5511             }
5512
5513             if (locinput < reginfo->till) {
5514                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5515                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5516                                       PL_colors[4],
5517                                       (long)(locinput - PL_reg_starttry),
5518                                       (long)(reginfo->till - PL_reg_starttry),
5519                                       PL_colors[5]));
5520
5521                 sayNO_SILENT;           /* Cannot match: too short. */
5522             }
5523             PL_reginput = locinput;     /* put where regtry can find it */
5524             sayYES;                     /* Success! */
5525
5526         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5527             DEBUG_EXECUTE_r(
5528             PerlIO_printf(Perl_debug_log,
5529                 "%*s  %ssubpattern success...%s\n",
5530                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5531             PL_reginput = locinput;     /* put where regtry can find it */
5532             sayYES;                     /* Success! */
5533
5534 #undef  ST
5535 #define ST st->u.ifmatch
5536
5537         case SUSPEND:   /* (?>A) */
5538             ST.wanted = 1;
5539             PL_reginput = locinput;
5540             goto do_ifmatch;
5541
5542         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5543             ST.wanted = 0;
5544             goto ifmatch_trivial_fail_test;
5545
5546         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5547             ST.wanted = 1;
5548           ifmatch_trivial_fail_test:
5549             if (scan->flags) {
5550                 char * const s = HOPBACKc(locinput, scan->flags);
5551                 if (!s) {
5552                     /* trivial fail */
5553                     if (logical) {
5554                         logical = 0;
5555                         sw = 1 - cBOOL(ST.wanted);
5556                     }
5557                     else if (ST.wanted)
5558                         sayNO;
5559                     next = scan + ARG(scan);
5560                     if (next == scan)
5561                         next = NULL;
5562                     break;
5563                 }
5564                 PL_reginput = s;
5565             }
5566             else
5567                 PL_reginput = locinput;
5568
5569           do_ifmatch:
5570             ST.me = scan;
5571             ST.logical = logical;
5572             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5573
5574             /* execute body of (?...A) */
5575             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5576             /* NOTREACHED */
5577
5578         case IFMATCH_A_fail: /* body of (?...A) failed */
5579             ST.wanted = !ST.wanted;
5580             /* FALL THROUGH */
5581
5582         case IFMATCH_A: /* body of (?...A) succeeded */
5583             if (ST.logical) {
5584                 sw = cBOOL(ST.wanted);
5585             }
5586             else if (!ST.wanted)
5587                 sayNO;
5588
5589             if (OP(ST.me) == SUSPEND)
5590                 locinput = PL_reginput;
5591             else {
5592                 locinput = PL_reginput = st->locinput;
5593                 nextchr = UCHARAT(locinput);
5594             }
5595             scan = ST.me + ARG(ST.me);
5596             if (scan == ST.me)
5597                 scan = NULL;
5598             continue; /* execute B */
5599
5600 #undef ST
5601
5602         case LONGJMP:
5603             next = scan + ARG(scan);
5604             if (next == scan)
5605                 next = NULL;
5606             break;
5607         case COMMIT:
5608             reginfo->cutpoint = PL_regeol;
5609             /* FALLTHROUGH */
5610         case PRUNE:
5611             PL_reginput = locinput;
5612             if (!scan->flags)
5613                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5614             PUSH_STATE_GOTO(COMMIT_next,next);
5615             /* NOTREACHED */
5616         case COMMIT_next_fail:
5617             no_final = 1;
5618             /* FALLTHROUGH */
5619         case OPFAIL:
5620             sayNO;
5621             /* NOTREACHED */
5622
5623 #define ST st->u.mark
5624         case MARKPOINT:
5625             ST.prev_mark = mark_state;
5626             ST.mark_name = sv_commit = sv_yes_mark
5627                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5628             mark_state = st;
5629             ST.mark_loc = PL_reginput = locinput;
5630             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5631             /* NOTREACHED */
5632         case MARKPOINT_next:
5633             mark_state = ST.prev_mark;
5634             sayYES;
5635             /* NOTREACHED */
5636         case MARKPOINT_next_fail:
5637             if (popmark && sv_eq(ST.mark_name,popmark))
5638             {
5639                 if (ST.mark_loc > startpoint)
5640                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5641                 popmark = NULL; /* we found our mark */
5642                 sv_commit = ST.mark_name;
5643
5644                 DEBUG_EXECUTE_r({
5645                         PerlIO_printf(Perl_debug_log,
5646                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5647                             REPORT_CODE_OFF+depth*2, "",
5648                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5649                 });
5650             }
5651             mark_state = ST.prev_mark;
5652             sv_yes_mark = mark_state ?
5653                 mark_state->u.mark.mark_name : NULL;
5654             sayNO;
5655             /* NOTREACHED */
5656         case SKIP:
5657             PL_reginput = locinput;
5658             if (scan->flags) {
5659                 /* (*SKIP) : if we fail we cut here*/
5660                 ST.mark_name = NULL;
5661                 ST.mark_loc = locinput;
5662                 PUSH_STATE_GOTO(SKIP_next,next);
5663             } else {
5664                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5665                    otherwise do nothing.  Meaning we need to scan
5666                  */
5667                 regmatch_state *cur = mark_state;
5668                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5669
5670                 while (cur) {
5671                     if ( sv_eq( cur->u.mark.mark_name,
5672                                 find ) )
5673                     {
5674                         ST.mark_name = find;
5675                         PUSH_STATE_GOTO( SKIP_next, next );
5676                     }
5677                     cur = cur->u.mark.prev_mark;
5678                 }
5679             }
5680             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5681             break;
5682         case SKIP_next_fail:
5683             if (ST.mark_name) {
5684                 /* (*CUT:NAME) - Set up to search for the name as we
5685                    collapse the stack*/
5686                 popmark = ST.mark_name;
5687             } else {
5688                 /* (*CUT) - No name, we cut here.*/
5689                 if (ST.mark_loc > startpoint)
5690                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5691                 /* but we set sv_commit to latest mark_name if there
5692                    is one so they can test to see how things lead to this
5693                    cut */
5694                 if (mark_state)
5695                     sv_commit=mark_state->u.mark.mark_name;
5696             }
5697             no_final = 1;
5698             sayNO;
5699             /* NOTREACHED */
5700 #undef ST
5701         case FOLDCHAR:
5702             n = ARG(scan);
5703             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5704                 locinput += ln;
5705             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5706                 sayNO;
5707             } else  {
5708                 U8 folded[UTF8_MAXBYTES_CASE+1];
5709                 STRLEN foldlen;
5710                 const char * const l = locinput;
5711                 char *e = PL_regeol;
5712                 to_uni_fold(n, folded, &foldlen);
5713
5714                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5715                                l, &e, 0,  utf8_target)) {
5716                         sayNO;
5717                 }
5718                 locinput = e;
5719             }
5720             nextchr = UCHARAT(locinput);
5721             break;
5722         case LNBREAK:
5723             if ((n=is_LNBREAK(locinput,utf8_target))) {
5724                 locinput += n;
5725                 nextchr = UCHARAT(locinput);
5726             } else
5727                 sayNO;
5728             break;
5729
5730 #define CASE_CLASS(nAmE)                              \
5731         case nAmE:                                    \
5732             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5733                 locinput += n;                        \
5734                 nextchr = UCHARAT(locinput);          \
5735             } else                                    \
5736                 sayNO;                                \
5737             break;                                    \
5738         case N##nAmE:                                 \
5739             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5740                 sayNO;                                \
5741             } else {                                  \
5742                 locinput += UTF8SKIP(locinput);       \
5743                 nextchr = UCHARAT(locinput);          \
5744             }                                         \
5745             break
5746
5747         CASE_CLASS(VERTWS);
5748         CASE_CLASS(HORIZWS);
5749 #undef CASE_CLASS
5750
5751         default:
5752             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5753                           PTR2UV(scan), OP(scan));
5754             Perl_croak(aTHX_ "regexp memory corruption");
5755
5756         } /* end switch */
5757
5758         /* switch break jumps here */
5759         scan = next; /* prepare to execute the next op and ... */
5760         continue;    /* ... jump back to the top, reusing st */
5761         /* NOTREACHED */
5762
5763       push_yes_state:
5764         /* push a state that backtracks on success */
5765         st->u.yes.prev_yes_state = yes_state;
5766         yes_state = st;
5767         /* FALL THROUGH */
5768       push_state:
5769         /* push a new regex state, then continue at scan  */
5770         {
5771             regmatch_state *newst;
5772
5773             DEBUG_STACK_r({
5774                 regmatch_state *cur = st;
5775                 regmatch_state *curyes = yes_state;
5776                 int curd = depth;
5777                 regmatch_slab *slab = PL_regmatch_slab;
5778                 for (;curd > -1;cur--,curd--) {
5779                     if (cur < SLAB_FIRST(slab)) {
5780                         slab = slab->prev;
5781                         cur = SLAB_LAST(slab);
5782                     }
5783                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5784                         REPORT_CODE_OFF + 2 + depth * 2,"",
5785                         curd, PL_reg_name[cur->resume_state],
5786                         (curyes == cur) ? "yes" : ""
5787                     );
5788                     if (curyes == cur)
5789                         curyes = cur->u.yes.prev_yes_state;
5790                 }
5791             } else
5792                 DEBUG_STATE_pp("push")
5793             );
5794             depth++;
5795             st->locinput = locinput;
5796             newst = st+1;
5797             if (newst >  SLAB_LAST(PL_regmatch_slab))
5798                 newst = S_push_slab(aTHX);
5799             PL_regmatch_state = newst;
5800
5801             locinput = PL_reginput;
5802             nextchr = UCHARAT(locinput);
5803             st = newst;
5804             continue;
5805             /* NOTREACHED */
5806         }
5807     }
5808
5809     /*
5810     * We get here only if there's trouble -- normally "case END" is
5811     * the terminating point.
5812     */
5813     Perl_croak(aTHX_ "corrupted regexp pointers");
5814     /*NOTREACHED*/
5815     sayNO;
5816
5817 yes:
5818     if (yes_state) {
5819         /* we have successfully completed a subexpression, but we must now
5820          * pop to the state marked by yes_state and continue from there */
5821         assert(st != yes_state);
5822 #ifdef DEBUGGING
5823         while (st != yes_state) {
5824             st--;
5825             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5826                 PL_regmatch_slab = PL_regmatch_slab->prev;
5827                 st = SLAB_LAST(PL_regmatch_slab);
5828             }
5829             DEBUG_STATE_r({
5830                 if (no_final) {
5831                     DEBUG_STATE_pp("pop (no final)");
5832                 } else {
5833                     DEBUG_STATE_pp("pop (yes)");
5834                 }
5835             });
5836             depth--;
5837         }
5838 #else
5839         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5840             || yes_state > SLAB_LAST(PL_regmatch_slab))
5841         {
5842             /* not in this slab, pop slab */
5843             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5844             PL_regmatch_slab = PL_regmatch_slab->prev;
5845             st = SLAB_LAST(PL_regmatch_slab);
5846         }
5847         depth -= (st - yes_state);
5848 #endif
5849         st = yes_state;
5850         yes_state = st->u.yes.prev_yes_state;
5851         PL_regmatch_state = st;
5852
5853         if (no_final) {
5854             locinput= st->locinput;
5855             nextchr = UCHARAT(locinput);
5856         }
5857         state_num = st->resume_state + no_final;
5858         goto reenter_switch;
5859     }
5860
5861     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5862                           PL_colors[4], PL_colors[5]));
5863
5864     if (PL_reg_eval_set) {
5865         /* each successfully executed (?{...}) block does the equivalent of
5866          *   local $^R = do {...}
5867          * When popping the save stack, all these locals would be undone;
5868          * bypass this by setting the outermost saved $^R to the latest
5869          * value */
5870         if (oreplsv != GvSV(PL_replgv))
5871             sv_setsv(oreplsv, GvSV(PL_replgv));
5872     }
5873     result = 1;
5874     goto final_exit;
5875
5876 no:
5877     DEBUG_EXECUTE_r(
5878         PerlIO_printf(Perl_debug_log,
5879             "%*s  %sfailed...%s\n",
5880             REPORT_CODE_OFF+depth*2, "",
5881             PL_colors[4], PL_colors[5])
5882         );
5883
5884 no_silent:
5885     if (no_final) {
5886         if (yes_state) {
5887             goto yes;
5888         } else {
5889             goto final_exit;
5890         }
5891     }
5892     if (depth) {
5893         /* there's a previous state to backtrack to */
5894         st--;
5895         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5896             PL_regmatch_slab = PL_regmatch_slab->prev;
5897             st = SLAB_LAST(PL_regmatch_slab);
5898         }
5899         PL_regmatch_state = st;
5900         locinput= st->locinput;
5901         nextchr = UCHARAT(locinput);
5902
5903         DEBUG_STATE_pp("pop");
5904         depth--;
5905         if (yes_state == st)
5906             yes_state = st->u.yes.prev_yes_state;
5907
5908         state_num = st->resume_state + 1; /* failure = success + 1 */
5909         goto reenter_switch;
5910     }
5911     result = 0;
5912
5913   final_exit:
5914     if (rex->intflags & PREGf_VERBARG_SEEN) {
5915         SV *sv_err = get_sv("REGERROR", 1);
5916         SV *sv_mrk = get_sv("REGMARK", 1);
5917         if (result) {
5918             sv_commit = &PL_sv_no;
5919             if (!sv_yes_mark)
5920                 sv_yes_mark = &PL_sv_yes;
5921         } else {
5922             if (!sv_commit)
5923                 sv_commit = &PL_sv_yes;
5924             sv_yes_mark = &PL_sv_no;
5925         }
5926         sv_setsv(sv_err, sv_commit);
5927         sv_setsv(sv_mrk, sv_yes_mark);
5928     }
5929
5930     /* clean up; in particular, free all slabs above current one */
5931     LEAVE_SCOPE(oldsave);
5932
5933     return result;
5934 }
5935
5936 /*
5937  - regrepeat - repeatedly match something simple, report how many
5938  */
5939 /*
5940  * [This routine now assumes that it will only match on things of length 1.
5941  * That was true before, but now we assume scan - reginput is the count,
5942  * rather than incrementing count on every character.  [Er, except utf8.]]
5943  */
5944 STATIC I32
5945 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5946 {
5947     dVAR;
5948     register char *scan;
5949     register I32 c;
5950     register char *loceol = PL_regeol;
5951     register I32 hardcount = 0;
5952     register bool utf8_target = PL_reg_match_utf8;
5953 #ifndef DEBUGGING
5954     PERL_UNUSED_ARG(depth);
5955 #endif
5956
5957     PERL_ARGS_ASSERT_REGREPEAT;
5958
5959     scan = PL_reginput;
5960     if (max == REG_INFTY)
5961         max = I32_MAX;
5962     else if (max < loceol - scan)
5963         loceol = scan + max;
5964     switch (OP(p)) {
5965     case REG_ANY:
5966         if (utf8_target) {
5967             loceol = PL_regeol;
5968             while (scan < loceol && hardcount < max && *scan != '\n') {
5969                 scan += UTF8SKIP(scan);
5970                 hardcount++;
5971             }
5972         } else {
5973             while (scan < loceol && *scan != '\n')
5974                 scan++;
5975         }
5976         break;
5977     case SANY:
5978         if (utf8_target) {
5979             loceol = PL_regeol;
5980             while (scan < loceol && hardcount < max) {
5981                 scan += UTF8SKIP(scan);
5982                 hardcount++;
5983             }
5984         }
5985         else
5986             scan = loceol;
5987         break;
5988     case CANY:
5989         scan = loceol;
5990         break;
5991     case EXACT:
5992         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5993          * means they match only characters in the string that can be expressed
5994          * as a single byte.  For non-utf8 strings, that means a simple match.
5995          * For utf8 strings, the character matched must be an invariant, or
5996          * downgradable to a single byte.  The pattern's utf8ness is
5997          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5998          * it is, it's an invariant */
5999
6000         c = (U8)*STRING(p);
6001         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6002
6003         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6004             while (scan < loceol && UCHARAT(scan) == c) {
6005                 scan++;
6006             }
6007         }
6008         else {
6009
6010             /* Here, the string is utf8, and the pattern char is different
6011              * in utf8 than not, so can't compare them directly.  Outside the
6012              * loop, find find the two utf8 bytes that represent c, and then
6013              * look for those in sequence in the utf8 string */
6014             U8 high = UTF8_TWO_BYTE_HI(c);
6015             U8 low = UTF8_TWO_BYTE_LO(c);
6016             loceol = PL_regeol;
6017
6018             while (hardcount < max
6019                     && scan + 1 < loceol
6020                     && UCHARAT(scan) == high
6021                     && UCHARAT(scan + 1) == low)
6022             {
6023                 scan += 2;
6024                 hardcount++;
6025             }
6026         }
6027         break;
6028     case EXACTFL:
6029         PL_reg_flags |= RF_tainted;
6030         /* FALL THROUGH */
6031     case EXACTF:
6032     case EXACTFU:
6033
6034         /* The comments for the EXACT case above apply as well to these fold
6035          * ones */
6036
6037         c = (U8)*STRING(p);
6038         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6039
6040         if (utf8_target) { /* Use full Unicode fold matching */
6041
6042             /* For the EXACTFL case, It doesn't really make sense to compare
6043              * locale and utf8, but it is best we can do.  The documents warn
6044              * against mixing them */
6045
6046             char *tmpeol = loceol;
6047             while (hardcount < max
6048                     && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
6049                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
6050             {
6051                 scan = tmpeol;
6052                 tmpeol = loceol;
6053                 hardcount++;
6054             }
6055
6056             /* XXX Note that the above handles properly the German sharp s in
6057              * the pattern matching ss in the string.  But it doesn't handle
6058              * properly cases where the string contains say 'LIGATURE ff' and
6059              * the pattern is 'f+'.  This would require, say, a new function or
6060              * revised interface to foldEQ_utf8(), in which the maximum number
6061              * of characters to match could be passed and it would return how
6062              * many actually did.  This is just one of many cases where
6063              * multi-char folds don't work properly, and so the fix is being
6064              * deferred */
6065         }
6066         else {
6067             U8 folded;
6068
6069             /* Here, the string isn't utf8 and c is a single byte; and either
6070              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6071              * doesn't affect c.  Can just do simple comparisons for exact or
6072              * fold matching. */
6073             switch (OP(p)) {
6074                 case EXACTF: folded = PL_fold[c]; break;
6075                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6076                 case EXACTFL: folded = PL_fold_locale[c]; break;
6077                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6078             }
6079             while (scan < loceol &&
6080                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6081             {
6082                 scan++;
6083             }
6084         }
6085         break;
6086     case ANYOFV:
6087     case ANYOF:
6088         if (utf8_target) {
6089             loceol = PL_regeol;
6090             while (hardcount < max && scan < loceol &&
6091                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
6092                 scan += UTF8SKIP(scan);
6093                 hardcount++;
6094             }
6095         } else {
6096             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6097                 scan++;
6098         }
6099         break;
6100     case ALNUMU:
6101         if (utf8_target) {
6102     utf8_wordchar:
6103             loceol = PL_regeol;
6104             LOAD_UTF8_CHARCLASS_ALNUM();
6105             while (hardcount < max && scan < loceol &&
6106                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6107             {
6108                 scan += UTF8SKIP(scan);
6109                 hardcount++;
6110             }
6111         } else {
6112             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6113                 scan++;
6114             }
6115         }
6116         break;
6117     case ALNUM:
6118         if (utf8_target)
6119             goto utf8_wordchar;
6120         while (scan < loceol && isALNUM((U8) *scan)) {
6121             scan++;
6122         }
6123         break;
6124     case ALNUMA:
6125         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6126             scan++;
6127         }
6128         break;
6129     case ALNUML:
6130         PL_reg_flags |= RF_tainted;
6131         if (utf8_target) {
6132             loceol = PL_regeol;
6133             while (hardcount < max && scan < loceol &&
6134                    isALNUM_LC_utf8((U8*)scan)) {
6135                 scan += UTF8SKIP(scan);
6136                 hardcount++;
6137             }
6138         } else {
6139             while (scan < loceol && isALNUM_LC(*scan))
6140                 scan++;
6141         }
6142         break;
6143     case NALNUMU:
6144         if (utf8_target) {
6145
6146     utf8_Nwordchar:
6147
6148             loceol = PL_regeol;
6149             LOAD_UTF8_CHARCLASS_ALNUM();
6150             while (hardcount < max && scan < loceol &&
6151                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6152             {
6153                 scan += UTF8SKIP(scan);
6154                 hardcount++;
6155             }
6156         } else {
6157             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6158                 scan++;
6159             }
6160         }
6161         break;
6162     case NALNUM:
6163         if (utf8_target)
6164             goto utf8_Nwordchar;
6165         while (scan < loceol && ! isALNUM((U8) *scan)) {
6166             scan++;
6167         }
6168         break;
6169     case NALNUMA:
6170         if (utf8_target) {
6171             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6172                 scan += UTF8SKIP(scan);
6173             }
6174         }
6175         else {
6176             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6177                 scan++;
6178             }
6179         }
6180         break;
6181     case NALNUML:
6182         PL_reg_flags |= RF_tainted;
6183         if (utf8_target) {
6184             loceol = PL_regeol;
6185             while (hardcount < max && scan < loceol &&
6186                    !isALNUM_LC_utf8((U8*)scan)) {
6187                 scan += UTF8SKIP(scan);
6188                 hardcount++;
6189             }
6190         } else {
6191             while (scan < loceol && !isALNUM_LC(*scan))
6192                 scan++;
6193         }
6194         break;
6195     case SPACEU:
6196         if (utf8_target) {
6197
6198     utf8_space:
6199
6200             loceol = PL_regeol;
6201             LOAD_UTF8_CHARCLASS_SPACE();
6202             while (hardcount < max && scan < loceol &&
6203                    (*scan == ' ' ||
6204                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6205             {
6206                 scan += UTF8SKIP(scan);
6207                 hardcount++;
6208             }
6209             break;
6210         }
6211         else {
6212             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6213                 scan++;
6214             }
6215             break;
6216         }
6217     case SPACE:
6218         if (utf8_target)
6219             goto utf8_space;
6220
6221         while (scan < loceol && isSPACE((U8) *scan)) {
6222             scan++;
6223         }
6224         break;
6225     case SPACEA:
6226         while (scan < loceol && isSPACE_A((U8) *scan)) {
6227             scan++;
6228         }
6229         break;
6230     case SPACEL:
6231         PL_reg_flags |= RF_tainted;
6232         if (utf8_target) {
6233             loceol = PL_regeol;
6234             while (hardcount < max && scan < loceol &&
6235                    isSPACE_LC_utf8((U8*)scan)) {
6236                 scan += UTF8SKIP(scan);
6237                 hardcount++;
6238             }
6239         } else {
6240             while (scan < loceol && isSPACE_LC(*scan))
6241                 scan++;
6242         }
6243         break;
6244     case NSPACEU:
6245         if (utf8_target) {
6246
6247     utf8_Nspace:
6248
6249             loceol = PL_regeol;
6250             LOAD_UTF8_CHARCLASS_SPACE();
6251             while (hardcount < max && scan < loceol &&
6252                    ! (*scan == ' ' ||
6253                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6254             {
6255                 scan += UTF8SKIP(scan);
6256                 hardcount++;
6257             }
6258             break;
6259         }
6260         else {
6261             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6262                 scan++;
6263             }
6264         }
6265         break;
6266     case NSPACE:
6267         if (utf8_target)
6268             goto utf8_Nspace;
6269
6270         while (scan < loceol && ! isSPACE((U8) *scan)) {
6271             scan++;
6272         }
6273         break;
6274     case NSPACEA:
6275         if (utf8_target) {
6276             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6277                 scan += UTF8SKIP(scan);
6278             }
6279         }
6280         else {
6281             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6282                 scan++;
6283             }
6284         }
6285         break;
6286     case NSPACEL:
6287         PL_reg_flags |= RF_tainted;
6288         if (utf8_target) {
6289             loceol = PL_regeol;
6290             while (hardcount < max && scan < loceol &&
6291                    !isSPACE_LC_utf8((U8*)scan)) {
6292                 scan += UTF8SKIP(scan);
6293                 hardcount++;
6294             }
6295         } else {
6296             while (scan < loceol && !isSPACE_LC(*scan))
6297                 scan++;
6298         }
6299         break;
6300     case DIGIT:
6301         if (utf8_target) {
6302             loceol = PL_regeol;
6303             LOAD_UTF8_CHARCLASS_DIGIT();
6304             while (hardcount < max && scan < loceol &&
6305                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6306                 scan += UTF8SKIP(scan);
6307                 hardcount++;
6308             }
6309         } else {
6310             while (scan < loceol && isDIGIT(*scan))
6311                 scan++;
6312         }
6313         break;
6314     case DIGITA:
6315         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6316             scan++;
6317         }
6318         break;
6319     case DIGITL:
6320         PL_reg_flags |= RF_tainted;
6321         if (utf8_target) {
6322             loceol = PL_regeol;
6323             while (hardcount < max && scan < loceol &&
6324                    isDIGIT_LC_utf8((U8*)scan)) {
6325                 scan += UTF8SKIP(scan);
6326                 hardcount++;
6327             }
6328         } else {
6329             while (scan < loceol && isDIGIT_LC(*scan))
6330                 scan++;
6331         }
6332         break;
6333     case NDIGIT:
6334         if (utf8_target) {
6335             loceol = PL_regeol;
6336             LOAD_UTF8_CHARCLASS_DIGIT();
6337             while (hardcount < max && scan < loceol &&
6338                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6339                 scan += UTF8SKIP(scan);
6340                 hardcount++;
6341             }
6342         } else {
6343             while (scan < loceol && !isDIGIT(*scan))
6344                 scan++;
6345         }
6346         break;
6347     case NDIGITA:
6348         if (utf8_target) {
6349             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6350                 scan += UTF8SKIP(scan);
6351             }
6352         }
6353         else {
6354             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6355                 scan++;
6356             }
6357         }
6358         break;
6359     case NDIGITL:
6360         PL_reg_flags |= RF_tainted;
6361         if (utf8_target) {
6362             loceol = PL_regeol;
6363             while (hardcount < max && scan < loceol &&
6364                    !isDIGIT_LC_utf8((U8*)scan)) {
6365                 scan += UTF8SKIP(scan);
6366                 hardcount++;
6367             }
6368         } else {
6369             while (scan < loceol && !isDIGIT_LC(*scan))
6370                 scan++;
6371         }
6372         break;
6373     case LNBREAK:
6374         if (utf8_target) {
6375             loceol = PL_regeol;
6376             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6377                 scan += c;
6378                 hardcount++;
6379             }
6380         } else {
6381             /*
6382               LNBREAK can match two latin chars, which is ok,
6383               because we have a null terminated string, but we
6384               have to use hardcount in this situation
6385             */
6386             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6387                 scan+=c;
6388                 hardcount++;
6389             }
6390         }
6391         break;
6392     case HORIZWS:
6393         if (utf8_target) {
6394             loceol = PL_regeol;
6395             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6396                 scan += c;
6397                 hardcount++;
6398             }
6399         } else {
6400             while (scan < loceol && is_HORIZWS_latin1(scan))
6401                 scan++;
6402         }
6403         break;
6404     case NHORIZWS:
6405         if (utf8_target) {
6406             loceol = PL_regeol;
6407             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6408                 scan += UTF8SKIP(scan);
6409                 hardcount++;
6410             }
6411         } else {
6412             while (scan < loceol && !is_HORIZWS_latin1(scan))
6413                 scan++;
6414
6415         }
6416         break;
6417     case VERTWS:
6418         if (utf8_target) {
6419             loceol = PL_regeol;
6420             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6421                 scan += c;
6422                 hardcount++;
6423             }
6424         } else {
6425             while (scan < loceol && is_VERTWS_latin1(scan))
6426                 scan++;
6427
6428         }
6429         break;
6430     case NVERTWS:
6431         if (utf8_target) {
6432             loceol = PL_regeol;
6433             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6434                 scan += UTF8SKIP(scan);
6435                 hardcount++;
6436             }
6437         } else {
6438             while (scan < loceol && !is_VERTWS_latin1(scan))
6439                 scan++;
6440
6441         }
6442         break;
6443
6444     default:            /* Called on something of 0 width. */
6445         break;          /* So match right here or not at all. */
6446     }
6447
6448     if (hardcount)
6449         c = hardcount;
6450     else
6451         c = scan - PL_reginput;
6452     PL_reginput = scan;
6453
6454     DEBUG_r({
6455         GET_RE_DEBUG_FLAGS_DECL;
6456         DEBUG_EXECUTE_r({
6457             SV * const prop = sv_newmortal();
6458             regprop(prog, prop, p);
6459             PerlIO_printf(Perl_debug_log,
6460                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6461                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6462         });
6463     });
6464
6465     return(c);
6466 }
6467
6468
6469 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6470 /*
6471 - regclass_swash - prepare the utf8 swash
6472 */
6473
6474 SV *
6475 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6476 {
6477     dVAR;
6478     SV *sw  = NULL;
6479     SV *si  = NULL;
6480     SV *alt = NULL;
6481     RXi_GET_DECL(prog,progi);
6482     const struct reg_data * const data = prog ? progi->data : NULL;
6483
6484     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6485
6486     if (data && data->count) {
6487         const U32 n = ARG(node);
6488
6489         if (data->what[n] == 's') {
6490             SV * const rv = MUTABLE_SV(data->data[n]);
6491             AV * const av = MUTABLE_AV(SvRV(rv));
6492             SV **const ary = AvARRAY(av);
6493             SV **a, **b;
6494
6495             /* See the end of regcomp.c:S_regclass() for
6496              * documentation of these array elements. */
6497
6498             si = *ary;
6499             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6500             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6501
6502             if (a)
6503                 sw = *a;
6504             else if (si && doinit) {
6505                 sw = swash_init("utf8", "", si, 1, 0);
6506                 (void)av_store(av, 1, sw);
6507             }
6508             if (b)
6509                 alt = *b;
6510         }
6511     }
6512
6513     if (listsvp)
6514         *listsvp = si;
6515     if (altsvp)
6516         *altsvp  = alt;
6517
6518     return sw;
6519 }
6520 #endif
6521
6522 /*
6523  - reginclass - determine if a character falls into a character class
6524
6525   n is the ANYOF regnode
6526   p is the target string
6527   lenp is pointer to the maximum number of bytes of how far to go in p
6528     (This is assumed wthout checking to always be at least the current
6529     character's size)
6530   utf8_target tells whether p is in UTF-8.
6531
6532   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6533   from a successful match, the value it points to will be updated to how many
6534   bytes in p were matched.  If there was no match, the value is undefined,
6535   possibly changed from the input.
6536
6537   Note that this can be a synthetic start class, a combination of various
6538   nodes, so things you think might be mutually exclusive, such as locale,
6539   aren't.  It can match both locale and non-locale
6540
6541  */
6542
6543 STATIC bool
6544 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6545 {
6546     dVAR;
6547     const char flags = ANYOF_FLAGS(n);
6548     bool match = FALSE;
6549     UV c = *p;
6550     STRLEN c_len = 0;
6551     STRLEN maxlen;
6552
6553     PERL_ARGS_ASSERT_REGINCLASS;
6554
6555     /* If c is not already the code point, get it */
6556     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6557         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6558                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6559                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6560                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6561                  * UTF8_ALLOW_FFFF */
6562         if (c_len == (STRLEN)-1)
6563             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6564     }
6565     else {
6566         c_len = 1;
6567     }
6568
6569     /* Use passed in max length, or one character if none passed in or less
6570      * than one character.  And assume will match just one character.  This is
6571      * overwritten later if matched more. */
6572     if (lenp) {
6573         maxlen = (*lenp > c_len) ? *lenp : c_len;
6574         *lenp = c_len;
6575
6576     }
6577     else {
6578         maxlen = c_len;
6579     }
6580
6581     /* If this character is potentially in the bitmap, check it */
6582     if (c < 256) {
6583         if (ANYOF_BITMAP_TEST(n, c))
6584             match = TRUE;
6585         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6586                 && ! utf8_target
6587                 && ! isASCII(c))
6588         {
6589             match = TRUE;
6590         }
6591
6592         else if (flags & ANYOF_LOCALE) {
6593             PL_reg_flags |= RF_tainted;
6594
6595             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6596                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6597             {
6598                 match = TRUE;
6599             }
6600             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6601                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6602                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6603                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6604                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6605                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6606                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6607                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6608                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6609                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6610                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6611                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6612                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6613                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6614                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6615                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6616                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6617                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6618                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6619                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6620                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6621                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6622                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6623                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6624                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6625                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6626                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6627                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6628                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6629                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6630                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6631                      ) /* How's that for a conditional? */
6632             ) {
6633                 match = TRUE;
6634             }
6635         }
6636     }
6637
6638     /* If the bitmap didn't (or couldn't) match, and something outside the
6639      * bitmap could match, try that */
6640     if (!match) {
6641         if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
6642             if (c >= 256
6643                 || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a
6644                                                           non-Latin1 fold
6645                                                           should match */
6646                     && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
6647             {
6648                 match = TRUE;
6649             }
6650         }
6651         if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6652                        || (utf8_target && flags & ANYOF_UTF8)))
6653         {
6654             AV *av;
6655             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6656
6657             if (sw) {
6658                 U8 * utf8_p;
6659                 if (utf8_target) {
6660                     utf8_p = (U8 *) p;
6661                 } else {
6662
6663                     /* Not utf8.  Convert as much of the string as available up
6664                      * to the limit of how far the (single) character in the
6665                      * pattern can possibly match (no need to go further).  If
6666                      * the node is a straight ANYOF or not folding, it can't
6667                      * match more than one.  Otherwise, It can match up to how
6668                      * far a single char can fold to.  Since not utf8, each
6669                      * character is a single byte, so the max it can be in
6670                      * bytes is the same as the max it can be in characters */
6671                     STRLEN len = (OP(n) == ANYOF
6672                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6673                                   ? 1
6674                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6675                                     ? maxlen
6676                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6677                     utf8_p = bytes_to_utf8(p, &len);
6678                 }
6679
6680                 if (swash_fetch(sw, utf8_p, TRUE))
6681                     match = TRUE;
6682                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6683
6684                     /* Here, we need to test if the fold of the target string
6685                      * matches.  In the case of a multi-char fold that is
6686                      * caught by regcomp.c, it has stored all such folds into
6687                      * 'av'; we linearly check to see if any match the target
6688                      * string (folded).   We know that the originals were each
6689                      * one character, but we don't currently know how many
6690                      * characters/bytes each folded to, except we do know that
6691                      * there are small limits imposed by Unicode.  XXX A
6692                      * performance enhancement would be to have regcomp.c store
6693                      * the max number of chars/bytes that are in an av entry,
6694                      * as, say the 0th element.  Even better would be to have a
6695                      * hash of the few characters that can start a multi-char
6696                      * fold to the max number of chars of those folds.
6697                      *
6698                      * Further down, if there isn't a
6699                      * match in the av, we will check if there is another
6700                      * fold-type match.  For that, we also need the fold, but
6701                      * only the first character.  No sense in folding it twice,
6702                      * so we do it here, even if there isn't any multi-char
6703                      * fold, so we always fold at least the first character.
6704                      * If the node is a straight ANYOF node, or there is only
6705                      * one character available in the string, or if there isn't
6706                      * any av, that's all we have to fold.  In the case of a
6707                      * multi-char fold, we do have guarantees in Unicode that
6708                      * it can only expand up to so many characters and so many
6709                      * bytes.  We keep track so don't exceed either.
6710                      *
6711                      * If there is a match, we will need to advance (if lenp is
6712                      * specified) the match pointer in the target string.  But
6713                      * what we are comparing here isn't that string directly,
6714                      * but its fold, whose length may differ from the original.
6715                      * As we go along in constructing the fold, therefore, we
6716                      * create a map so that we know how many bytes in the
6717                      * source to advance given that we have matched a certain
6718                      * number of bytes in the fold.  This map is stored in
6719                      * 'map_fold_len_back'.  The first character in the fold
6720                      * has array element 1 contain the number of bytes in the
6721                      * source that folded to it; the 2nd is the cumulative
6722                      * number to match it; ... */
6723                     U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 };
6724                     U8 folded[UTF8_MAXBYTES_CASE+1];
6725                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6726                     STRLEN foldlen_for_av; /* num bytes in fold of all chars */
6727
6728                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6729
6730                         /* Here, only need to fold the first char of the target
6731                          * string */
6732                         to_utf8_fold(utf8_p, folded, &foldlen);
6733                         foldlen_for_av = foldlen;
6734                         map_fold_len_back[1] = UTF8SKIP(utf8_p);
6735                     }
6736                     else {
6737
6738                         /* Here, need to fold more than the first char.  Do so
6739                          * up to the limits */
6740                         UV which_char = 0;
6741                         U8* source_ptr = utf8_p;    /* The source for the fold
6742                                                        is the regex target
6743                                                        string */
6744                         U8* folded_ptr = folded;
6745                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6746                                                        available byte in the
6747                                                        target string */
6748                         while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND
6749                                && source_ptr < e)
6750                         {
6751
6752                             /* Fold the next character */
6753                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6754                             STRLEN this_char_foldlen;
6755                             to_utf8_fold(source_ptr,
6756                                          this_char_folded,
6757                                          &this_char_foldlen);
6758
6759                             /* Bail if it would exceed the byte limit for
6760                              * folding a single char. */
6761                             if (this_char_foldlen + folded_ptr - folded >
6762                                                             UTF8_MAXBYTES_CASE)
6763                             {
6764                                 break;
6765                             }
6766
6767                             /* Save the first character's folded length, in
6768                              * case we have to use it later */
6769                             if (! foldlen) {
6770                                 foldlen = this_char_foldlen;
6771                             }
6772
6773                             /* Here, add the fold of this character */
6774                             Copy(this_char_folded,
6775                                  folded_ptr,
6776                                  this_char_foldlen,
6777                                  U8);
6778                             which_char++;
6779                             map_fold_len_back[which_char] =
6780                                 map_fold_len_back[which_char - 1]
6781                                 + UTF8SKIP(source_ptr);
6782                             folded_ptr += this_char_foldlen;
6783                             source_ptr += UTF8SKIP(source_ptr);
6784                         }
6785                         *folded_ptr = '\0';
6786                         foldlen_for_av = folded_ptr - folded;
6787                     }
6788
6789
6790                     /* Do the linear search to see if the fold is in the list
6791                      * of multi-char folds.  (Useless to look if won't be able
6792                      * to store that it is a multi-char fold in *lenp) */
6793                     if (lenp && av) {
6794                         I32 i;
6795                         for (i = 0; i <= av_len(av); i++) {
6796                             SV* const sv = *av_fetch(av, i, FALSE);
6797                             STRLEN len;
6798                             const char * const s = SvPV_const(sv, len);
6799                             if (len <= foldlen_for_av && memEQ(s,
6800                                                                (char*)folded,
6801                                                                len))
6802                             {
6803
6804                                 /* Advance the target string ptr to account for
6805                                  * this fold, but have to translate from the
6806                                  * folded length to the corresponding source
6807                                  * length.  The array is indexed by how many
6808                                  * characters in the match */
6809                                 *lenp = map_fold_len_back[
6810                                         utf8_length(folded, folded + len)];
6811                                 match = TRUE;
6812                                 break;
6813                             }
6814                         }
6815                     }
6816 #if 0
6817                     if (!match) { /* See if the folded version matches */
6818                         SV** listp;
6819
6820                         /* Consider "k" =~ /[K]/i.  The line above would have
6821                          * just folded the 'k' to itself, and that isn't going
6822                          * to match 'K'.  So we look through the closure of
6823                          * everything that folds to 'k'.  That will find the
6824                          * 'K'.  Initialize the list, if necessary */
6825                         if (! PL_utf8_foldclosures) {
6826
6827                             /* If the folds haven't been read in, call a fold
6828                              * function to force that */
6829                             if (! PL_utf8_tofold) {
6830                                 U8 dummy[UTF8_MAXBYTES+1];
6831                                 STRLEN dummy_len;
6832                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6833                             }
6834                             PL_utf8_foldclosures =
6835                                   _swash_inversion_hash(PL_utf8_tofold);
6836                         }
6837
6838                         /* The data structure is a hash with the keys every
6839                          * character that is folded to, like 'k', and the
6840                          * values each an array of everything that folds to its
6841                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6842                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6843                                       (char *) folded, foldlen, FALSE)))
6844                         {
6845                             AV* list = (AV*) *listp;
6846                             IV i;
6847                             for (i = 0; i <= av_len(list); i++) {
6848                                 SV** try_p = av_fetch(list, i, FALSE);
6849                                 char* try_c;
6850                                 if (try_p == NULL) {
6851                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6852                                 }
6853                                 /* Don't have to worry about embedded nulls
6854                                  * since NULL isn't folded or foldable */
6855                                 try_c = SvPVX(*try_p);
6856
6857                                 /* The fold in a few cases  of an above Latin1
6858                                  * char is in the Latin1 range, and hence may
6859                                  * be in the bitmap */
6860                                 if (UTF8_IS_INVARIANT(*try_c)
6861                                     && ANYOF_BITMAP_TEST(n,
6862                                                     UNI_TO_NATIVE(*try_c)))
6863                                 {
6864                                     match = TRUE;
6865                                     break;
6866                                 }
6867                                 else if
6868                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6869                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6870                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6871                                                                     try_c[1]))))
6872                                 {
6873                                    /* Since the fold comes from internally
6874                                     * generated data, we can safely assume it
6875                                     * is valid utf8 in the test above */
6876                                     match = TRUE;
6877                                     break;
6878                                 } else if (swash_fetch(sw, (U8*) try_c, TRUE)) {
6879                                     match = TRUE;
6880                                     break;
6881                                 }
6882                             }
6883                         }
6884                     }
6885 #endif
6886                 }
6887
6888                 /* If we allocated a string above, free it */
6889                 if (! utf8_target) Safefree(utf8_p);
6890             }
6891         }
6892     }
6893
6894     return (flags & ANYOF_INVERT) ? !match : match;
6895 }
6896
6897 STATIC U8 *
6898 S_reghop3(U8 *s, I32 off, const U8* lim)
6899 {
6900     dVAR;
6901
6902     PERL_ARGS_ASSERT_REGHOP3;
6903
6904     if (off >= 0) {
6905         while (off-- && s < lim) {
6906             /* XXX could check well-formedness here */
6907             s += UTF8SKIP(s);
6908         }
6909     }
6910     else {
6911         while (off++ && s > lim) {
6912             s--;
6913             if (UTF8_IS_CONTINUED(*s)) {
6914                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6915                     s--;
6916             }
6917             /* XXX could check well-formedness here */
6918         }
6919     }
6920     return s;
6921 }
6922
6923 #ifdef XXX_dmq
6924 /* there are a bunch of places where we use two reghop3's that should
6925    be replaced with this routine. but since thats not done yet
6926    we ifdef it out - dmq
6927 */
6928 STATIC U8 *
6929 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6930 {
6931     dVAR;
6932
6933     PERL_ARGS_ASSERT_REGHOP4;
6934
6935     if (off >= 0) {
6936         while (off-- && s < rlim) {
6937             /* XXX could check well-formedness here */
6938             s += UTF8SKIP(s);
6939         }
6940     }
6941     else {
6942         while (off++ && s > llim) {
6943             s--;
6944             if (UTF8_IS_CONTINUED(*s)) {
6945                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6946                     s--;
6947             }
6948             /* XXX could check well-formedness here */
6949         }
6950     }
6951     return s;
6952 }
6953 #endif
6954
6955 STATIC U8 *
6956 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6957 {
6958     dVAR;
6959
6960     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6961
6962     if (off >= 0) {
6963         while (off-- && s < lim) {
6964             /* XXX could check well-formedness here */
6965             s += UTF8SKIP(s);
6966         }
6967         if (off >= 0)
6968             return NULL;
6969     }
6970     else {
6971         while (off++ && s > lim) {
6972             s--;
6973             if (UTF8_IS_CONTINUED(*s)) {
6974                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6975                     s--;
6976             }
6977             /* XXX could check well-formedness here */
6978         }
6979         if (off <= 0)
6980             return NULL;
6981     }
6982     return s;
6983 }
6984
6985 static void
6986 restore_pos(pTHX_ void *arg)
6987 {
6988     dVAR;
6989     regexp * const rex = (regexp *)arg;
6990     if (PL_reg_eval_set) {
6991         if (PL_reg_oldsaved) {
6992             rex->subbeg = PL_reg_oldsaved;
6993             rex->sublen = PL_reg_oldsavedlen;
6994 #ifdef PERL_OLD_COPY_ON_WRITE
6995             rex->saved_copy = PL_nrs;
6996 #endif
6997             RXp_MATCH_COPIED_on(rex);
6998         }
6999         PL_reg_magic->mg_len = PL_reg_oldpos;
7000         PL_reg_eval_set = 0;
7001         PL_curpm = PL_reg_oldcurpm;
7002     }
7003 }
7004
7005 STATIC void
7006 S_to_utf8_substr(pTHX_ register regexp *prog)
7007 {
7008     int i = 1;
7009
7010     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7011
7012     do {
7013         if (prog->substrs->data[i].substr
7014             && !prog->substrs->data[i].utf8_substr) {
7015             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7016             prog->substrs->data[i].utf8_substr = sv;
7017             sv_utf8_upgrade(sv);
7018             if (SvVALID(prog->substrs->data[i].substr)) {
7019                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
7020                 if (flags & FBMcf_TAIL) {
7021                     /* Trim the trailing \n that fbm_compile added last
7022                        time.  */
7023                     SvCUR_set(sv, SvCUR(sv) - 1);
7024                     /* Whilst this makes the SV technically "invalid" (as its
7025                        buffer is no longer followed by "\0") when fbm_compile()
7026                        adds the "\n" back, a "\0" is restored.  */
7027                 }
7028                 fbm_compile(sv, flags);
7029             }
7030             if (prog->substrs->data[i].substr == prog->check_substr)
7031                 prog->check_utf8 = sv;
7032         }
7033     } while (i--);
7034 }
7035
7036 STATIC void
7037 S_to_byte_substr(pTHX_ register regexp *prog)
7038 {
7039     dVAR;
7040     int i = 1;
7041
7042     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7043
7044     do {
7045         if (prog->substrs->data[i].utf8_substr
7046             && !prog->substrs->data[i].substr) {
7047             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7048             if (sv_utf8_downgrade(sv, TRUE)) {
7049                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7050                     const U8 flags
7051                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
7052                     if (flags & FBMcf_TAIL) {
7053                         /* Trim the trailing \n that fbm_compile added last
7054                            time.  */
7055                         SvCUR_set(sv, SvCUR(sv) - 1);
7056                     }
7057                     fbm_compile(sv, flags);
7058                 }
7059             } else {
7060                 SvREFCNT_dec(sv);
7061                 sv = &PL_sv_undef;
7062             }
7063             prog->substrs->data[i].substr = sv;
7064             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7065                 prog->check_substr = sv;
7066         }
7067     } while (i--);
7068 }
7069
7070 /*
7071  * Local variables:
7072  * c-indentation-style: bsd
7073  * c-basic-offset: 4
7074  * indent-tabs-mode: t
7075  * End:
7076  *
7077  * ex: set ts=8 sts=4 sw=4 noet:
7078  */