regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 154
 155 /* The actual code for CCC_TRY, which uses several variables from the routine
 156  * it's callable from.  It is designed to be the bulk of a case statement.
 157  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 158  *      nextchr matches the class.
 159  * UTF8_TEST is the whole test string to use for utf8 targets
 160  * LOAD is what to use to test, and if not present to load in the swash for the
 161  *      class
 162  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 163  *      UTF8_TEST test.
 164  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 165  * utf8 and a variant, load the swash if necessary and test using the utf8
 166  * test.  Advance to the next character if test is ok, otherwise fail; If not
 167  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 168  * fails, or advance to the next character */
 169
 170 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 171     if (locinput >= PL_regeol) {                                              \
 172         sayNO;                                                                \
 173     }                                                                         \
 174     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 175         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 176         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 177             sayNO;                                                            \
 178         }                                                                     \
 179         locinput += PL_utf8skip[nextchr];                                     \
 180         nextchr = UCHARAT(locinput);                                          \
 181         break;                                                                \
 182     }                                                                         \
 183     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     nextchr = UCHARAT(++locinput);                                            \
 187     break;
 188
 189 /* Handle the non-locale cases for a character class and its complement.  It
 190  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 191  * This is because that code fails when the test succeeds, so we want to have
 192  * the test fail so that the code succeeds.  The swash is stored in a
 193  * predictable PL_ place */
 194 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 195                            CLASS, STR)                                        \
 196     case NAME:                                                                \
 197         _CCC_TRY_CODE( !, FUNC,                                               \
 198                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 199                                             (U8*)locinput, TRUE)),            \
 200                           CLASS, STR)                                         \
 201     case NNAME:                                                               \
 202         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 203                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 204                                             (U8*)locinput, TRUE)),            \
 205                           CLASS, STR)                                         \
 206
 207 /* Generate the case statements for both locale and non-locale character
 208  * classes in regmatch for classes that don't have special unicode semantics.
 209  * Locales don't use an immediate swash, but an intermediary special locale
 210  * function that is called on the pointer to the current place in the input
 211  * string.  That function will resolve to needing the same swash.  One might
 212  * think that because we don't know what the locale will match, we shouldn't
 213  * check with the swash loading function that it loaded properly; ie, that we
 214  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 215  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 216  * irrelevant here */
 217 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 218                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 219                 NAMEA, NNAMEA, FUNCA,                                         \
 220                 CLASS, STR)                                                   \
 221     case NAMEL:                                                               \
 222         PL_reg_flags |= RF_tainted;                                           \
 223         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 224     case NNAMEL:                                                              \
 225         PL_reg_flags |= RF_tainted;                                           \
 226         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 227                        CLASS, STR)                                            \
 228     case NAMEA:                                                               \
 229         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 230             sayNO;                                                            \
 231         }                                                                     \
 232         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 233         nextchr = UCHARAT(++locinput);                                        \
 234         break;                                                                \
 235     case NNAMEA:                                                              \
 236         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 237             sayNO;                                                            \
 238         }                                                                     \
 239         if (utf8_target) {                                                    \
 240             locinput += PL_utf8skip[nextchr];                                 \
 241             nextchr = UCHARAT(locinput);                                      \
 242         }                                                                     \
 243         else {                                                                \
 244             nextchr = UCHARAT(++locinput);                                    \
 245         }                                                                     \
 246         break;                                                                \
 247     /* Generate the non-locale cases */                                       \
 248     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 249
 250 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 251  * statements to handle separate Unicode semantics nodes */
 252 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 253                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 254                   NAMEU, NNAMEU, FUNCU,                                        \
 255                   NAMEA, NNAMEA, FUNCA,                                        \
 256                   CLASS, STR)                                                  \
 257     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 258             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 259             NAMEA, NNAMEA, FUNCA,                                              \
 260             CLASS, STR)                                                        \
 261     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 262
 263 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 264
 265 /* for use after a quantifier and before an EXACT-like node -- japhy */
 266 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 267  *
 268  * NOTE that *nothing* that affects backtracking should be in here, specifically
 269  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 270  * node that is in between two EXACT like nodes when ascertaining what the required
 271  * "follow" character is. This should probably be moved to regex compile time
 272  * although it may be done at run time beause of the REF possibility - more
 273  * investigation required. -- demerphq
 274 */
 275 #define JUMPABLE(rn) (      \
 276     OP(rn) == OPEN ||       \
 277     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 278     OP(rn) == EVAL ||   \
 279     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 280     OP(rn) == PLUS || OP(rn) == MINMOD || \
 281     OP(rn) == KEEPS || \
 282     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 283 )
 284 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 285
 286 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 287
 288 #if 0
 289 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 290    we don't need this definition. */
 291 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 292 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 293 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 294
 295 #else
 296 /* ... so we use this as its faster. */
 297 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 298 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 299 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 300 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 301
 302 #endif
 303
 304 /*
 305   Search for mandatory following text node; for lookahead, the text must
 306   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 307 */
 308 #define FIND_NEXT_IMPT(rn) STMT_START { \
 309     while (JUMPABLE(rn)) { \
 310         const OPCODE type = OP(rn); \
 311         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 312             rn = NEXTOPER(NEXTOPER(rn)); \
 313         else if (type == PLUS) \
 314             rn = NEXTOPER(rn); \
 315         else if (type == IFMATCH) \
 316             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 317         else rn += NEXT_OFF(rn); \
 318     } \
 319 } STMT_END
 320
 321
 322 static void restore_pos(pTHX_ void *arg);
 323
 324 #define REGCP_PAREN_ELEMS 4
 325 #define REGCP_OTHER_ELEMS 5
 326 #define REGCP_FRAME_ELEMS 1
 327 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 328  * are needed for the regexp context stack bookkeeping. */
 329
 330 STATIC CHECKPOINT
 331 S_regcppush(pTHX_ I32 parenfloor)
 332 {
 333     dVAR;
 334     const int retval = PL_savestack_ix;
 335     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 336     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 337     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 338     int p;
 339     GET_RE_DEBUG_FLAGS_DECL;
 340
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 343
 344     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 346                    " out of range (%lu-%ld)",
 347                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 348
 349     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 350
 351     for (p = PL_regsize; p > parenfloor; p--) {
 352 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 353         SSPUSHINT(PL_regoffs[p].end);
 354         SSPUSHINT(PL_regoffs[p].start);
 355         SSPUSHPTR(PL_reg_start_tmp[p]);
 356         SSPUSHINT(p);
 357         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 358           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 359                       (UV)p, (IV)PL_regoffs[p].start,
 360                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 361                       (IV)PL_regoffs[p].end
 362         ));
 363     }
 364 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 365     SSPUSHPTR(PL_regoffs);
 366     SSPUSHINT(PL_regsize);
 367     SSPUSHINT(*PL_reglastparen);
 368     SSPUSHINT(*PL_reglastcloseparen);
 369     SSPUSHPTR(PL_reginput);
 370     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 371
 372     return retval;
 373 }
 374
 375 /* These are needed since we do not localize EVAL nodes: */
 376 #define REGCP_SET(cp)                                           \
 377     DEBUG_STATE_r(                                              \
 378             PerlIO_printf(Perl_debug_log,                       \
 379                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 380                 (IV)PL_savestack_ix));                          \
 381     cp = PL_savestack_ix
 382
 383 #define REGCP_UNWIND(cp)                                        \
 384     DEBUG_STATE_r(                                              \
 385         if (cp != PL_savestack_ix)                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 388                 (IV)(cp), (IV)PL_savestack_ix));                \
 389     regcpblow(cp)
 390
 391 STATIC char *
 392 S_regcppop(pTHX_ const regexp *rex)
 393 {
 394     dVAR;
 395     UV i;
 396     char *input;
 397     GET_RE_DEBUG_FLAGS_DECL;
 398
 399     PERL_ARGS_ASSERT_REGCPPOP;
 400
 401     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 402     i = SSPOPUV;
 403     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 404     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 405     input = (char *) SSPOPPTR;
 406     *PL_reglastcloseparen = SSPOPINT;
 407     *PL_reglastparen = SSPOPINT;
 408     PL_regsize = SSPOPINT;
 409     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 410
 411     i -= REGCP_OTHER_ELEMS;
 412     /* Now restore the parentheses context. */
 413     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 414         I32 tmps;
 415         U32 paren = (U32)SSPOPINT;
 416         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 417         PL_regoffs[paren].start = SSPOPINT;
 418         tmps = SSPOPINT;
 419         if (paren <= *PL_reglastparen)
 420             PL_regoffs[paren].end = tmps;
 421         DEBUG_BUFFERS_r(
 422             PerlIO_printf(Perl_debug_log,
 423                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 424                           (UV)paren, (IV)PL_regoffs[paren].start,
 425                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 426                           (IV)PL_regoffs[paren].end,
 427                           (paren > *PL_reglastparen ? "(no)" : ""));
 428         );
 429     }
 430     DEBUG_BUFFERS_r(
 431         if (*PL_reglastparen + 1 <= rex->nparens) {
 432             PerlIO_printf(Perl_debug_log,
 433                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 434                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 435         }
 436     );
 437 #if 1
 438     /* It would seem that the similar code in regtry()
 439      * already takes care of this, and in fact it is in
 440      * a better location to since this code can #if 0-ed out
 441      * but the code in regtry() is needed or otherwise tests
 442      * requiring null fields (pat.t#187 and split.t#{13,14}
 443      * (as of patchlevel 7877)  will fail.  Then again,
 444      * this code seems to be necessary or otherwise
 445      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 446      * --jhi updated by dapm */
 447     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 448         if (i > PL_regsize)
 449             PL_regoffs[i].start = -1;
 450         PL_regoffs[i].end = -1;
 451     }
 452 #endif
 453     return input;
 454 }
 455
 456 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 457
 458 /*
 459  * pregexec and friends
 460  */
 461
 462 #ifndef PERL_IN_XSUB_RE
 463 /*
 464  - pregexec - match a regexp against a string
 465  */
 466 I32
 467 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 468          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 469 /* strend: pointer to null at end of string */
 470 /* strbeg: real beginning of string */
 471 /* minend: end of match must be >=minend after stringarg. */
 472 /* nosave: For optimizations. */
 473 {
 474     PERL_ARGS_ASSERT_PREGEXEC;
 475
 476     return
 477         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 478                       nosave ? 0 : REXEC_COPY_STR);
 479 }
 480 #endif
 481
 482 /*
 483  * Need to implement the following flags for reg_anch:
 484  *
 485  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 486  * USE_INTUIT_ML
 487  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 488  * INTUIT_AUTORITATIVE_ML
 489  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 490  * INTUIT_ONCE_ML
 491  *
 492  * Another flag for this function: SECOND_TIME (so that float substrs
 493  * with giant delta may be not rechecked).
 494  */
 495
 496 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 497
 498 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 499    Otherwise, only SvCUR(sv) is used to get strbeg. */
 500
 501 /* XXXX We assume that strpos is strbeg unless sv. */
 502
 503 /* XXXX Some places assume that there is a fixed substring.
 504         An update may be needed if optimizer marks as "INTUITable"
 505         RExen without fixed substrings.  Similarly, it is assumed that
 506         lengths of all the strings are no more than minlen, thus they
 507         cannot come from lookahead.
 508         (Or minlen should take into account lookahead.)
 509   NOTE: Some of this comment is not correct. minlen does now take account
 510   of lookahead/behind. Further research is required. -- demerphq
 511
 512 */
 513
 514 /* A failure to find a constant substring means that there is no need to make
 515    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 516    finding a substring too deep into the string means that less calls to
 517    regtry() should be needed.
 518
 519    REx compiler's optimizer found 4 possible hints:
 520         a) Anchored substring;
 521         b) Fixed substring;
 522         c) Whether we are anchored (beginning-of-line or \G);
 523         d) First node (of those at offset 0) which may distinguish positions;
 524    We use a)b)d) and multiline-part of c), and try to find a position in the
 525    string which does not contradict any of them.
 526  */
 527
 528 /* Most of decisions we do here should have been done at compile time.
 529    The nodes of the REx which we used for the search should have been
 530    deleted from the finite automaton. */
 531
 532 char *
 533 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 534                      char *strend, const U32 flags, re_scream_pos_data *data)
 535 {
 536     dVAR;
 537     struct regexp *const prog = (struct regexp *)SvANY(rx);
 538     register I32 start_shift = 0;
 539     /* Should be nonnegative! */
 540     register I32 end_shift   = 0;
 541     register char *s;
 542     register SV *check;
 543     char *strbeg;
 544     char *t;
 545     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 546     I32 ml_anch;
 547     register char *other_last = NULL;   /* other substr checked before this */
 548     char *check_at = NULL;              /* check substr found at this pos */
 549     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 550     RXi_GET_DECL(prog,progi);
 551 #ifdef DEBUGGING
 552     const char * const i_strpos = strpos;
 553 #endif
 554     GET_RE_DEBUG_FLAGS_DECL;
 555
 556     PERL_ARGS_ASSERT_RE_INTUIT_START;
 557
 558     RX_MATCH_UTF8_set(rx,utf8_target);
 559
 560     if (RX_UTF8(rx)) {
 561         PL_reg_flags |= RF_utf8;
 562     }
 563     DEBUG_EXECUTE_r(
 564         debug_start_match(rx, utf8_target, strpos, strend,
 565             sv ? "Guessing start of match in sv for"
 566                : "Guessing start of match in string for");
 567               );
 568
 569     /* CHR_DIST() would be more correct here but it makes things slow. */
 570     if (prog->minlen > strend - strpos) {
 571         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 572                               "String too short... [re_intuit_start]\n"));
 573         goto fail;
 574     }
 575
 576     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 577     PL_regeol = strend;
 578     if (utf8_target) {
 579         if (!prog->check_utf8 && prog->check_substr)
 580             to_utf8_substr(prog);
 581         check = prog->check_utf8;
 582     } else {
 583         if (!prog->check_substr && prog->check_utf8)
 584             to_byte_substr(prog);
 585         check = prog->check_substr;
 586     }
 587     if (check == &PL_sv_undef) {
 588         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 589                 "Non-utf8 string cannot match utf8 check string\n"));
 590         goto fail;
 591     }
 592     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 593         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 594                      || ( (prog->extflags & RXf_ANCH_BOL)
 595                           && !multiline ) );    /* Check after \n? */
 596
 597         if (!ml_anch) {
 598           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 599                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 600                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 601                && sv && !SvROK(sv)
 602                && (strpos != strbeg)) {
 603               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 604               goto fail;
 605           }
 606           if (prog->check_offset_min == prog->check_offset_max &&
 607               !(prog->extflags & RXf_CANY_SEEN)) {
 608             /* Substring at constant offset from beg-of-str... */
 609             I32 slen;
 610
 611             s = HOP3c(strpos, prog->check_offset_min, strend);
 612
 613             if (SvTAIL(check)) {
 614                 slen = SvCUR(check);    /* >= 1 */
 615
 616                 if ( strend - s > slen || strend - s < slen - 1
 617                      || (strend - s == slen && strend[-1] != '\n')) {
 618                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 619                     goto fail_finish;
 620                 }
 621                 /* Now should match s[0..slen-2] */
 622                 slen--;
 623                 if (slen && (*SvPVX_const(check) != *s
 624                              || (slen > 1
 625                                  && memNE(SvPVX_const(check), s, slen)))) {
 626                   report_neq:
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 628                     goto fail_finish;
 629                 }
 630             }
 631             else if (*SvPVX_const(check) != *s
 632                      || ((slen = SvCUR(check)) > 1
 633                          && memNE(SvPVX_const(check), s, slen)))
 634                 goto report_neq;
 635             check_at = s;
 636             goto success_at_start;
 637           }
 638         }
 639         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 640         s = strpos;
 641         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 642         end_shift = prog->check_end_shift;
 643
 644         if (!ml_anch) {
 645             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 646                                          - (SvTAIL(check) != 0);
 647             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 648
 649             if (end_shift < eshift)
 650                 end_shift = eshift;
 651         }
 652     }
 653     else {                              /* Can match at random position */
 654         ml_anch = 0;
 655         s = strpos;
 656         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 657         end_shift = prog->check_end_shift;
 658
 659         /* end shift should be non negative here */
 660     }
 661
 662 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 663     if (end_shift < 0)
 664         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 665                    (IV)end_shift, RX_PRECOMP(prog));
 666 #endif
 667
 668   restart:
 669     /* Find a possible match in the region s..strend by looking for
 670        the "check" substring in the region corrected by start/end_shift. */
 671
 672     {
 673         I32 srch_start_shift = start_shift;
 674         I32 srch_end_shift = end_shift;
 675         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 676             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 677             srch_start_shift = strbeg - s;
 678         }
 679     DEBUG_OPTIMISE_MORE_r({
 680         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 681             (IV)prog->check_offset_min,
 682             (IV)srch_start_shift,
 683             (IV)srch_end_shift,
 684             (IV)prog->check_end_shift);
 685     });
 686
 687     if (flags & REXEC_SCREAM) {
 688         I32 p = -1;                     /* Internal iterator of scream. */
 689         I32 * const pp = data ? data->scream_pos : &p;
 690
 691         if (PL_screamfirst[BmRARE(check)] >= 0
 692             || ( BmRARE(check) == '\n'
 693                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 694                  && SvTAIL(check) ))
 695             s = screaminstr(sv, check,
 696                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 697         else
 698             goto fail_finish;
 699         /* we may be pointing at the wrong string */
 700         if (s && RXp_MATCH_COPIED(prog))
 701             s = strbeg + (s - SvPVX_const(sv));
 702         if (data)
 703             *data->scream_olds = s;
 704     }
 705     else {
 706         U8* start_point;
 707         U8* end_point;
 708         if (prog->extflags & RXf_CANY_SEEN) {
 709             start_point= (U8*)(s + srch_start_shift);
 710             end_point= (U8*)(strend - srch_end_shift);
 711         } else {
 712             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 713             end_point= HOP3(strend, -srch_end_shift, strbeg);
 714         }
 715         DEBUG_OPTIMISE_MORE_r({
 716             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 717                 (int)(end_point - start_point),
 718                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 719                 start_point);
 720         });
 721
 722         s = fbm_instr( start_point, end_point,
 723                       check, multiline ? FBMrf_MULTILINE : 0);
 724     }
 725     }
 726     /* Update the count-of-usability, remove useless subpatterns,
 727         unshift s.  */
 728
 729     DEBUG_EXECUTE_r({
 730         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 731             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 732         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 733                           (s ? "Found" : "Did not find"),
 734             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 735                 ? "anchored" : "floating"),
 736             quoted,
 737             RE_SV_TAIL(check),
 738             (s ? " at offset " : "...\n") );
 739     });
 740
 741     if (!s)
 742         goto fail_finish;
 743     /* Finish the diagnostic message */
 744     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 745
 746     /* XXX dmq: first branch is for positive lookbehind...
 747        Our check string is offset from the beginning of the pattern.
 748        So we need to do any stclass tests offset forward from that
 749        point. I think. :-(
 750      */
 751
 752
 753
 754     check_at=s;
 755
 756
 757     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 758        Start with the other substr.
 759        XXXX no SCREAM optimization yet - and a very coarse implementation
 760        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 761                 *always* match.  Probably should be marked during compile...
 762        Probably it is right to do no SCREAM here...
 763      */
 764
 765     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 766                 : (prog->float_substr && prog->anchored_substr))
 767     {
 768         /* Take into account the "other" substring. */
 769         /* XXXX May be hopelessly wrong for UTF... */
 770         if (!other_last)
 771             other_last = strpos;
 772         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 773           do_other_anchored:
 774             {
 775                 char * const last = HOP3c(s, -start_shift, strbeg);
 776                 char *last1, *last2;
 777                 char * const saved_s = s;
 778                 SV* must;
 779
 780                 t = s - prog->check_offset_max;
 781                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 782                     && (!utf8_target
 783                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 784                             && t > strpos)))
 785                     NOOP;
 786                 else
 787                     t = strpos;
 788                 t = HOP3c(t, prog->anchored_offset, strend);
 789                 if (t < other_last)     /* These positions already checked */
 790                     t = other_last;
 791                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 792                 if (last < last1)
 793                     last1 = last;
 794                 /* XXXX It is not documented what units *_offsets are in.
 795                    We assume bytes, but this is clearly wrong.
 796                    Meaning this code needs to be carefully reviewed for errors.
 797                    dmq.
 798                   */
 799
 800                 /* On end-of-str: see comment below. */
 801                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 802                 if (must == &PL_sv_undef) {
 803                     s = (char*)NULL;
 804                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 805                 }
 806                 else
 807                     s = fbm_instr(
 808                         (unsigned char*)t,
 809                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 810                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 811                         must,
 812                         multiline ? FBMrf_MULTILINE : 0
 813                     );
 814                 DEBUG_EXECUTE_r({
 815                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 816                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 817                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 818                         (s ? "Found" : "Contradicts"),
 819                         quoted, RE_SV_TAIL(must));
 820                 });
 821
 822
 823                 if (!s) {
 824                     if (last1 >= last2) {
 825                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 826                                                 ", giving up...\n"));
 827                         goto fail_finish;
 828                     }
 829                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 830                         ", trying floating at offset %ld...\n",
 831                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 832                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 833                     s = HOP3c(last, 1, strend);
 834                     goto restart;
 835                 }
 836                 else {
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 838                           (long)(s - i_strpos)));
 839                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 840                     other_last = HOP3c(s, 1, strend);
 841                     s = saved_s;
 842                     if (t == strpos)
 843                         goto try_at_start;
 844                     goto try_at_offset;
 845                 }
 846             }
 847         }
 848         else {          /* Take into account the floating substring. */
 849             char *last, *last1;
 850             char * const saved_s = s;
 851             SV* must;
 852
 853             t = HOP3c(s, -start_shift, strbeg);
 854             last1 = last =
 855                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 856             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 857                 last = HOP3c(t, prog->float_max_offset, strend);
 858             s = HOP3c(t, prog->float_min_offset, strend);
 859             if (s < other_last)
 860                 s = other_last;
 861  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 862             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 863             /* fbm_instr() takes into account exact value of end-of-str
 864                if the check is SvTAIL(ed).  Since false positives are OK,
 865                and end-of-str is not later than strend we are OK. */
 866             if (must == &PL_sv_undef) {
 867                 s = (char*)NULL;
 868                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 869             }
 870             else
 871                 s = fbm_instr((unsigned char*)s,
 872                               (unsigned char*)last + SvCUR(must)
 873                                   - (SvTAIL(must)!=0),
 874                               must, multiline ? FBMrf_MULTILINE : 0);
 875             DEBUG_EXECUTE_r({
 876                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 879                     (s ? "Found" : "Contradicts"),
 880                     quoted, RE_SV_TAIL(must));
 881             });
 882             if (!s) {
 883                 if (last1 == last) {
 884                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 885                                             ", giving up...\n"));
 886                     goto fail_finish;
 887                 }
 888                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 889                     ", trying anchored starting at offset %ld...\n",
 890                     (long)(saved_s + 1 - i_strpos)));
 891                 other_last = last;
 892                 s = HOP3c(t, 1, strend);
 893                 goto restart;
 894             }
 895             else {
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 897                       (long)(s - i_strpos)));
 898                 other_last = s; /* Fix this later. --Hugo */
 899                 s = saved_s;
 900                 if (t == strpos)
 901                     goto try_at_start;
 902                 goto try_at_offset;
 903             }
 904         }
 905     }
 906
 907
 908     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 909
 910     DEBUG_OPTIMISE_MORE_r(
 911         PerlIO_printf(Perl_debug_log,
 912             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 913             (IV)prog->check_offset_min,
 914             (IV)prog->check_offset_max,
 915             (IV)(s-strpos),
 916             (IV)(t-strpos),
 917             (IV)(t-s),
 918             (IV)(strend-strpos)
 919         )
 920     );
 921
 922     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 923         && (!utf8_target
 924             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 925                  && t > strpos)))
 926     {
 927         /* Fixed substring is found far enough so that the match
 928            cannot start at strpos. */
 929       try_at_offset:
 930         if (ml_anch && t[-1] != '\n') {
 931             /* Eventually fbm_*() should handle this, but often
 932                anchored_offset is not 0, so this check will not be wasted. */
 933             /* XXXX In the code below we prefer to look for "^" even in
 934                presence of anchored substrings.  And we search even
 935                beyond the found float position.  These pessimizations
 936                are historical artefacts only.  */
 937           find_anchor:
 938             while (t < strend - prog->minlen) {
 939                 if (*t == '\n') {
 940                     if (t < check_at - prog->check_offset_min) {
 941                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 942                             /* Since we moved from the found position,
 943                                we definitely contradict the found anchored
 944                                substr.  Due to the above check we do not
 945                                contradict "check" substr.
 946                                Thus we can arrive here only if check substr
 947                                is float.  Redo checking for "other"=="fixed".
 948                              */
 949                             strpos = t + 1;
 950                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 951                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 952                             goto do_other_anchored;
 953                         }
 954                         /* We don't contradict the found floating substring. */
 955                         /* XXXX Why not check for STCLASS? */
 956                         s = t + 1;
 957                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 958                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 959                         goto set_useful;
 960                     }
 961                     /* Position contradicts check-string */
 962                     /* XXXX probably better to look for check-string
 963                        than for "\n", so one should lower the limit for t? */
 964                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 965                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 966                     other_last = strpos = s = t + 1;
 967                     goto restart;
 968                 }
 969                 t++;
 970             }
 971             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 972                         PL_colors[0], PL_colors[1]));
 973             goto fail_finish;
 974         }
 975         else {
 976             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 977                         PL_colors[0], PL_colors[1]));
 978         }
 979         s = t;
 980       set_useful:
 981         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 982     }
 983     else {
 984         /* The found string does not prohibit matching at strpos,
 985            - no optimization of calling REx engine can be performed,
 986            unless it was an MBOL and we are not after MBOL,
 987            or a future STCLASS check will fail this. */
 988       try_at_start:
 989         /* Even in this situation we may use MBOL flag if strpos is offset
 990            wrt the start of the string. */
 991         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 992             && (strpos != strbeg) && strpos[-1] != '\n'
 993             /* May be due to an implicit anchor of m{.*foo}  */
 994             && !(prog->intflags & PREGf_IMPLICIT))
 995         {
 996             t = strpos;
 997             goto find_anchor;
 998         }
 999         DEBUG_EXECUTE_r( if (ml_anch)
1000             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1001                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1002         );
1003       success_at_start:
1004         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1005             && (utf8_target ? (
1006                 prog->check_utf8                /* Could be deleted already */
1007                 && --BmUSEFUL(prog->check_utf8) < 0
1008                 && (prog->check_utf8 == prog->float_utf8)
1009             ) : (
1010                 prog->check_substr              /* Could be deleted already */
1011                 && --BmUSEFUL(prog->check_substr) < 0
1012                 && (prog->check_substr == prog->float_substr)
1013             )))
1014         {
1015             /* If flags & SOMETHING - do not do it many times on the same match */
1016             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1017             /* XXX Does the destruction order has to change with utf8_target? */
1018             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1019             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1020             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1021             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1022             check = NULL;                       /* abort */
1023             s = strpos;
1024             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1025                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1026             if (prog->intflags & PREGf_IMPLICIT)
1027                 prog->extflags &= ~RXf_ANCH_MBOL;
1028             /* XXXX This is a remnant of the old implementation.  It
1029                     looks wasteful, since now INTUIT can use many
1030                     other heuristics. */
1031             prog->extflags &= ~RXf_USE_INTUIT;
1032             /* XXXX What other flags might need to be cleared in this branch? */
1033         }
1034         else
1035             s = strpos;
1036     }
1037
1038     /* Last resort... */
1039     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1040     /* trie stclasses are too expensive to use here, we are better off to
1041        leave it to regmatch itself */
1042     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1043         /* minlen == 0 is possible if regstclass is \b or \B,
1044            and the fixed substr is ''$.
1045            Since minlen is already taken into account, s+1 is before strend;
1046            accidentally, minlen >= 1 guaranties no false positives at s + 1
1047            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1048            regstclass does not come from lookahead...  */
1049         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1050            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1051         const U8* const str = (U8*)STRING(progi->regstclass);
1052         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1053                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1054                     : 1);
1055         char * endpos;
1056         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1057             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1058         else if (prog->float_substr || prog->float_utf8)
1059             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1060         else
1061             endpos= strend;
1062
1063         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1064                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1065
1066         t = s;
1067         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1068         if (!s) {
1069 #ifdef DEBUGGING
1070             const char *what = NULL;
1071 #endif
1072             if (endpos == strend) {
1073                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074                                 "Could not match STCLASS...\n") );
1075                 goto fail;
1076             }
1077             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078                                    "This position contradicts STCLASS...\n") );
1079             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1080                 goto fail;
1081             /* Contradict one of substrings */
1082             if (prog->anchored_substr || prog->anchored_utf8) {
1083                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1084                     DEBUG_EXECUTE_r( what = "anchored" );
1085                   hop_and_restart:
1086                     s = HOP3c(t, 1, strend);
1087                     if (s + start_shift + end_shift > strend) {
1088                         /* XXXX Should be taken into account earlier? */
1089                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1090                                                "Could not match STCLASS...\n") );
1091                         goto fail;
1092                     }
1093                     if (!check)
1094                         goto giveup;
1095                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1096                                 "Looking for %s substr starting at offset %ld...\n",
1097                                  what, (long)(s + start_shift - i_strpos)) );
1098                     goto restart;
1099                 }
1100                 /* Have both, check_string is floating */
1101                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1102                     goto retry_floating_check;
1103                 /* Recheck anchored substring, but not floating... */
1104                 s = check_at;
1105                 if (!check)
1106                     goto giveup;
1107                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                           "Looking for anchored substr starting at offset %ld...\n",
1109                           (long)(other_last - i_strpos)) );
1110                 goto do_other_anchored;
1111             }
1112             /* Another way we could have checked stclass at the
1113                current position only: */
1114             if (ml_anch) {
1115                 s = t = t + 1;
1116                 if (!check)
1117                     goto giveup;
1118                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                           "Looking for /%s^%s/m starting at offset %ld...\n",
1120                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1121                 goto try_at_offset;
1122             }
1123             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1124                 goto fail;
1125             /* Check is floating substring. */
1126           retry_floating_check:
1127             t = check_at - start_shift;
1128             DEBUG_EXECUTE_r( what = "floating" );
1129             goto hop_and_restart;
1130         }
1131         if (t != s) {
1132             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1133                         "By STCLASS: moving %ld --> %ld\n",
1134                                   (long)(t - i_strpos), (long)(s - i_strpos))
1135                    );
1136         }
1137         else {
1138             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139                                   "Does not contradict STCLASS...\n");
1140                    );
1141         }
1142     }
1143   giveup:
1144     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1145                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1146                           PL_colors[5], (long)(s - i_strpos)) );
1147     return s;
1148
1149   fail_finish:                          /* Substring not found */
1150     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1151         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1152   fail:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1154                           PL_colors[4], PL_colors[5]));
1155     return NULL;
1156 }
1157
1158 #define DECL_TRIE_TYPE(scan) \
1159     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1160                     trie_type = (scan->flags != EXACT) \
1161                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1162                               : (utf8_target ? trie_utf8 : trie_plain)
1163
1164 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1165 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1166     switch (trie_type) {                                                    \
1167     case trie_utf8_fold:                                                    \
1168         if ( foldlen>0 ) {                                                  \
1169             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1170             foldlen -= len;                                                 \
1171             uscan += len;                                                   \
1172             len=0;                                                          \
1173         } else {                                                            \
1174             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1175             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1176             foldlen -= UNISKIP( uvc );                                      \
1177             uscan = foldbuf + UNISKIP( uvc );                               \
1178         }                                                                   \
1179         break;                                                              \
1180     case trie_latin_utf8_fold:                                              \
1181         if ( foldlen>0 ) {                                                  \
1182             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1183             foldlen -= len;                                                 \
1184             uscan += len;                                                   \
1185             len=0;                                                          \
1186         } else {                                                            \
1187             len = 1;                                                        \
1188             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1189             foldlen -= UNISKIP( uvc );                                      \
1190             uscan = foldbuf + UNISKIP( uvc );                               \
1191         }                                                                   \
1192         break;                                                              \
1193     case trie_utf8:                                                         \
1194         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1195         break;                                                              \
1196     case trie_plain:                                                        \
1197         uvc = (UV)*uc;                                                      \
1198         len = 1;                                                            \
1199     }                                                                       \
1200     if (uvc < 256) {                                                        \
1201         charid = trie->charmap[ uvc ];                                      \
1202     }                                                                       \
1203     else {                                                                  \
1204         charid = 0;                                                         \
1205         if (widecharmap) {                                                  \
1206             SV** const svpp = hv_fetch(widecharmap,                         \
1207                         (char*)&uvc, sizeof(UV), 0);                        \
1208             if (svpp)                                                       \
1209                 charid = (U16)SvIV(*svpp);                                  \
1210         }                                                                   \
1211     }                                                                       \
1212 } STMT_END
1213
1214 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1215 STMT_START {                                              \
1216     while (s <= e) {                                      \
1217         if ( (CoNd)                                       \
1218              && (ln == 1 || folder(s, pat_string, ln))    \
1219              && (!reginfo || regtry(reginfo, &s)) )       \
1220             goto got_it;                                  \
1221         s++;                                              \
1222     }                                                     \
1223 } STMT_END
1224
1225 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1226 STMT_START {                                          \
1227     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1228         CoDe                                          \
1229         s += uskip;                                   \
1230     }                                                 \
1231 } STMT_END
1232
1233 #define REXEC_FBC_SCAN(CoDe)                          \
1234 STMT_START {                                          \
1235     while (s < strend) {                              \
1236         CoDe                                          \
1237         s++;                                          \
1238     }                                                 \
1239 } STMT_END
1240
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1242 REXEC_FBC_UTF8_SCAN(                                  \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
1253 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1254 REXEC_FBC_SCAN(                                       \
1255     if (CoNd) {                                       \
1256         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1257             goto got_it;                              \
1258         else                                          \
1259             tmp = doevery;                            \
1260     }                                                 \
1261     else                                              \
1262         tmp = 1;                                      \
1263 )
1264
1265 #define REXEC_FBC_TRYIT               \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1267     goto got_it
1268
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1270     if (utf8_target) {                                             \
1271         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1272     }                                                          \
1273     else {                                                     \
1274         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1275     }
1276
1277 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1278     if (utf8_target) {                                             \
1279         UtFpReLoAd;                                            \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }
1285
1286 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1287     PL_reg_flags |= RF_tainted;                                \
1288     if (utf8_target) {                                             \
1289         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1290     }                                                          \
1291     else {                                                     \
1292         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1293     }
1294
1295 #define DUMP_EXEC_POS(li,s,doutf8) \
1296     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1297
1298
1299 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1300         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1301         tmp = TEST_NON_UTF8(tmp);                                              \
1302         REXEC_FBC_UTF8_SCAN(                                                   \
1303             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1304                 tmp = !tmp;                                                    \
1305                 IF_SUCCESS;                                                    \
1306             }                                                                  \
1307             else {                                                             \
1308                 IF_FAIL;                                                       \
1309             }                                                                  \
1310         );                                                                     \
1311
1312 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1313         if (s == PL_bostr) {                                                   \
1314             tmp = '\n';                                                        \
1315         }                                                                      \
1316         else {                                                                 \
1317             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1318             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1319         }                                                                      \
1320         tmp = TeSt1_UtF8;                                                      \
1321         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1322         REXEC_FBC_UTF8_SCAN(                                                   \
1323             if (tmp == ! (TeSt2_UtF8)) { \
1324                 tmp = !tmp;                                                    \
1325                 IF_SUCCESS;                                                    \
1326             }                                                                  \
1327             else {                                                             \
1328                 IF_FAIL;                                                       \
1329             }                                                                  \
1330         );                                                                     \
1331
1332 /* The only difference between the BOUND and NBOUND cases is that
1333  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1334  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1335  * with the other one being empty */
1336 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1337     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1338
1339 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1340     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1341
1342 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1343     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1344
1345 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1347
1348
1349 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1350  * be passed in completely with the variable name being tested, which isn't
1351  * such a clean interface, but this is easier to read than it was before.  We
1352  * are looking for the boundary (or non-boundary between a word and non-word
1353  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1354  * must be different.  Find the "wordness" of the character just prior to this
1355  * one, and compare it with the wordness of this one.  If they differ, we have
1356  * a boundary.  At the beginning of the string, pretend that the previous
1357  * character was a new-line */
1358 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359     if (utf8_target) {                                                         \
1360                 UTF8_CODE \
1361     }                                                                          \
1362     else {  /* Not utf8 */                                                     \
1363         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1364         tmp = TEST_NON_UTF8(tmp);                                              \
1365         REXEC_FBC_SCAN(                                                        \
1366             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1367                 tmp = !tmp;                                                    \
1368                 IF_SUCCESS;                                                    \
1369             }                                                                  \
1370             else {                                                             \
1371                 IF_FAIL;                                                       \
1372             }                                                                  \
1373         );                                                                     \
1374     }                                                                          \
1375     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1376         goto got_it;
1377
1378 /* We know what class REx starts with.  Try to find this position... */
1379 /* if reginfo is NULL, its a dryrun */
1380 /* annoyingly all the vars in this routine have different names from their counterparts
1381    in regmatch. /grrr */
1382
1383 STATIC char *
1384 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1385     const char *strend, regmatch_info *reginfo)
1386 {
1387         dVAR;
1388         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1389         char *pat_string;   /* The pattern's exactish string */
1390         char *pat_end;      /* ptr to end char of pat_string */
1391         re_fold_t folder;       /* Function for computing non-utf8 folds */
1392         const U8 *fold_array;   /* array for folding ords < 256 */
1393         STRLEN ln;
1394         STRLEN lnc;
1395         register STRLEN uskip;
1396         U8 c1;
1397         U8 c2;
1398         char *e;
1399         register I32 tmp = 1;   /* Scratch variable? */
1400         register const bool utf8_target = PL_reg_match_utf8;
1401         UV utf8_fold_flags = 0;
1402         RXi_GET_DECL(prog,progi);
1403
1404         PERL_ARGS_ASSERT_FIND_BYCLASS;
1405
1406         /* We know what class it must start with. */
1407         switch (OP(c)) {
1408         case ANYOFV:
1409         case ANYOF:
1410             if (utf8_target || OP(c) == ANYOFV) {
1411                 STRLEN inclasslen = strend - s;
1412                 REXEC_FBC_UTF8_CLASS_SCAN(
1413                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1414             }
1415             else {
1416                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1417             }
1418             break;
1419         case CANY:
1420             REXEC_FBC_SCAN(
1421                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1422                     goto got_it;
1423                 else
1424                     tmp = doevery;
1425             );
1426             break;
1427
1428         case EXACTFA:
1429             if (UTF_PATTERN || utf8_target) {
1430                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1431                 goto do_exactf_utf8;
1432             }
1433             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1434             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1435             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1436
1437         case EXACTFU:
1438             if (UTF_PATTERN || utf8_target) {
1439                 utf8_fold_flags = 0;
1440                 goto do_exactf_utf8;
1441             }
1442             fold_array = PL_fold_latin1;
1443             folder = foldEQ_latin1;
1444             /* XXX This uses the full utf8 fold because if the pattern contains
1445              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1446              * There could be a new node type, say EXACTFU_SS, which is
1447              * generated by regcomp only if there is an 'ss', and then every
1448              * other case could goto do_exactf_non_utf8;*/
1449             goto do_exactf_utf8;
1450
1451         case EXACTF:
1452             if (UTF_PATTERN || utf8_target) {
1453                 utf8_fold_flags = 0;
1454                 goto do_exactf_utf8;
1455             }
1456             fold_array = PL_fold;
1457             folder = foldEQ;
1458             goto do_exactf_non_utf8;
1459
1460         case EXACTFL:
1461             if (UTF_PATTERN || utf8_target) {
1462                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1463                 goto do_exactf_utf8;
1464             }
1465             fold_array = PL_fold_locale;
1466             folder = foldEQ_locale;
1467
1468             /* FALL THROUGH */
1469
1470         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1471
1472             /* The idea in the non-utf8 EXACTF* cases is to first find the
1473              * first character of the EXACTF* node and then, if necessary,
1474              * case-insensitively compare the full text of the node.  c1 is the
1475              * first character.  c2 is its fold.  This logic will not work for
1476              * Unicode semantics and the german sharp ss, which hence should
1477              * not be compiled into a node that gets here. */
1478             pat_string = STRING(c);
1479             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1480
1481             e = HOP3c(strend, -((I32)ln), s);
1482
1483             if (!reginfo && e < s) {
1484                 e = s;                  /* Due to minlen logic of intuit() */
1485             }
1486
1487             c1 = *pat_string;
1488             c2 = fold_array[c1];
1489             if (c1 == c2) { /* If char and fold are the same */
1490                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1491             }
1492             else {
1493                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1494             }
1495             break;
1496
1497         do_exactf_utf8:
1498
1499             /* If one of the operands is in utf8, we can't use the simpler
1500              * folding above, due to the fact that many different characters
1501              * can have the same fold, or portion of a fold, or different-
1502              * length fold */
1503             pat_string = STRING(c);
1504             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1505             pat_end = pat_string + ln;
1506             lnc = (UTF_PATTERN) /* length to match in characters */
1507                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1508                     : ln;
1509
1510             e = HOP3c(strend, -((I32)lnc), s);
1511
1512             if (!reginfo && e < s) {
1513                 e = s;                  /* Due to minlen logic of intuit() */
1514             }
1515
1516             while (s <= e) {
1517                 char *my_strend= (char *)strend;
1518                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1519                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1520                     && (!reginfo || regtry(reginfo, &s)) )
1521                 {
1522                     goto got_it;
1523                 }
1524                 s += UTF8SKIP(s);
1525             }
1526             break;
1527         case BOUNDL:
1528             PL_reg_flags |= RF_tainted;
1529             FBC_BOUND(isALNUM_LC,
1530                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1531                       isALNUM_LC_utf8((U8*)s));
1532             break;
1533         case NBOUNDL:
1534             PL_reg_flags |= RF_tainted;
1535             FBC_NBOUND(isALNUM_LC,
1536                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1537                        isALNUM_LC_utf8((U8*)s));
1538             break;
1539         case BOUND:
1540             FBC_BOUND(isWORDCHAR,
1541                       isALNUM_uni(tmp),
1542                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1543             break;
1544         case BOUNDA:
1545             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1546                              isWORDCHAR_A(tmp),
1547                              isWORDCHAR_A((U8*)s));
1548             break;
1549         case NBOUND:
1550             FBC_NBOUND(isWORDCHAR,
1551                        isALNUM_uni(tmp),
1552                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1553             break;
1554         case NBOUNDA:
1555             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1556                               isWORDCHAR_A(tmp),
1557                               isWORDCHAR_A((U8*)s));
1558             break;
1559         case BOUNDU:
1560             FBC_BOUND(isWORDCHAR_L1,
1561                       isALNUM_uni(tmp),
1562                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1563             break;
1564         case NBOUNDU:
1565             FBC_NBOUND(isWORDCHAR_L1,
1566                        isALNUM_uni(tmp),
1567                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1568             break;
1569         case ALNUML:
1570             REXEC_FBC_CSCAN_TAINT(
1571                 isALNUM_LC_utf8((U8*)s),
1572                 isALNUM_LC(*s)
1573             );
1574             break;
1575         case ALNUMU:
1576             REXEC_FBC_CSCAN_PRELOAD(
1577                 LOAD_UTF8_CHARCLASS_ALNUM(),
1578                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1579                 isWORDCHAR_L1((U8) *s)
1580             );
1581             break;
1582         case ALNUM:
1583             REXEC_FBC_CSCAN_PRELOAD(
1584                 LOAD_UTF8_CHARCLASS_ALNUM(),
1585                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1586                 isWORDCHAR((U8) *s)
1587             );
1588             break;
1589         case ALNUMA:
1590             /* Don't need to worry about utf8, as it can match only a single
1591              * byte invariant character */
1592             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1593             break;
1594         case NALNUMU:
1595             REXEC_FBC_CSCAN_PRELOAD(
1596                 LOAD_UTF8_CHARCLASS_ALNUM(),
1597                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1598                 ! isWORDCHAR_L1((U8) *s)
1599             );
1600             break;
1601         case NALNUM:
1602             REXEC_FBC_CSCAN_PRELOAD(
1603                 LOAD_UTF8_CHARCLASS_ALNUM(),
1604                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1605                 ! isALNUM(*s)
1606             );
1607             break;
1608         case NALNUMA:
1609             REXEC_FBC_CSCAN(
1610                 !isWORDCHAR_A(*s),
1611                 !isWORDCHAR_A(*s)
1612             );
1613             break;
1614         case NALNUML:
1615             REXEC_FBC_CSCAN_TAINT(
1616                 !isALNUM_LC_utf8((U8*)s),
1617                 !isALNUM_LC(*s)
1618             );
1619             break;
1620         case SPACEU:
1621             REXEC_FBC_CSCAN_PRELOAD(
1622                 LOAD_UTF8_CHARCLASS_SPACE(),
1623                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1624                 isSPACE_L1((U8) *s)
1625             );
1626             break;
1627         case SPACE:
1628             REXEC_FBC_CSCAN_PRELOAD(
1629                 LOAD_UTF8_CHARCLASS_SPACE(),
1630                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1631                 isSPACE((U8) *s)
1632             );
1633             break;
1634         case SPACEA:
1635             /* Don't need to worry about utf8, as it can match only a single
1636              * byte invariant character */
1637             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1638             break;
1639         case SPACEL:
1640             REXEC_FBC_CSCAN_TAINT(
1641                 isSPACE_LC_utf8((U8*)s),
1642                 isSPACE_LC(*s)
1643             );
1644             break;
1645         case NSPACEU:
1646             REXEC_FBC_CSCAN_PRELOAD(
1647                 LOAD_UTF8_CHARCLASS_SPACE(),
1648                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1649                 ! isSPACE_L1((U8) *s)
1650             );
1651             break;
1652         case NSPACE:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_SPACE(),
1655                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1656                 ! isSPACE((U8) *s)
1657             );
1658             break;
1659         case NSPACEA:
1660             REXEC_FBC_CSCAN(
1661                 !isSPACE_A(*s),
1662                 !isSPACE_A(*s)
1663             );
1664             break;
1665         case NSPACEL:
1666             REXEC_FBC_CSCAN_TAINT(
1667                 !isSPACE_LC_utf8((U8*)s),
1668                 !isSPACE_LC(*s)
1669             );
1670             break;
1671         case DIGIT:
1672             REXEC_FBC_CSCAN_PRELOAD(
1673                 LOAD_UTF8_CHARCLASS_DIGIT(),
1674                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1675                 isDIGIT(*s)
1676             );
1677             break;
1678         case DIGITA:
1679             /* Don't need to worry about utf8, as it can match only a single
1680              * byte invariant character */
1681             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1682             break;
1683         case DIGITL:
1684             REXEC_FBC_CSCAN_TAINT(
1685                 isDIGIT_LC_utf8((U8*)s),
1686                 isDIGIT_LC(*s)
1687             );
1688             break;
1689         case NDIGIT:
1690             REXEC_FBC_CSCAN_PRELOAD(
1691                 LOAD_UTF8_CHARCLASS_DIGIT(),
1692                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1693                 !isDIGIT(*s)
1694             );
1695             break;
1696         case NDIGITA:
1697             REXEC_FBC_CSCAN(
1698                 !isDIGIT_A(*s),
1699                 !isDIGIT_A(*s)
1700             );
1701             break;
1702         case NDIGITL:
1703             REXEC_FBC_CSCAN_TAINT(
1704                 !isDIGIT_LC_utf8((U8*)s),
1705                 !isDIGIT_LC(*s)
1706             );
1707             break;
1708         case LNBREAK:
1709             REXEC_FBC_CSCAN(
1710                 is_LNBREAK_utf8(s),
1711                 is_LNBREAK_latin1(s)
1712             );
1713             break;
1714         case VERTWS:
1715             REXEC_FBC_CSCAN(
1716                 is_VERTWS_utf8(s),
1717                 is_VERTWS_latin1(s)
1718             );
1719             break;
1720         case NVERTWS:
1721             REXEC_FBC_CSCAN(
1722                 !is_VERTWS_utf8(s),
1723                 !is_VERTWS_latin1(s)
1724             );
1725             break;
1726         case HORIZWS:
1727             REXEC_FBC_CSCAN(
1728                 is_HORIZWS_utf8(s),
1729                 is_HORIZWS_latin1(s)
1730             );
1731             break;
1732         case NHORIZWS:
1733             REXEC_FBC_CSCAN(
1734                 !is_HORIZWS_utf8(s),
1735                 !is_HORIZWS_latin1(s)
1736             );
1737             break;
1738         case AHOCORASICKC:
1739         case AHOCORASICK:
1740             {
1741                 DECL_TRIE_TYPE(c);
1742                 /* what trie are we using right now */
1743                 reg_ac_data *aho
1744                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1745                 reg_trie_data *trie
1746                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1747                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1748
1749                 const char *last_start = strend - trie->minlen;
1750 #ifdef DEBUGGING
1751                 const char *real_start = s;
1752 #endif
1753                 STRLEN maxlen = trie->maxlen;
1754                 SV *sv_points;
1755                 U8 **points; /* map of where we were in the input string
1756                                 when reading a given char. For ASCII this
1757                                 is unnecessary overhead as the relationship
1758                                 is always 1:1, but for Unicode, especially
1759                                 case folded Unicode this is not true. */
1760                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1761                 U8 *bitmap=NULL;
1762
1763
1764                 GET_RE_DEBUG_FLAGS_DECL;
1765
1766                 /* We can't just allocate points here. We need to wrap it in
1767                  * an SV so it gets freed properly if there is a croak while
1768                  * running the match */
1769                 ENTER;
1770                 SAVETMPS;
1771                 sv_points=newSV(maxlen * sizeof(U8 *));
1772                 SvCUR_set(sv_points,
1773                     maxlen * sizeof(U8 *));
1774                 SvPOK_on(sv_points);
1775                 sv_2mortal(sv_points);
1776                 points=(U8**)SvPV_nolen(sv_points );
1777                 if ( trie_type != trie_utf8_fold
1778                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1779                 {
1780                     if (trie->bitmap)
1781                         bitmap=(U8*)trie->bitmap;
1782                     else
1783                         bitmap=(U8*)ANYOF_BITMAP(c);
1784                 }
1785                 /* this is the Aho-Corasick algorithm modified a touch
1786                    to include special handling for long "unknown char"
1787                    sequences. The basic idea being that we use AC as long
1788                    as we are dealing with a possible matching char, when
1789                    we encounter an unknown char (and we have not encountered
1790                    an accepting state) we scan forward until we find a legal
1791                    starting char.
1792                    AC matching is basically that of trie matching, except
1793                    that when we encounter a failing transition, we fall back
1794                    to the current states "fail state", and try the current char
1795                    again, a process we repeat until we reach the root state,
1796                    state 1, or a legal transition. If we fail on the root state
1797                    then we can either terminate if we have reached an accepting
1798                    state previously, or restart the entire process from the beginning
1799                    if we have not.
1800
1801                  */
1802                 while (s <= last_start) {
1803                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1804                     U8 *uc = (U8*)s;
1805                     U16 charid = 0;
1806                     U32 base = 1;
1807                     U32 state = 1;
1808                     UV uvc = 0;
1809                     STRLEN len = 0;
1810                     STRLEN foldlen = 0;
1811                     U8 *uscan = (U8*)NULL;
1812                     U8 *leftmost = NULL;
1813 #ifdef DEBUGGING
1814                     U32 accepted_word= 0;
1815 #endif
1816                     U32 pointpos = 0;
1817
1818                     while ( state && uc <= (U8*)strend ) {
1819                         int failed=0;
1820                         U32 word = aho->states[ state ].wordnum;
1821
1822                         if( state==1 ) {
1823                             if ( bitmap ) {
1824                                 DEBUG_TRIE_EXECUTE_r(
1825                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1826                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1827                                             (char *)uc, utf8_target );
1828                                         PerlIO_printf( Perl_debug_log,
1829                                             " Scanning for legal start char...\n");
1830                                     }
1831                                 );
1832                                 if (utf8_target) {
1833                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1834                                         uc += UTF8SKIP(uc);
1835                                     }
1836                                 } else {
1837                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1838                                         uc++;
1839                                     }
1840                                 }
1841                                 s= (char *)uc;
1842                             }
1843                             if (uc >(U8*)last_start) break;
1844                         }
1845
1846                         if ( word ) {
1847                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1848                             if (!leftmost || lpos < leftmost) {
1849                                 DEBUG_r(accepted_word=word);
1850                                 leftmost= lpos;
1851                             }
1852                             if (base==0) break;
1853
1854                         }
1855                         points[pointpos++ % maxlen]= uc;
1856                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1857                                              uscan, len, uvc, charid, foldlen,
1858                                              foldbuf, uniflags);
1859                         DEBUG_TRIE_EXECUTE_r({
1860                             dump_exec_pos( (char *)uc, c, strend, real_start,
1861                                 s,   utf8_target );
1862                             PerlIO_printf(Perl_debug_log,
1863                                 " Charid:%3u CP:%4"UVxf" ",
1864                                  charid, uvc);
1865                         });
1866
1867                         do {
1868 #ifdef DEBUGGING
1869                             word = aho->states[ state ].wordnum;
1870 #endif
1871                             base = aho->states[ state ].trans.base;
1872
1873                             DEBUG_TRIE_EXECUTE_r({
1874                                 if (failed)
1875                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1876                                         s,   utf8_target );
1877                                 PerlIO_printf( Perl_debug_log,
1878                                     "%sState: %4"UVxf", word=%"UVxf,
1879                                     failed ? " Fail transition to " : "",
1880                                     (UV)state, (UV)word);
1881                             });
1882                             if ( base ) {
1883                                 U32 tmp;
1884                                 I32 offset;
1885                                 if (charid &&
1886                                      ( ((offset = base + charid
1887                                         - 1 - trie->uniquecharcount)) >= 0)
1888                                      && ((U32)offset < trie->lasttrans)
1889                                      && trie->trans[offset].check == state
1890                                      && (tmp=trie->trans[offset].next))
1891                                 {
1892                                     DEBUG_TRIE_EXECUTE_r(
1893                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1894                                     state = tmp;
1895                                     break;
1896                                 }
1897                                 else {
1898                                     DEBUG_TRIE_EXECUTE_r(
1899                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1900                                     failed = 1;
1901                                     state = aho->fail[state];
1902                                 }
1903                             }
1904                             else {
1905                                 /* we must be accepting here */
1906                                 DEBUG_TRIE_EXECUTE_r(
1907                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1908                                 failed = 1;
1909                                 break;
1910                             }
1911                         } while(state);
1912                         uc += len;
1913                         if (failed) {
1914                             if (leftmost)
1915                                 break;
1916                             if (!state) state = 1;
1917                         }
1918                     }
1919                     if ( aho->states[ state ].wordnum ) {
1920                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1921                         if (!leftmost || lpos < leftmost) {
1922                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1923                             leftmost = lpos;
1924                         }
1925                     }
1926                     if (leftmost) {
1927                         s = (char*)leftmost;
1928                         DEBUG_TRIE_EXECUTE_r({
1929                             PerlIO_printf(
1930                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1931                                 (UV)accepted_word, (IV)(s - real_start)
1932                             );
1933                         });
1934                         if (!reginfo || regtry(reginfo, &s)) {
1935                             FREETMPS;
1936                             LEAVE;
1937                             goto got_it;
1938                         }
1939                         s = HOPc(s,1);
1940                         DEBUG_TRIE_EXECUTE_r({
1941                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1942                         });
1943                     } else {
1944                         DEBUG_TRIE_EXECUTE_r(
1945                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1946                         break;
1947                     }
1948                 }
1949                 FREETMPS;
1950                 LEAVE;
1951             }
1952             break;
1953         default:
1954             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1955             break;
1956         }
1957         return 0;
1958       got_it:
1959         return s;
1960 }
1961
1962
1963 /*
1964  - regexec_flags - match a regexp against a string
1965  */
1966 I32
1967 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1968               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1969 /* strend: pointer to null at end of string */
1970 /* strbeg: real beginning of string */
1971 /* minend: end of match must be >=minend after stringarg. */
1972 /* data: May be used for some additional optimizations.
1973          Currently its only used, with a U32 cast, for transmitting
1974          the ganch offset when doing a /g match. This will change */
1975 /* nosave: For optimizations. */
1976 {
1977     dVAR;
1978     struct regexp *const prog = (struct regexp *)SvANY(rx);
1979     /*register*/ char *s;
1980     register regnode *c;
1981     /*register*/ char *startpos = stringarg;
1982     I32 minlen;         /* must match at least this many chars */
1983     I32 dontbother = 0; /* how many characters not to try at end */
1984     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1985     I32 scream_pos = -1;                /* Internal iterator of scream. */
1986     char *scream_olds = NULL;
1987     const bool utf8_target = cBOOL(DO_UTF8(sv));
1988     I32 multiline;
1989     RXi_GET_DECL(prog,progi);
1990     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1991     regexp_paren_pair *swap = NULL;
1992     GET_RE_DEBUG_FLAGS_DECL;
1993
1994     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1995     PERL_UNUSED_ARG(data);
1996
1997     /* Be paranoid... */
1998     if (prog == NULL || startpos == NULL) {
1999         Perl_croak(aTHX_ "NULL regexp parameter");
2000         return 0;
2001     }
2002
2003     multiline = prog->extflags & RXf_PMf_MULTILINE;
2004     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2005
2006     RX_MATCH_UTF8_set(rx, utf8_target);
2007     DEBUG_EXECUTE_r(
2008         debug_start_match(rx, utf8_target, startpos, strend,
2009         "Matching");
2010     );
2011
2012     minlen = prog->minlen;
2013
2014     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2015         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2016                               "String too short [regexec_flags]...\n"));
2017         goto phooey;
2018     }
2019
2020
2021     /* Check validity of program. */
2022     if (UCHARAT(progi->program) != REG_MAGIC) {
2023         Perl_croak(aTHX_ "corrupted regexp program");
2024     }
2025
2026     PL_reg_flags = 0;
2027     PL_reg_eval_set = 0;
2028     PL_reg_maxiter = 0;
2029
2030     if (RX_UTF8(rx))
2031         PL_reg_flags |= RF_utf8;
2032
2033     /* Mark beginning of line for ^ and lookbehind. */
2034     reginfo.bol = startpos; /* XXX not used ??? */
2035     PL_bostr  = strbeg;
2036     reginfo.sv = sv;
2037
2038     /* Mark end of line for $ (and such) */
2039     PL_regeol = strend;
2040
2041     /* see how far we have to get to not match where we matched before */
2042     reginfo.till = startpos+minend;
2043
2044     /* If there is a "must appear" string, look for it. */
2045     s = startpos;
2046
2047     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2048         MAGIC *mg;
2049         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2050             reginfo.ganch = startpos + prog->gofs;
2051             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2052               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2053         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2054                   && SvMAGIC(sv)
2055                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2056                   && mg->mg_len >= 0) {
2057             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2058             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2059                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2060
2061             if (prog->extflags & RXf_ANCH_GPOS) {
2062                 if (s > reginfo.ganch)
2063                     goto phooey;
2064                 s = reginfo.ganch - prog->gofs;
2065                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2066                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2067                 if (s < strbeg)
2068                     goto phooey;
2069             }
2070         }
2071         else if (data) {
2072             reginfo.ganch = strbeg + PTR2UV(data);
2073             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2074                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2075
2076         } else {                                /* pos() not defined */
2077             reginfo.ganch = strbeg;
2078             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2079                  "GPOS: reginfo.ganch = strbeg\n"));
2080         }
2081     }
2082     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2083         /* We have to be careful. If the previous successful match
2084            was from this regex we don't want a subsequent partially
2085            successful match to clobber the old results.
2086            So when we detect this possibility we add a swap buffer
2087            to the re, and switch the buffer each match. If we fail
2088            we switch it back, otherwise we leave it swapped.
2089         */
2090         swap = prog->offs;
2091         /* do we need a save destructor here for eval dies? */
2092         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2093     }
2094     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2095         re_scream_pos_data d;
2096
2097         d.scream_olds = &scream_olds;
2098         d.scream_pos = &scream_pos;
2099         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2100         if (!s) {
2101             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2102             goto phooey;        /* not present */
2103         }
2104     }
2105
2106
2107
2108     /* Simplest case:  anchored match need be tried only once. */
2109     /*  [unless only anchor is BOL and multiline is set] */
2110     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2111         if (s == startpos && regtry(&reginfo, &startpos))
2112             goto got_it;
2113         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2114                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2115         {
2116             char *end;
2117
2118             if (minlen)
2119                 dontbother = minlen - 1;
2120             end = HOP3c(strend, -dontbother, strbeg) - 1;
2121             /* for multiline we only have to try after newlines */
2122             if (prog->check_substr || prog->check_utf8) {
2123                 /* because of the goto we can not easily reuse the macros for bifurcating the
2124                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2125                 if (utf8_target) {
2126                     if (s == startpos)
2127                         goto after_try_utf8;
2128                     while (1) {
2129                         if (regtry(&reginfo, &s)) {
2130                             goto got_it;
2131                         }
2132                       after_try_utf8:
2133                         if (s > end) {
2134                             goto phooey;
2135                         }
2136                         if (prog->extflags & RXf_USE_INTUIT) {
2137                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2138                             if (!s) {
2139                                 goto phooey;
2140                             }
2141                         }
2142                         else {
2143                             s += UTF8SKIP(s);
2144                         }
2145                     }
2146                 } /* end search for check string in unicode */
2147                 else {
2148                     if (s == startpos) {
2149                         goto after_try_latin;
2150                     }
2151                     while (1) {
2152                         if (regtry(&reginfo, &s)) {
2153                             goto got_it;
2154                         }
2155                       after_try_latin:
2156                         if (s > end) {
2157                             goto phooey;
2158                         }
2159                         if (prog->extflags & RXf_USE_INTUIT) {
2160                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2161                             if (!s) {
2162                                 goto phooey;
2163                             }
2164                         }
2165                         else {
2166                             s++;
2167                         }
2168                     }
2169                 } /* end search for check string in latin*/
2170             } /* end search for check string */
2171             else { /* search for newline */
2172                 if (s > startpos) {
2173                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2174                     s--;
2175                 }
2176                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2177                 while (s < end) {
2178                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2179                         if (regtry(&reginfo, &s))
2180                             goto got_it;
2181                     }
2182                 }
2183             } /* end search for newline */
2184         } /* end anchored/multiline check string search */
2185         goto phooey;
2186     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2187     {
2188         /* the warning about reginfo.ganch being used without initialization
2189            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2190            and we only enter this block when the same bit is set. */
2191         char *tmp_s = reginfo.ganch - prog->gofs;
2192
2193         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2194             goto got_it;
2195         goto phooey;
2196     }
2197
2198     /* Messy cases:  unanchored match. */
2199     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2200         /* we have /x+whatever/ */
2201         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2202         char ch;
2203 #ifdef DEBUGGING
2204         int did_match = 0;
2205 #endif
2206         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2207             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2208         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2209
2210         if (utf8_target) {
2211             REXEC_FBC_SCAN(
2212                 if (*s == ch) {
2213                     DEBUG_EXECUTE_r( did_match = 1 );
2214                     if (regtry(&reginfo, &s)) goto got_it;
2215                     s += UTF8SKIP(s);
2216                     while (s < strend && *s == ch)
2217                         s += UTF8SKIP(s);
2218                 }
2219             );
2220         }
2221         else {
2222             REXEC_FBC_SCAN(
2223                 if (*s == ch) {
2224                     DEBUG_EXECUTE_r( did_match = 1 );
2225                     if (regtry(&reginfo, &s)) goto got_it;
2226                     s++;
2227                     while (s < strend && *s == ch)
2228                         s++;
2229                 }
2230             );
2231         }
2232         DEBUG_EXECUTE_r(if (!did_match)
2233                 PerlIO_printf(Perl_debug_log,
2234                                   "Did not find anchored character...\n")
2235                );
2236     }
2237     else if (prog->anchored_substr != NULL
2238               || prog->anchored_utf8 != NULL
2239               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2240                   && prog->float_max_offset < strend - s)) {
2241         SV *must;
2242         I32 back_max;
2243         I32 back_min;
2244         char *last;
2245         char *last1;            /* Last position checked before */
2246 #ifdef DEBUGGING
2247         int did_match = 0;
2248 #endif
2249         if (prog->anchored_substr || prog->anchored_utf8) {
2250             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2251                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2252             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2253             back_max = back_min = prog->anchored_offset;
2254         } else {
2255             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2256                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2257             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2258             back_max = prog->float_max_offset;
2259             back_min = prog->float_min_offset;
2260         }
2261
2262
2263         if (must == &PL_sv_undef)
2264             /* could not downgrade utf8 check substring, so must fail */
2265             goto phooey;
2266
2267         if (back_min<0) {
2268             last = strend;
2269         } else {
2270             last = HOP3c(strend,        /* Cannot start after this */
2271                   -(I32)(CHR_SVLEN(must)
2272                          - (SvTAIL(must) != 0) + back_min), strbeg);
2273         }
2274         if (s > PL_bostr)
2275             last1 = HOPc(s, -1);
2276         else
2277             last1 = s - 1;      /* bogus */
2278
2279         /* XXXX check_substr already used to find "s", can optimize if
2280            check_substr==must. */
2281         scream_pos = -1;
2282         dontbother = end_shift;
2283         strend = HOPc(strend, -dontbother);
2284         while ( (s <= last) &&
2285                 ((flags & REXEC_SCREAM)
2286                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2287                                     end_shift, &scream_pos, 0))
2288                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2289                                   (unsigned char*)strend, must,
2290                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2291             /* we may be pointing at the wrong string */
2292             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2293                 s = strbeg + (s - SvPVX_const(sv));
2294             DEBUG_EXECUTE_r( did_match = 1 );
2295             if (HOPc(s, -back_max) > last1) {
2296                 last1 = HOPc(s, -back_min);
2297                 s = HOPc(s, -back_max);
2298             }
2299             else {
2300                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2301
2302                 last1 = HOPc(s, -back_min);
2303                 s = t;
2304             }
2305             if (utf8_target) {
2306                 while (s <= last1) {
2307                     if (regtry(&reginfo, &s))
2308                         goto got_it;
2309                     s += UTF8SKIP(s);
2310                 }
2311             }
2312             else {
2313                 while (s <= last1) {
2314                     if (regtry(&reginfo, &s))
2315                         goto got_it;
2316                     s++;
2317                 }
2318             }
2319         }
2320         DEBUG_EXECUTE_r(if (!did_match) {
2321             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2322                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2323             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2324                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2325                                ? "anchored" : "floating"),
2326                 quoted, RE_SV_TAIL(must));
2327         });
2328         goto phooey;
2329     }
2330     else if ( (c = progi->regstclass) ) {
2331         if (minlen) {
2332             const OPCODE op = OP(progi->regstclass);
2333             /* don't bother with what can't match */
2334             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2335                 strend = HOPc(strend, -(minlen - 1));
2336         }
2337         DEBUG_EXECUTE_r({
2338             SV * const prop = sv_newmortal();
2339             regprop(prog, prop, c);
2340             {
2341                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2342                     s,strend-s,60);
2343                 PerlIO_printf(Perl_debug_log,
2344                     "Matching stclass %.*s against %s (%d bytes)\n",
2345                     (int)SvCUR(prop), SvPVX_const(prop),
2346                      quoted, (int)(strend - s));
2347             }
2348         });
2349         if (find_byclass(prog, c, s, strend, &reginfo))
2350             goto got_it;
2351         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2352     }
2353     else {
2354         dontbother = 0;
2355         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2356             /* Trim the end. */
2357             char *last;
2358             SV* float_real;
2359
2360             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2361                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2362             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2363
2364             if (flags & REXEC_SCREAM) {
2365                 last = screaminstr(sv, float_real, s - strbeg,
2366                                    end_shift, &scream_pos, 1); /* last one */
2367                 if (!last)
2368                     last = scream_olds; /* Only one occurrence. */
2369                 /* we may be pointing at the wrong string */
2370                 else if (RXp_MATCH_COPIED(prog))
2371                     s = strbeg + (s - SvPVX_const(sv));
2372             }
2373             else {
2374                 STRLEN len;
2375                 const char * const little = SvPV_const(float_real, len);
2376
2377                 if (SvTAIL(float_real)) {
2378                     if (memEQ(strend - len + 1, little, len - 1))
2379                         last = strend - len + 1;
2380                     else if (!multiline)
2381                         last = memEQ(strend - len, little, len)
2382                             ? strend - len : NULL;
2383                     else
2384                         goto find_last;
2385                 } else {
2386                   find_last:
2387                     if (len)
2388                         last = rninstr(s, strend, little, little + len);
2389                     else
2390                         last = strend;  /* matching "$" */
2391                 }
2392             }
2393             if (last == NULL) {
2394                 DEBUG_EXECUTE_r(
2395                     PerlIO_printf(Perl_debug_log,
2396                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2397                         PL_colors[4], PL_colors[5]));
2398                 goto phooey; /* Should not happen! */
2399             }
2400             dontbother = strend - last + prog->float_min_offset;
2401         }
2402         if (minlen && (dontbother < minlen))
2403             dontbother = minlen - 1;
2404         strend -= dontbother;              /* this one's always in bytes! */
2405         /* We don't know much -- general case. */
2406         if (utf8_target) {
2407             for (;;) {
2408                 if (regtry(&reginfo, &s))
2409                     goto got_it;
2410                 if (s >= strend)
2411                     break;
2412                 s += UTF8SKIP(s);
2413             };
2414         }
2415         else {
2416             do {
2417                 if (regtry(&reginfo, &s))
2418                     goto got_it;
2419             } while (s++ < strend);
2420         }
2421     }
2422
2423     /* Failure. */
2424     goto phooey;
2425
2426 got_it:
2427     Safefree(swap);
2428     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2429
2430     if (PL_reg_eval_set)
2431         restore_pos(aTHX_ prog);
2432     if (RXp_PAREN_NAMES(prog))
2433         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2434
2435     /* make sure $`, $&, $', and $digit will work later */
2436     if ( !(flags & REXEC_NOT_FIRST) ) {
2437         RX_MATCH_COPY_FREE(rx);
2438         if (flags & REXEC_COPY_STR) {
2439             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2440 #ifdef PERL_OLD_COPY_ON_WRITE
2441             if ((SvIsCOW(sv)
2442                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2443                 if (DEBUG_C_TEST) {
2444                     PerlIO_printf(Perl_debug_log,
2445                                   "Copy on write: regexp capture, type %d\n",
2446                                   (int) SvTYPE(sv));
2447                 }
2448                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2449                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2450                 assert (SvPOKp(prog->saved_copy));
2451             } else
2452 #endif
2453             {
2454                 RX_MATCH_COPIED_on(rx);
2455                 s = savepvn(strbeg, i);
2456                 prog->subbeg = s;
2457             }
2458             prog->sublen = i;
2459         }
2460         else {
2461             prog->subbeg = strbeg;
2462             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2463         }
2464     }
2465
2466     return 1;
2467
2468 phooey:
2469     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2470                           PL_colors[4], PL_colors[5]));
2471     if (PL_reg_eval_set)
2472         restore_pos(aTHX_ prog);
2473     if (swap) {
2474         /* we failed :-( roll it back */
2475         Safefree(prog->offs);
2476         prog->offs = swap;
2477     }
2478
2479     return 0;
2480 }
2481
2482
2483 /*
2484  - regtry - try match at specific point
2485  */
2486 STATIC I32                      /* 0 failure, 1 success */
2487 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2488 {
2489     dVAR;
2490     CHECKPOINT lastcp;
2491     REGEXP *const rx = reginfo->prog;
2492     regexp *const prog = (struct regexp *)SvANY(rx);
2493     RXi_GET_DECL(prog,progi);
2494     GET_RE_DEBUG_FLAGS_DECL;
2495
2496     PERL_ARGS_ASSERT_REGTRY;
2497
2498     reginfo->cutpoint=NULL;
2499
2500     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2501         MAGIC *mg;
2502
2503         PL_reg_eval_set = RS_init;
2504         DEBUG_EXECUTE_r(DEBUG_s(
2505             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2506                           (IV)(PL_stack_sp - PL_stack_base));
2507             ));
2508         SAVESTACK_CXPOS();
2509         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2510         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2511         SAVETMPS;
2512         /* Apparently this is not needed, judging by wantarray. */
2513         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2514            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2515
2516         if (reginfo->sv) {
2517             /* Make $_ available to executed code. */
2518             if (reginfo->sv != DEFSV) {
2519                 SAVE_DEFSV;
2520                 DEFSV_set(reginfo->sv);
2521             }
2522
2523             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2524                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2525                 /* prepare for quick setting of pos */
2526 #ifdef PERL_OLD_COPY_ON_WRITE
2527                 if (SvIsCOW(reginfo->sv))
2528                     sv_force_normal_flags(reginfo->sv, 0);
2529 #endif
2530                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2531                                  &PL_vtbl_mglob, NULL, 0);
2532                 mg->mg_len = -1;
2533             }
2534             PL_reg_magic    = mg;
2535             PL_reg_oldpos   = mg->mg_len;
2536             SAVEDESTRUCTOR_X(restore_pos, prog);
2537         }
2538         if (!PL_reg_curpm) {
2539             Newxz(PL_reg_curpm, 1, PMOP);
2540 #ifdef USE_ITHREADS
2541             {
2542                 SV* const repointer = &PL_sv_undef;
2543                 /* this regexp is also owned by the new PL_reg_curpm, which
2544                    will try to free it.  */
2545                 av_push(PL_regex_padav, repointer);
2546                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2547                 PL_regex_pad = AvARRAY(PL_regex_padav);
2548             }
2549 #endif
2550         }
2551 #ifdef USE_ITHREADS
2552         /* It seems that non-ithreads works both with and without this code.
2553            So for efficiency reasons it seems best not to have the code
2554            compiled when it is not needed.  */
2555         /* This is safe against NULLs: */
2556         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2557         /* PM_reg_curpm owns a reference to this regexp.  */
2558         (void)ReREFCNT_inc(rx);
2559 #endif
2560         PM_SETRE(PL_reg_curpm, rx);
2561         PL_reg_oldcurpm = PL_curpm;
2562         PL_curpm = PL_reg_curpm;
2563         if (RXp_MATCH_COPIED(prog)) {
2564             /*  Here is a serious problem: we cannot rewrite subbeg,
2565                 since it may be needed if this match fails.  Thus
2566                 $` inside (?{}) could fail... */
2567             PL_reg_oldsaved = prog->subbeg;
2568             PL_reg_oldsavedlen = prog->sublen;
2569 #ifdef PERL_OLD_COPY_ON_WRITE
2570             PL_nrs = prog->saved_copy;
2571 #endif
2572             RXp_MATCH_COPIED_off(prog);
2573         }
2574         else
2575             PL_reg_oldsaved = NULL;
2576         prog->subbeg = PL_bostr;
2577         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2578     }
2579     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2580     prog->offs[0].start = *startpos - PL_bostr;
2581     PL_reginput = *startpos;
2582     PL_reglastparen = &prog->lastparen;
2583     PL_reglastcloseparen = &prog->lastcloseparen;
2584     prog->lastparen = 0;
2585     prog->lastcloseparen = 0;
2586     PL_regsize = 0;
2587     PL_regoffs = prog->offs;
2588     if (PL_reg_start_tmpl <= prog->nparens) {
2589         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2590         if(PL_reg_start_tmp)
2591             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2592         else
2593             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2594     }
2595
2596     /* XXXX What this code is doing here?!!!  There should be no need
2597        to do this again and again, PL_reglastparen should take care of
2598        this!  --ilya*/
2599
2600     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2601      * Actually, the code in regcppop() (which Ilya may be meaning by
2602      * PL_reglastparen), is not needed at all by the test suite
2603      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2604      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2605      * Meanwhile, this code *is* needed for the
2606      * above-mentioned test suite tests to succeed.  The common theme
2607      * on those tests seems to be returning null fields from matches.
2608      * --jhi updated by dapm */
2609 #if 1
2610     if (prog->nparens) {
2611         regexp_paren_pair *pp = PL_regoffs;
2612         register I32 i;
2613         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2614             ++pp;
2615             pp->start = -1;
2616             pp->end = -1;
2617         }
2618     }
2619 #endif
2620     REGCP_SET(lastcp);
2621     if (regmatch(reginfo, progi->program + 1)) {
2622         PL_regoffs[0].end = PL_reginput - PL_bostr;
2623         return 1;
2624     }
2625     if (reginfo->cutpoint)
2626         *startpos= reginfo->cutpoint;
2627     REGCP_UNWIND(lastcp);
2628     return 0;
2629 }
2630
2631
2632 #define sayYES goto yes
2633 #define sayNO goto no
2634 #define sayNO_SILENT goto no_silent
2635
2636 /* we dont use STMT_START/END here because it leads to
2637    "unreachable code" warnings, which are bogus, but distracting. */
2638 #define CACHEsayNO \
2639     if (ST.cache_mask) \
2640        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2641     sayNO
2642
2643 /* this is used to determine how far from the left messages like
2644    'failed...' are printed. It should be set such that messages
2645    are inline with the regop output that created them.
2646 */
2647 #define REPORT_CODE_OFF 32
2648
2649
2650 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2651 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2652
2653 #define SLAB_FIRST(s) (&(s)->states[0])
2654 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2655
2656 /* grab a new slab and return the first slot in it */
2657
2658 STATIC regmatch_state *
2659 S_push_slab(pTHX)
2660 {
2661 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2662     dMY_CXT;
2663 #endif
2664     regmatch_slab *s = PL_regmatch_slab->next;
2665     if (!s) {
2666         Newx(s, 1, regmatch_slab);
2667         s->prev = PL_regmatch_slab;
2668         s->next = NULL;
2669         PL_regmatch_slab->next = s;
2670     }
2671     PL_regmatch_slab = s;
2672     return SLAB_FIRST(s);
2673 }
2674
2675
2676 /* push a new state then goto it */
2677
2678 #define PUSH_STATE_GOTO(state, node) \
2679     scan = node; \
2680     st->resume_state = state; \
2681     goto push_state;
2682
2683 /* push a new state with success backtracking, then goto it */
2684
2685 #define PUSH_YES_STATE_GOTO(state, node) \
2686     scan = node; \
2687     st->resume_state = state; \
2688     goto push_yes_state;
2689
2690
2691
2692 /*
2693
2694 regmatch() - main matching routine
2695
2696 This is basically one big switch statement in a loop. We execute an op,
2697 set 'next' to point the next op, and continue. If we come to a point which
2698 we may need to backtrack to on failure such as (A|B|C), we push a
2699 backtrack state onto the backtrack stack. On failure, we pop the top
2700 state, and re-enter the loop at the state indicated. If there are no more
2701 states to pop, we return failure.
2702
2703 Sometimes we also need to backtrack on success; for example /A+/, where
2704 after successfully matching one A, we need to go back and try to
2705 match another one; similarly for lookahead assertions: if the assertion
2706 completes successfully, we backtrack to the state just before the assertion
2707 and then carry on.  In these cases, the pushed state is marked as
2708 'backtrack on success too'. This marking is in fact done by a chain of
2709 pointers, each pointing to the previous 'yes' state. On success, we pop to
2710 the nearest yes state, discarding any intermediate failure-only states.
2711 Sometimes a yes state is pushed just to force some cleanup code to be
2712 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2713 it to free the inner regex.
2714
2715 Note that failure backtracking rewinds the cursor position, while
2716 success backtracking leaves it alone.
2717
2718 A pattern is complete when the END op is executed, while a subpattern
2719 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2720 ops trigger the "pop to last yes state if any, otherwise return true"
2721 behaviour.
2722
2723 A common convention in this function is to use A and B to refer to the two
2724 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2725 the subpattern to be matched possibly multiple times, while B is the entire
2726 rest of the pattern. Variable and state names reflect this convention.
2727
2728 The states in the main switch are the union of ops and failure/success of
2729 substates associated with with that op.  For example, IFMATCH is the op
2730 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2731 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2732 successfully matched A and IFMATCH_A_fail is a state saying that we have
2733 just failed to match A. Resume states always come in pairs. The backtrack
2734 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2735 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2736 on success or failure.
2737
2738 The struct that holds a backtracking state is actually a big union, with
2739 one variant for each major type of op. The variable st points to the
2740 top-most backtrack struct. To make the code clearer, within each
2741 block of code we #define ST to alias the relevant union.
2742
2743 Here's a concrete example of a (vastly oversimplified) IFMATCH
2744 implementation:
2745
2746     switch (state) {
2747     ....
2748
2749 #define ST st->u.ifmatch
2750
2751     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2752         ST.foo = ...; // some state we wish to save
2753         ...
2754         // push a yes backtrack state with a resume value of
2755         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2756         // first node of A:
2757         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2758         // NOTREACHED
2759
2760     case IFMATCH_A: // we have successfully executed A; now continue with B
2761         next = B;
2762         bar = ST.foo; // do something with the preserved value
2763         break;
2764
2765     case IFMATCH_A_fail: // A failed, so the assertion failed
2766         ...;   // do some housekeeping, then ...
2767         sayNO; // propagate the failure
2768
2769 #undef ST
2770
2771     ...
2772     }
2773
2774 For any old-timers reading this who are familiar with the old recursive
2775 approach, the code above is equivalent to:
2776
2777     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2778     {
2779         int foo = ...
2780         ...
2781         if (regmatch(A)) {
2782             next = B;
2783             bar = foo;
2784             break;
2785         }
2786         ...;   // do some housekeeping, then ...
2787         sayNO; // propagate the failure
2788     }
2789
2790 The topmost backtrack state, pointed to by st, is usually free. If you
2791 want to claim it, populate any ST.foo fields in it with values you wish to
2792 save, then do one of
2793
2794         PUSH_STATE_GOTO(resume_state, node);
2795         PUSH_YES_STATE_GOTO(resume_state, node);
2796
2797 which sets that backtrack state's resume value to 'resume_state', pushes a
2798 new free entry to the top of the backtrack stack, then goes to 'node'.
2799 On backtracking, the free slot is popped, and the saved state becomes the
2800 new free state. An ST.foo field in this new top state can be temporarily
2801 accessed to retrieve values, but once the main loop is re-entered, it
2802 becomes available for reuse.
2803
2804 Note that the depth of the backtrack stack constantly increases during the
2805 left-to-right execution of the pattern, rather than going up and down with
2806 the pattern nesting. For example the stack is at its maximum at Z at the
2807 end of the pattern, rather than at X in the following:
2808
2809     /(((X)+)+)+....(Y)+....Z/
2810
2811 The only exceptions to this are lookahead/behind assertions and the cut,
2812 (?>A), which pop all the backtrack states associated with A before
2813 continuing.
2814
2815 Backtrack state structs are allocated in slabs of about 4K in size.
2816 PL_regmatch_state and st always point to the currently active state,
2817 and PL_regmatch_slab points to the slab currently containing
2818 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2819 allocated, and is never freed until interpreter destruction. When the slab
2820 is full, a new one is allocated and chained to the end. At exit from
2821 regmatch(), slabs allocated since entry are freed.
2822
2823 */
2824
2825
2826 #define DEBUG_STATE_pp(pp)                                  \
2827     DEBUG_STATE_r({                                         \
2828         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2829         PerlIO_printf(Perl_debug_log,                       \
2830             "    %*s"pp" %s%s%s%s%s\n",                     \
2831             depth*2, "",                                    \
2832             PL_reg_name[st->resume_state],                     \
2833             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2834             ((st==yes_state) ? "Y" : ""),                   \
2835             ((st==mark_state) ? "M" : ""),                  \
2836             ((st==yes_state||st==mark_state) ? "]" : "")    \
2837         );                                                  \
2838     });
2839
2840
2841 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2842
2843 #ifdef DEBUGGING
2844
2845 STATIC void
2846 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2847     const char *start, const char *end, const char *blurb)
2848 {
2849     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2850
2851     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2852
2853     if (!PL_colorset)
2854             reginitcolors();
2855     {
2856         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2857             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2858
2859         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2860             start, end - start, 60);
2861
2862         PerlIO_printf(Perl_debug_log,
2863             "%s%s REx%s %s against %s\n",
2864                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2865
2866         if (utf8_target||utf8_pat)
2867             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2868                 utf8_pat ? "pattern" : "",
2869                 utf8_pat && utf8_target ? " and " : "",
2870                 utf8_target ? "string" : ""
2871             );
2872     }
2873 }
2874
2875 STATIC void
2876 S_dump_exec_pos(pTHX_ const char *locinput,
2877                       const regnode *scan,
2878                       const char *loc_regeol,
2879                       const char *loc_bostr,
2880                       const char *loc_reg_starttry,
2881                       const bool utf8_target)
2882 {
2883     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2884     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2885     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2886     /* The part of the string before starttry has one color
2887        (pref0_len chars), between starttry and current
2888        position another one (pref_len - pref0_len chars),
2889        after the current position the third one.
2890        We assume that pref0_len <= pref_len, otherwise we
2891        decrease pref0_len.  */
2892     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2893         ? (5 + taill) - l : locinput - loc_bostr;
2894     int pref0_len;
2895
2896     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2897
2898     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2899         pref_len++;
2900     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2901     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2902         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2903               ? (5 + taill) - pref_len : loc_regeol - locinput);
2904     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2905         l--;
2906     if (pref0_len < 0)
2907         pref0_len = 0;
2908     if (pref0_len > pref_len)
2909         pref0_len = pref_len;
2910     {
2911         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2912
2913         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2914             (locinput - pref_len),pref0_len, 60, 4, 5);
2915
2916         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2917                     (locinput - pref_len + pref0_len),
2918                     pref_len - pref0_len, 60, 2, 3);
2919
2920         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2921                     locinput, loc_regeol - locinput, 10, 0, 1);
2922
2923         const STRLEN tlen=len0+len1+len2;
2924         PerlIO_printf(Perl_debug_log,
2925                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2926                     (IV)(locinput - loc_bostr),
2927                     len0, s0,
2928                     len1, s1,
2929                     (docolor ? "" : "> <"),
2930                     len2, s2,
2931                     (int)(tlen > 19 ? 0 :  19 - tlen),
2932                     "");
2933     }
2934 }
2935
2936 #endif
2937
2938 /* reg_check_named_buff_matched()
2939  * Checks to see if a named buffer has matched. The data array of
2940  * buffer numbers corresponding to the buffer is expected to reside
2941  * in the regexp->data->data array in the slot stored in the ARG() of
2942  * node involved. Note that this routine doesn't actually care about the
2943  * name, that information is not preserved from compilation to execution.
2944  * Returns the index of the leftmost defined buffer with the given name
2945  * or 0 if non of the buffers matched.
2946  */
2947 STATIC I32
2948 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2949 {
2950     I32 n;
2951     RXi_GET_DECL(rex,rexi);
2952     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2953     I32 *nums=(I32*)SvPVX(sv_dat);
2954
2955     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2956
2957     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2958         if ((I32)*PL_reglastparen >= nums[n] &&
2959             PL_regoffs[nums[n]].end != -1)
2960         {
2961             return nums[n];
2962         }
2963     }
2964     return 0;
2965 }
2966
2967
2968 /* free all slabs above current one  - called during LEAVE_SCOPE */
2969
2970 STATIC void
2971 S_clear_backtrack_stack(pTHX_ void *p)
2972 {
2973     regmatch_slab *s = PL_regmatch_slab->next;
2974     PERL_UNUSED_ARG(p);
2975
2976     if (!s)
2977         return;
2978     PL_regmatch_slab->next = NULL;
2979     while (s) {
2980         regmatch_slab * const osl = s;
2981         s = s->next;
2982         Safefree(osl);
2983     }
2984 }
2985
2986
2987 #define SETREX(Re1,Re2) \
2988     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2989     Re1 = (Re2)
2990
2991 STATIC I32                      /* 0 failure, 1 success */
2992 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2993 {
2994 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2995     dMY_CXT;
2996 #endif
2997     dVAR;
2998     register const bool utf8_target = PL_reg_match_utf8;
2999     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3000     REGEXP *rex_sv = reginfo->prog;
3001     regexp *rex = (struct regexp *)SvANY(rex_sv);
3002     RXi_GET_DECL(rex,rexi);
3003     I32 oldsave;
3004     /* the current state. This is a cached copy of PL_regmatch_state */
3005     register regmatch_state *st;
3006     /* cache heavy used fields of st in registers */
3007     register regnode *scan;
3008     register regnode *next;
3009     register U32 n = 0; /* general value; init to avoid compiler warning */
3010     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3011     register char *locinput = PL_reginput;
3012     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3013
3014     bool result = 0;        /* return value of S_regmatch */
3015     int depth = 0;          /* depth of backtrack stack */
3016     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3017     const U32 max_nochange_depth =
3018         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3019         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3020     regmatch_state *yes_state = NULL; /* state to pop to on success of
3021                                                             subpattern */
3022     /* mark_state piggy backs on the yes_state logic so that when we unwind
3023        the stack on success we can update the mark_state as we go */
3024     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3025     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3026     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3027     U32 state_num;
3028     bool no_final = 0;      /* prevent failure from backtracking? */
3029     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3030     char *startpoint = PL_reginput;
3031     SV *popmark = NULL;     /* are we looking for a mark? */
3032     SV *sv_commit = NULL;   /* last mark name seen in failure */
3033     SV *sv_yes_mark = NULL; /* last mark name we have seen
3034                                during a successful match */
3035     U32 lastopen = 0;       /* last open we saw */
3036     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3037     SV* const oreplsv = GvSV(PL_replgv);
3038     /* these three flags are set by various ops to signal information to
3039      * the very next op. They have a useful lifetime of exactly one loop
3040      * iteration, and are not preserved or restored by state pushes/pops
3041      */
3042     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3043     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3044     int logical = 0;        /* the following EVAL is:
3045                                 0: (?{...})
3046                                 1: (?(?{...})X|Y)
3047                                 2: (??{...})
3048                                or the following IFMATCH/UNLESSM is:
3049                                 false: plain (?=foo)
3050                                 true:  used as a condition: (?(?=foo))
3051                             */
3052 #ifdef DEBUGGING
3053     GET_RE_DEBUG_FLAGS_DECL;
3054 #endif
3055
3056     PERL_ARGS_ASSERT_REGMATCH;
3057
3058     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3059             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3060     }));
3061     /* on first ever call to regmatch, allocate first slab */
3062     if (!PL_regmatch_slab) {
3063         Newx(PL_regmatch_slab, 1, regmatch_slab);
3064         PL_regmatch_slab->prev = NULL;
3065         PL_regmatch_slab->next = NULL;
3066         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3067     }
3068
3069     oldsave = PL_savestack_ix;
3070     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3071     SAVEVPTR(PL_regmatch_slab);
3072     SAVEVPTR(PL_regmatch_state);
3073
3074     /* grab next free state slot */
3075     st = ++PL_regmatch_state;
3076     if (st >  SLAB_LAST(PL_regmatch_slab))
3077         st = PL_regmatch_state = S_push_slab(aTHX);
3078
3079     /* Note that nextchr is a byte even in UTF */
3080     nextchr = UCHARAT(locinput);
3081     scan = prog;
3082     while (scan != NULL) {
3083
3084         DEBUG_EXECUTE_r( {
3085             SV * const prop = sv_newmortal();
3086             regnode *rnext=regnext(scan);
3087             DUMP_EXEC_POS( locinput, scan, utf8_target );
3088             regprop(rex, prop, scan);
3089
3090             PerlIO_printf(Perl_debug_log,
3091                     "%3"IVdf":%*s%s(%"IVdf")\n",
3092                     (IV)(scan - rexi->program), depth*2, "",
3093                     SvPVX_const(prop),
3094                     (PL_regkind[OP(scan)] == END || !rnext) ?
3095                         0 : (IV)(rnext - rexi->program));
3096         });
3097
3098         next = scan + NEXT_OFF(scan);
3099         if (next == scan)
3100             next = NULL;
3101         state_num = OP(scan);
3102
3103       reenter_switch:
3104
3105         assert(PL_reglastparen == &rex->lastparen);
3106         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3107         assert(PL_regoffs == rex->offs);
3108
3109         switch (state_num) {
3110         case BOL:
3111             if (locinput == PL_bostr)
3112             {
3113                 /* reginfo->till = reginfo->bol; */
3114                 break;
3115             }
3116             sayNO;
3117         case MBOL:
3118             if (locinput == PL_bostr ||
3119                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3120             {
3121                 break;
3122             }
3123             sayNO;
3124         case SBOL:
3125             if (locinput == PL_bostr)
3126                 break;
3127             sayNO;
3128         case GPOS:
3129             if (locinput == reginfo->ganch)
3130                 break;
3131             sayNO;
3132
3133         case KEEPS:
3134             /* update the startpoint */
3135             st->u.keeper.val = PL_regoffs[0].start;
3136             PL_reginput = locinput;
3137             PL_regoffs[0].start = locinput - PL_bostr;
3138             PUSH_STATE_GOTO(KEEPS_next, next);
3139             /*NOT-REACHED*/
3140         case KEEPS_next_fail:
3141             /* rollback the start point change */
3142             PL_regoffs[0].start = st->u.keeper.val;
3143             sayNO_SILENT;
3144             /*NOT-REACHED*/
3145         case EOL:
3146                 goto seol;
3147         case MEOL:
3148             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3149                 sayNO;
3150             break;
3151         case SEOL:
3152           seol:
3153             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3154                 sayNO;
3155             if (PL_regeol - locinput > 1)
3156                 sayNO;
3157             break;
3158         case EOS:
3159             if (PL_regeol != locinput)
3160                 sayNO;
3161             break;
3162         case SANY:
3163             if (!nextchr && locinput >= PL_regeol)
3164                 sayNO;
3165             if (utf8_target) {
3166                 locinput += PL_utf8skip[nextchr];
3167                 if (locinput > PL_regeol)
3168                     sayNO;
3169                 nextchr = UCHARAT(locinput);
3170             }
3171             else
3172                 nextchr = UCHARAT(++locinput);
3173             break;
3174         case CANY:
3175             if (!nextchr && locinput >= PL_regeol)
3176                 sayNO;
3177             nextchr = UCHARAT(++locinput);
3178             break;
3179         case REG_ANY:
3180             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3181                 sayNO;
3182             if (utf8_target) {
3183                 locinput += PL_utf8skip[nextchr];
3184                 if (locinput > PL_regeol)
3185                     sayNO;
3186                 nextchr = UCHARAT(locinput);
3187             }
3188             else
3189                 nextchr = UCHARAT(++locinput);
3190             break;
3191
3192 #undef  ST
3193 #define ST st->u.trie
3194         case TRIEC:
3195             /* In this case the charclass data is available inline so
3196                we can fail fast without a lot of extra overhead.
3197              */
3198             if (scan->flags == EXACT || !utf8_target) {
3199                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3200                     DEBUG_EXECUTE_r(
3201                         PerlIO_printf(Perl_debug_log,
3202                                   "%*s  %sfailed to match trie start class...%s\n",
3203                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3204                     );
3205                     sayNO_SILENT;
3206                     /* NOTREACHED */
3207                 }
3208             }
3209             /* FALL THROUGH */
3210         case TRIE:
3211             /* the basic plan of execution of the trie is:
3212              * At the beginning, run though all the states, and
3213              * find the longest-matching word. Also remember the position
3214              * of the shortest matching word. For example, this pattern:
3215              *    1  2 3 4    5
3216              *    ab|a|x|abcd|abc
3217              * when matched against the string "abcde", will generate
3218              * accept states for all words except 3, with the longest
3219              * matching word being 4, and the shortest being 1 (with
3220              * the position being after char 1 of the string).
3221              *
3222              * Then for each matching word, in word order (i.e. 1,2,4,5),
3223              * we run the remainder of the pattern; on each try setting
3224              * the current position to the character following the word,
3225              * returning to try the next word on failure.
3226              *
3227              * We avoid having to build a list of words at runtime by
3228              * using a compile-time structure, wordinfo[].prev, which
3229              * gives, for each word, the previous accepting word (if any).
3230              * In the case above it would contain the mappings 1->2, 2->0,
3231              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3232              * the longest word (4 above), a list of all words, by
3233              * following the list of prev pointers; this gives us the
3234              * unordered list 4,5,1,2. Then given the current word we have
3235              * just tried, we can go through the list and find the
3236              * next-biggest word to try (so if we just failed on word 2,
3237              * the next in the list is 4).
3238              *
3239              * Since at runtime we don't record the matching position in
3240              * the string for each word, we have to work that out for
3241              * each word we're about to process. The wordinfo table holds
3242              * the character length of each word; given that we recorded
3243              * at the start: the position of the shortest word and its
3244              * length in chars, we just need to move the pointer the
3245              * difference between the two char lengths. Depending on
3246              * Unicode status and folding, that's cheap or expensive.
3247              *
3248              * This algorithm is optimised for the case where are only a
3249              * small number of accept states, i.e. 0,1, or maybe 2.
3250              * With lots of accepts states, and having to try all of them,
3251              * it becomes quadratic on number of accept states to find all
3252              * the next words.
3253              */
3254
3255             {
3256                 /* what type of TRIE am I? (utf8 makes this contextual) */
3257                 DECL_TRIE_TYPE(scan);
3258
3259                 /* what trie are we using right now */
3260                 reg_trie_data * const trie
3261                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3262                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3263                 U32 state = trie->startstate;
3264
3265                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3266                     !TRIE_BITMAP_TEST(trie,*locinput)
3267                 ) {
3268                     if (trie->states[ state ].wordnum) {
3269                          DEBUG_EXECUTE_r(
3270                             PerlIO_printf(Perl_debug_log,
3271                                           "%*s  %smatched empty string...%s\n",
3272                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3273                         );
3274                         if (!trie->jump)
3275                             break;
3276                     } else {
3277                         DEBUG_EXECUTE_r(
3278                             PerlIO_printf(Perl_debug_log,
3279                                           "%*s  %sfailed to match trie start class...%s\n",
3280                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3281                         );
3282                         sayNO_SILENT;
3283                    }
3284                 }
3285
3286             {
3287                 U8 *uc = ( U8* )locinput;
3288
3289                 STRLEN len = 0;
3290                 STRLEN foldlen = 0;
3291                 U8 *uscan = (U8*)NULL;
3292                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3293                 U32 charcount = 0; /* how many input chars we have matched */
3294                 U32 accepted = 0; /* have we seen any accepting states? */
3295
3296                 ST.B = next;
3297                 ST.jump = trie->jump;
3298                 ST.me = scan;
3299                 ST.firstpos = NULL;
3300                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3301                 ST.nextword = 0;
3302
3303                 /* fully traverse the TRIE; note the position of the
3304                    shortest accept state and the wordnum of the longest
3305                    accept state */
3306
3307                 while ( state && uc <= (U8*)PL_regeol ) {
3308                     U32 base = trie->states[ state ].trans.base;
3309                     UV uvc = 0;
3310                     U16 charid = 0;
3311                     U16 wordnum;
3312                     wordnum = trie->states[ state ].wordnum;
3313
3314                     if (wordnum) { /* it's an accept state */
3315                         if (!accepted) {
3316                             accepted = 1;
3317                             /* record first match position */
3318                             if (ST.longfold) {
3319                                 ST.firstpos = (U8*)locinput;
3320                                 ST.firstchars = 0;
3321                             }
3322                             else {
3323                                 ST.firstpos = uc;
3324                                 ST.firstchars = charcount;
3325                             }
3326                         }
3327                         if (!ST.nextword || wordnum < ST.nextword)
3328                             ST.nextword = wordnum;
3329                         ST.topword = wordnum;
3330                     }
3331
3332                     DEBUG_TRIE_EXECUTE_r({
3333                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3334                                 PerlIO_printf( Perl_debug_log,
3335                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3336                                     2+depth * 2, "", PL_colors[4],
3337                                     (UV)state, (accepted ? 'Y' : 'N'));
3338                     });
3339
3340                     /* read a char and goto next state */
3341                     if ( base ) {
3342                         I32 offset;
3343                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3344                                              uscan, len, uvc, charid, foldlen,
3345                                              foldbuf, uniflags);
3346                         charcount++;
3347                         if (foldlen>0)
3348                             ST.longfold = TRUE;
3349                         if (charid &&
3350                              ( ((offset =
3351                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3352
3353                              && ((U32)offset < trie->lasttrans)
3354                              && trie->trans[offset].check == state)
3355                         {
3356                             state = trie->trans[offset].next;
3357                         }
3358                         else {
3359                             state = 0;
3360                         }
3361                         uc += len;
3362
3363                     }
3364                     else {
3365                         state = 0;
3366                     }
3367                     DEBUG_TRIE_EXECUTE_r(
3368                         PerlIO_printf( Perl_debug_log,
3369                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3370                             charid, uvc, (UV)state, PL_colors[5] );
3371                     );
3372                 }
3373                 if (!accepted)
3374                    sayNO;
3375
3376                 /* calculate total number of accept states */
3377                 {
3378                     U16 w = ST.topword;
3379                     accepted = 0;
3380                     while (w) {
3381                         w = trie->wordinfo[w].prev;
3382                         accepted++;
3383                     }
3384                     ST.accepted = accepted;
3385                 }
3386
3387                 DEBUG_EXECUTE_r(
3388                     PerlIO_printf( Perl_debug_log,
3389                         "%*s  %sgot %"IVdf" possible matches%s\n",
3390                         REPORT_CODE_OFF + depth * 2, "",
3391                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3392                 );
3393                 goto trie_first_try; /* jump into the fail handler */
3394             }}
3395             /* NOTREACHED */
3396
3397         case TRIE_next_fail: /* we failed - try next alternative */
3398             if ( ST.jump) {
3399                 REGCP_UNWIND(ST.cp);
3400                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3401                     PL_regoffs[n].end = -1;
3402                 *PL_reglastparen = n;
3403             }
3404             if (!--ST.accepted) {
3405                 DEBUG_EXECUTE_r({
3406                     PerlIO_printf( Perl_debug_log,
3407                         "%*s  %sTRIE failed...%s\n",
3408                         REPORT_CODE_OFF+depth*2, "",
3409                         PL_colors[4],
3410                         PL_colors[5] );
3411                 });
3412                 sayNO_SILENT;
3413             }
3414             {
3415                 /* Find next-highest word to process.  Note that this code
3416                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3417                 register U16 min = 0;
3418                 register U16 word;
3419                 register U16 const nextword = ST.nextword;
3420                 register reg_trie_wordinfo * const wordinfo
3421                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3422                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3423                     if (word > nextword && (!min || word < min))
3424                         min = word;
3425                 }
3426                 ST.nextword = min;
3427             }
3428
3429           trie_first_try:
3430             if (do_cutgroup) {
3431                 do_cutgroup = 0;
3432                 no_final = 0;
3433             }
3434
3435             if ( ST.jump) {
3436                 ST.lastparen = *PL_reglastparen;
3437                 REGCP_SET(ST.cp);
3438             }
3439
3440             /* find start char of end of current word */
3441             {
3442                 U32 chars; /* how many chars to skip */
3443                 U8 *uc = ST.firstpos;
3444                 reg_trie_data * const trie
3445                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3446
3447                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3448                             >=  ST.firstchars);
3449                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3450                             - ST.firstchars;
3451
3452                 if (ST.longfold) {
3453                     /* the hard option - fold each char in turn and find
3454                      * its folded length (which may be different */
3455                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3456                     STRLEN foldlen;
3457                     STRLEN len;
3458                     UV uvc;
3459                     U8 *uscan;
3460
3461                     while (chars) {
3462                         if (utf8_target) {
3463                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3464                                                     uniflags);
3465                             uc += len;
3466                         }
3467                         else {
3468                             uvc = *uc;
3469                             uc++;
3470                         }
3471                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3472                         uscan = foldbuf;
3473                         while (foldlen) {
3474                             if (!--chars)
3475                                 break;
3476                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3477                                             uniflags);
3478                             uscan += len;
3479                             foldlen -= len;
3480                         }
3481                     }
3482                 }
3483                 else {
3484                     if (utf8_target)
3485                         while (chars--)
3486                             uc += UTF8SKIP(uc);
3487                     else
3488                         uc += chars;
3489                 }
3490                 PL_reginput = (char *)uc;
3491             }
3492
3493             scan = (ST.jump && ST.jump[ST.nextword])
3494                         ? ST.me + ST.jump[ST.nextword]
3495                         : ST.B;
3496
3497             DEBUG_EXECUTE_r({
3498                 PerlIO_printf( Perl_debug_log,
3499                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3500                     REPORT_CODE_OFF+depth*2, "",
3501                     PL_colors[4],
3502                     ST.nextword,
3503                     PL_colors[5]
3504                     );
3505             });
3506
3507             if (ST.accepted > 1 || has_cutgroup) {
3508                 PUSH_STATE_GOTO(TRIE_next, scan);
3509                 /* NOTREACHED */
3510             }
3511             /* only one choice left - just continue */
3512             DEBUG_EXECUTE_r({
3513                 AV *const trie_words
3514                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3515                 SV ** const tmp = av_fetch( trie_words,
3516                     ST.nextword-1, 0 );
3517                 SV *sv= tmp ? sv_newmortal() : NULL;
3518
3519                 PerlIO_printf( Perl_debug_log,
3520                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3521                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3522                     ST.nextword,
3523                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3524                             PL_colors[0], PL_colors[1],
3525                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3526                         )
3527                     : "not compiled under -Dr",
3528                     PL_colors[5] );
3529             });
3530
3531             locinput = PL_reginput;
3532             nextchr = UCHARAT(locinput);
3533             continue; /* execute rest of RE */
3534             /* NOTREACHED */
3535 #undef  ST
3536
3537         case EXACT: {
3538             char *s = STRING(scan);
3539             ln = STR_LEN(scan);
3540             if (utf8_target != UTF_PATTERN) {
3541                 /* The target and the pattern have differing utf8ness. */
3542                 char *l = locinput;
3543                 const char * const e = s + ln;
3544
3545                 if (utf8_target) {
3546                     /* The target is utf8, the pattern is not utf8. */
3547                     while (s < e) {
3548                         STRLEN ulen;
3549                         if (l >= PL_regeol)
3550                              sayNO;
3551                         if (NATIVE_TO_UNI(*(U8*)s) !=
3552                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3553                                             uniflags))
3554                              sayNO;
3555                         l += ulen;
3556                         s ++;
3557                     }
3558                 }
3559                 else {
3560                     /* The target is not utf8, the pattern is utf8. */
3561                     while (s < e) {
3562                         STRLEN ulen;
3563                         if (l >= PL_regeol)
3564                             sayNO;
3565                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3566                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3567                                            uniflags))
3568                             sayNO;
3569                         s += ulen;
3570                         l ++;
3571                     }
3572                 }
3573                 locinput = l;
3574                 nextchr = UCHARAT(locinput);
3575                 break;
3576             }
3577             /* The target and the pattern have the same utf8ness. */
3578             /* Inline the first character, for speed. */
3579             if (UCHARAT(s) != nextchr)
3580                 sayNO;
3581             if (PL_regeol - locinput < ln)
3582                 sayNO;
3583             if (ln > 1 && memNE(s, locinput, ln))
3584                 sayNO;
3585             locinput += ln;
3586             nextchr = UCHARAT(locinput);
3587             break;
3588             }
3589         case EXACTFL: {
3590             re_fold_t folder;
3591             const U8 * fold_array;
3592             const char * s;
3593             U32 fold_utf8_flags;
3594
3595             PL_reg_flags |= RF_tainted;
3596             folder = foldEQ_locale;
3597             fold_array = PL_fold_locale;
3598             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3599             goto do_exactf;
3600
3601         case EXACTFU:
3602             folder = foldEQ_latin1;
3603             fold_array = PL_fold_latin1;
3604             fold_utf8_flags = 0;
3605             goto do_exactf;
3606
3607         case EXACTFA:
3608             folder = foldEQ_latin1;
3609             fold_array = PL_fold_latin1;
3610             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3611             goto do_exactf;
3612
3613         case EXACTF:
3614             folder = foldEQ;
3615             fold_array = PL_fold;
3616             fold_utf8_flags = 0;
3617
3618           do_exactf:
3619             s = STRING(scan);
3620             ln = STR_LEN(scan);
3621
3622             if (utf8_target || UTF_PATTERN) {
3623               /* Either target or the pattern are utf8. */
3624                 const char * const l = locinput;
3625                 char *e = PL_regeol;
3626
3627                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3628                                l, &e, 0,  utf8_target, fold_utf8_flags))
3629                 {
3630                     sayNO;
3631                 }
3632                 locinput = e;
3633                 nextchr = UCHARAT(locinput);
3634                 break;
3635             }
3636
3637             /* Neither the target nor the pattern are utf8 */
3638             if (UCHARAT(s) != nextchr &&
3639                 UCHARAT(s) != fold_array[nextchr])
3640             {
3641                 sayNO;
3642             }
3643             if (PL_regeol - locinput < ln)
3644                 sayNO;
3645             if (ln > 1 && ! folder(s, locinput, ln))
3646                 sayNO;
3647             locinput += ln;
3648             nextchr = UCHARAT(locinput);
3649             break;
3650         }
3651
3652         /* XXX Could improve efficiency by separating these all out using a
3653          * macro or in-line function.  At that point regcomp.c would no longer
3654          * have to set the FLAGS fields of these */
3655         case BOUNDL:
3656         case NBOUNDL:
3657             PL_reg_flags |= RF_tainted;
3658             /* FALL THROUGH */
3659         case BOUND:
3660         case BOUNDU:
3661         case BOUNDA:
3662         case NBOUND:
3663         case NBOUNDU:
3664         case NBOUNDA:
3665             /* was last char in word? */
3666             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3667                 if (locinput == PL_bostr)
3668                     ln = '\n';
3669                 else {
3670                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3671
3672                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3673                 }
3674                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3675                     ln = isALNUM_uni(ln);
3676                     LOAD_UTF8_CHARCLASS_ALNUM();
3677                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3678                 }
3679                 else {
3680                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3681                     n = isALNUM_LC_utf8((U8*)locinput);
3682                 }
3683             }
3684             else {
3685
3686                 /* Here the string isn't utf8, or is utf8 and only ascii
3687                  * characters are to match \w.  In the latter case looking at
3688                  * the byte just prior to the current one may be just the final
3689                  * byte of a multi-byte character.  This is ok.  There are two
3690                  * cases:
3691                  * 1) it is a single byte character, and then the test is doing
3692                  *      just what it's supposed to.
3693                  * 2) it is a multi-byte character, in which case the final
3694                  *      byte is never mistakable for ASCII, and so the test
3695                  *      will say it is not a word character, which is the
3696                  *      correct answer. */
3697                 ln = (locinput != PL_bostr) ?
3698                     UCHARAT(locinput - 1) : '\n';
3699                 switch (FLAGS(scan)) {
3700                     case REGEX_UNICODE_CHARSET:
3701                         ln = isWORDCHAR_L1(ln);
3702                         n = isWORDCHAR_L1(nextchr);
3703                         break;
3704                     case REGEX_LOCALE_CHARSET:
3705                         ln = isALNUM_LC(ln);
3706                         n = isALNUM_LC(nextchr);
3707                         break;
3708                     case REGEX_DEPENDS_CHARSET:
3709                         ln = isALNUM(ln);
3710                         n = isALNUM(nextchr);
3711                         break;
3712                     case REGEX_ASCII_RESTRICTED_CHARSET:
3713                         ln = isWORDCHAR_A(ln);
3714                         n = isWORDCHAR_A(nextchr);
3715                         break;
3716                     default:
3717                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3718                         break;
3719                 }
3720             }
3721             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3722              * regcomp.sym */
3723             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3724                     sayNO;
3725             break;
3726         case ANYOFV:
3727         case ANYOF:
3728             if (utf8_target || state_num == ANYOFV) {
3729                 STRLEN inclasslen = PL_regeol - locinput;
3730                 if (locinput >= PL_regeol)
3731                     sayNO;
3732
3733                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3734                     sayNO;
3735                 locinput += inclasslen;
3736                 nextchr = UCHARAT(locinput);
3737                 break;
3738             }
3739             else {
3740                 if (nextchr < 0)
3741                     nextchr = UCHARAT(locinput);
3742                 if (!nextchr && locinput >= PL_regeol)
3743                     sayNO;
3744                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3745                     sayNO;
3746                 nextchr = UCHARAT(++locinput);
3747                 break;
3748             }
3749             break;
3750         /* Special char classes - The defines start on line 129 or so */
3751         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3752                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3753                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3754                   ALNUMA, NALNUMA, isWORDCHAR_A,
3755                   alnum, "a");
3756
3757         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3758                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3759                   SPACEU, NSPACEU, isSPACE_L1,
3760                   SPACEA, NSPACEA, isSPACE_A,
3761                   space, " ");
3762
3763         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3764                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3765                 DIGITA, NDIGITA, isDIGIT_A,
3766                 digit, "0");
3767
3768         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3769                        a Unicode extended Grapheme Cluster */
3770             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3771               extended Grapheme Cluster is:
3772
3773                CR LF
3774                | Prepend* Begin Extend*
3775                | .
3776
3777                Begin is (Hangul-syllable | ! Control)
3778                Extend is (Grapheme_Extend | Spacing_Mark)
3779                Control is [ GCB_Control CR LF ]
3780
3781                The discussion below shows how the code for CLUMP is derived
3782                from this regex.  Note that most of these concepts are from
3783                property values of the Grapheme Cluster Boundary (GCB) property.
3784                No code point can have multiple property values for a given
3785                property.  Thus a code point in Prepend can't be in Control, but
3786                it must be in !Control.  This is why Control above includes
3787                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3788                property separately, and so can't be in GCB_Control, even though
3789                they logically are controls.  Control is not the same as gc=cc,
3790                but includes format and other characters as well.
3791
3792                The Unicode definition of Hangul-syllable is:
3793                    L+
3794                    | (L* ( ( V | LV ) V* | LVT ) T*)
3795                    | T+
3796                   )
3797                Each of these is a value for the GCB property, and hence must be
3798                disjoint, so the order they are tested is immaterial, so the
3799                above can safely be changed to
3800                    T+
3801                    | L+
3802                    | (L* ( LVT | ( V | LV ) V*) T*)
3803
3804                The last two terms can be combined like this:
3805                    L* ( L
3806                         | (( LVT | ( V | LV ) V*) T*))
3807
3808                And refactored into this:
3809                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3810
3811                That means that if we have seen any L's at all we can quit
3812                there, but if the next character is a LVT, a V or and LV we
3813                should keep going.
3814
3815                There is a subtlety with Prepend* which showed up in testing.
3816                Note that the Begin, and only the Begin is required in:
3817                 | Prepend* Begin Extend*
3818                Also, Begin contains '! Control'.  A Prepend must be a '!
3819                Control', which means it must be a Begin.  What it comes down to
3820                is that if we match Prepend* and then find no suitable Begin
3821                afterwards, that if we backtrack the last Prepend, that one will
3822                be a suitable Begin.
3823             */
3824
3825             if (locinput >= PL_regeol)
3826                 sayNO;
3827             if  (! utf8_target) {
3828
3829                 /* Match either CR LF  or '.', as all the other possibilities
3830                  * require utf8 */
3831                 locinput++;         /* Match the . or CR */
3832                 if (nextchr == '\r'
3833                     && locinput < PL_regeol
3834                     && UCHARAT(locinput) == '\n') locinput++;
3835             }
3836             else {
3837
3838                 /* Utf8: See if is ( CR LF ); already know that locinput <
3839                  * PL_regeol, so locinput+1 is in bounds */
3840                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3841                     locinput += 2;
3842                 }
3843                 else {
3844                     /* In case have to backtrack to beginning, then match '.' */
3845                     char *starting = locinput;
3846
3847                     /* In case have to backtrack the last prepend */
3848                     char *previous_prepend = 0;
3849
3850                     LOAD_UTF8_CHARCLASS_GCB();
3851
3852                     /* Match (prepend)* */
3853                     while (locinput < PL_regeol
3854                            && swash_fetch(PL_utf8_X_prepend,
3855                                           (U8*)locinput, utf8_target))
3856                     {
3857                         previous_prepend = locinput;
3858                         locinput += UTF8SKIP(locinput);
3859                     }
3860
3861                     /* As noted above, if we matched a prepend character, but
3862                      * the next thing won't match, back off the last prepend we
3863                      * matched, as it is guaranteed to match the begin */
3864                     if (previous_prepend
3865                         && (locinput >=  PL_regeol
3866                             || ! swash_fetch(PL_utf8_X_begin,
3867                                              (U8*)locinput, utf8_target)))
3868                     {
3869                         locinput = previous_prepend;
3870                     }
3871
3872                     /* Note that here we know PL_regeol > locinput, as we
3873                      * tested that upon input to this switch case, and if we
3874                      * moved locinput forward, we tested the result just above
3875                      * and it either passed, or we backed off so that it will
3876                      * now pass */
3877                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3878
3879                         /* Here did not match the required 'Begin' in the
3880                          * second term.  So just match the very first
3881                          * character, the '.' of the final term of the regex */
3882                         locinput = starting + UTF8SKIP(starting);
3883                     } else {
3884
3885                         /* Here is the beginning of a character that can have
3886                          * an extender.  It is either a hangul syllable, or a
3887                          * non-control */
3888                         if (swash_fetch(PL_utf8_X_non_hangul,
3889                                         (U8*)locinput, utf8_target))
3890                         {
3891
3892                             /* Here not a Hangul syllable, must be a
3893                              * ('!  * Control') */
3894                             locinput += UTF8SKIP(locinput);
3895                         } else {
3896
3897                             /* Here is a Hangul syllable.  It can be composed
3898                              * of several individual characters.  One
3899                              * possibility is T+ */
3900                             if (swash_fetch(PL_utf8_X_T,
3901                                             (U8*)locinput, utf8_target))
3902                             {
3903                                 while (locinput < PL_regeol
3904                                         && swash_fetch(PL_utf8_X_T,
3905                                                         (U8*)locinput, utf8_target))
3906                                 {
3907                                     locinput += UTF8SKIP(locinput);
3908                                 }
3909                             } else {
3910
3911                                 /* Here, not T+, but is a Hangul.  That means
3912                                  * it is one of the others: L, LV, LVT or V,
3913                                  * and matches:
3914                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3915
3916                                 /* Match L*           */
3917                                 while (locinput < PL_regeol
3918                                         && swash_fetch(PL_utf8_X_L,
3919                                                         (U8*)locinput, utf8_target))
3920                                 {
3921                                     locinput += UTF8SKIP(locinput);
3922                                 }
3923
3924                                 /* Here, have exhausted L*.  If the next
3925                                  * character is not an LV, LVT nor V, it means
3926                                  * we had to have at least one L, so matches L+
3927                                  * in the original equation, we have a complete
3928                                  * hangul syllable.  Are done. */
3929
3930                                 if (locinput < PL_regeol
3931                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3932                                                     (U8*)locinput, utf8_target))
3933                                 {
3934
3935                                     /* Otherwise keep going.  Must be LV, LVT
3936                                      * or V.  See if LVT */
3937                                     if (swash_fetch(PL_utf8_X_LVT,
3938                                                     (U8*)locinput, utf8_target))
3939                                     {
3940                                         locinput += UTF8SKIP(locinput);
3941                                     } else {
3942
3943                                         /* Must be  V or LV.  Take it, then
3944                                          * match V*     */
3945                                         locinput += UTF8SKIP(locinput);
3946                                         while (locinput < PL_regeol
3947                                                 && swash_fetch(PL_utf8_X_V,
3948                                                          (U8*)locinput, utf8_target))
3949                                         {
3950                                             locinput += UTF8SKIP(locinput);
3951                                         }
3952                                     }
3953
3954                                     /* And any of LV, LVT, or V can be followed
3955                                      * by T*            */
3956                                     while (locinput < PL_regeol
3957                                            && swash_fetch(PL_utf8_X_T,
3958                                                            (U8*)locinput,
3959                                                            utf8_target))
3960                                     {
3961                                         locinput += UTF8SKIP(locinput);
3962                                     }
3963                                 }
3964                             }
3965                         }
3966
3967                         /* Match any extender */
3968                         while (locinput < PL_regeol
3969                                 && swash_fetch(PL_utf8_X_extend,
3970                                                 (U8*)locinput, utf8_target))
3971                         {
3972                             locinput += UTF8SKIP(locinput);
3973                         }
3974                     }
3975                 }
3976                 if (locinput > PL_regeol) sayNO;
3977             }
3978             nextchr = UCHARAT(locinput);
3979             break;
3980
3981         case NREFFL:
3982         {   /* The capture buffer cases.  The ones beginning with N for the
3983                named buffers just convert to the equivalent numbered and
3984                pretend they were called as the corresponding numbered buffer
3985                op.  */
3986             /* don't initialize these in the declaration, it makes C++
3987                unhappy */
3988             char *s;
3989             char type;
3990             re_fold_t folder;
3991             const U8 *fold_array;
3992             UV utf8_fold_flags;
3993
3994             PL_reg_flags |= RF_tainted;
3995             folder = foldEQ_locale;
3996             fold_array = PL_fold_locale;
3997             type = REFFL;
3998             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
3999             goto do_nref;
4000
4001         case NREFFA:
4002             folder = foldEQ_latin1;
4003             fold_array = PL_fold_latin1;
4004             type = REFFA;
4005             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4006             goto do_nref;
4007
4008         case NREFFU:
4009             folder = foldEQ_latin1;
4010             fold_array = PL_fold_latin1;
4011             type = REFFU;
4012             utf8_fold_flags = 0;
4013             goto do_nref;
4014
4015         case NREFF:
4016             folder = foldEQ;
4017             fold_array = PL_fold;
4018             type = REFF;
4019             utf8_fold_flags = 0;
4020             goto do_nref;
4021
4022         case NREF:
4023             type = REF;
4024             folder = NULL;
4025             fold_array = NULL;
4026             utf8_fold_flags = 0;
4027           do_nref:
4028
4029             /* For the named back references, find the corresponding buffer
4030              * number */
4031             n = reg_check_named_buff_matched(rex,scan);
4032
4033             if ( ! n ) {
4034                 sayNO;
4035             }
4036             goto do_nref_ref_common;
4037
4038         case REFFL:
4039             PL_reg_flags |= RF_tainted;
4040             folder = foldEQ_locale;
4041             fold_array = PL_fold_locale;
4042             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4043             goto do_ref;
4044
4045         case REFFA:
4046             folder = foldEQ_latin1;
4047             fold_array = PL_fold_latin1;
4048             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4049             goto do_ref;
4050
4051         case REFFU:
4052             folder = foldEQ_latin1;
4053             fold_array = PL_fold_latin1;
4054             utf8_fold_flags = 0;
4055             goto do_ref;
4056
4057         case REFF:
4058             folder = foldEQ;
4059             fold_array = PL_fold;
4060             utf8_fold_flags = 0;
4061             goto do_ref;
4062
4063         case REF:
4064             folder = NULL;
4065             fold_array = NULL;
4066             utf8_fold_flags = 0;
4067
4068           do_ref:
4069             type = OP(scan);
4070             n = ARG(scan);  /* which paren pair */
4071
4072           do_nref_ref_common:
4073             ln = PL_regoffs[n].start;
4074             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4075             if (*PL_reglastparen < n || ln == -1)
4076                 sayNO;                  /* Do not match unless seen CLOSEn. */
4077             if (ln == PL_regoffs[n].end)
4078                 break;
4079
4080             s = PL_bostr + ln;
4081             if (type != REF     /* REF can do byte comparison */
4082                 && (utf8_target || type == REFFU))
4083             { /* XXX handle REFFL better */
4084                 char * limit = PL_regeol;
4085
4086                 /* This call case insensitively compares the entire buffer
4087                     * at s, with the current input starting at locinput, but
4088                     * not going off the end given by PL_regeol, and returns in
4089                     * limit upon success, how much of the current input was
4090                     * matched */
4091                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4092                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4093                 {
4094                     sayNO;
4095                 }
4096                 locinput = limit;
4097                 nextchr = UCHARAT(locinput);
4098                 break;
4099             }
4100
4101             /* Not utf8:  Inline the first character, for speed. */
4102             if (UCHARAT(s) != nextchr &&
4103                 (type == REF ||
4104                  UCHARAT(s) != fold_array[nextchr]))
4105                 sayNO;
4106             ln = PL_regoffs[n].end - ln;
4107             if (locinput + ln > PL_regeol)
4108                 sayNO;
4109             if (ln > 1 && (type == REF
4110                            ? memNE(s, locinput, ln)
4111                            : ! folder(s, locinput, ln)))
4112                 sayNO;
4113             locinput += ln;
4114             nextchr = UCHARAT(locinput);
4115             break;
4116         }
4117         case NOTHING:
4118         case TAIL:
4119             break;
4120         case BACK:
4121             break;
4122
4123 #undef  ST
4124 #define ST st->u.eval
4125         {
4126             SV *ret;
4127             REGEXP *re_sv;
4128             regexp *re;
4129             regexp_internal *rei;
4130             regnode *startpoint;
4131
4132         case GOSTART:
4133         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4134             if (cur_eval && cur_eval->locinput==locinput) {
4135                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4136                     Perl_croak(aTHX_ "Infinite recursion in regex");
4137                 if ( ++nochange_depth > max_nochange_depth )
4138                     Perl_croak(aTHX_
4139                         "Pattern subroutine nesting without pos change"
4140                         " exceeded limit in regex");
4141             } else {
4142                 nochange_depth = 0;
4143             }
4144             re_sv = rex_sv;
4145             re = rex;
4146             rei = rexi;
4147             (void)ReREFCNT_inc(rex_sv);
4148             if (OP(scan)==GOSUB) {
4149                 startpoint = scan + ARG2L(scan);
4150                 ST.close_paren = ARG(scan);
4151             } else {
4152                 startpoint = rei->program+1;
4153                 ST.close_paren = 0;
4154             }
4155             goto eval_recurse_doit;
4156             /* NOTREACHED */
4157         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4158             if (cur_eval && cur_eval->locinput==locinput) {
4159                 if ( ++nochange_depth > max_nochange_depth )
4160                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4161             } else {
4162                 nochange_depth = 0;
4163             }
4164             {
4165                 /* execute the code in the {...} */
4166                 dSP;
4167                 SV ** const before = SP;
4168                 OP_4tree * const oop = PL_op;
4169                 COP * const ocurcop = PL_curcop;
4170                 PAD *old_comppad;
4171                 char *saved_regeol = PL_regeol;
4172                 struct re_save_state saved_state;
4173
4174                 /* To not corrupt the existing regex state while executing the
4175                  * eval we would normally put it on the save stack, like with
4176                  * save_re_context. However, re-evals have a weird scoping so we
4177                  * can't just add ENTER/LEAVE here. With that, things like
4178                  *
4179                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4180                  *
4181                  * would break, as they expect the localisation to be unwound
4182                  * only when the re-engine backtracks through the bit that
4183                  * localised it.
4184                  *
4185                  * What we do instead is just saving the state in a local c
4186                  * variable.
4187                  */
4188                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4189
4190                 n = ARG(scan);
4191                 PL_op = (OP_4tree*)rexi->data->data[n];
4192                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4193                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4194                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4195                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4196
4197                 if (sv_yes_mark) {
4198                     SV *sv_mrk = get_sv("REGMARK", 1);
4199                     sv_setsv(sv_mrk, sv_yes_mark);
4200                 }
4201
4202                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4203                 SPAGAIN;
4204                 if (SP == before)
4205                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4206                 else {
4207                     ret = POPs;
4208                     PUTBACK;
4209                 }
4210
4211                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4212
4213                 PL_op = oop;
4214                 PAD_RESTORE_LOCAL(old_comppad);
4215                 PL_curcop = ocurcop;
4216                 PL_regeol = saved_regeol;
4217                 if (!logical) {
4218                     /* /(?{...})/ */
4219                     sv_setsv(save_scalar(PL_replgv), ret);
4220                     break;
4221                 }
4222             }
4223             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4224                 logical = 0;
4225                 {
4226                     /* extract RE object from returned value; compiling if
4227                      * necessary */
4228                     MAGIC *mg = NULL;
4229                     REGEXP *rx = NULL;
4230
4231                     if (SvROK(ret)) {
4232                         SV *const sv = SvRV(ret);
4233
4234                         if (SvTYPE(sv) == SVt_REGEXP) {
4235                             rx = (REGEXP*) sv;
4236                         } else if (SvSMAGICAL(sv)) {
4237                             mg = mg_find(sv, PERL_MAGIC_qr);
4238                             assert(mg);
4239                         }
4240                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4241                         rx = (REGEXP*) ret;
4242                     } else if (SvSMAGICAL(ret)) {
4243                         if (SvGMAGICAL(ret)) {
4244                             /* I don't believe that there is ever qr magic
4245                                here.  */
4246                             assert(!mg_find(ret, PERL_MAGIC_qr));
4247                             sv_unmagic(ret, PERL_MAGIC_qr);
4248                         }
4249                         else {
4250                             mg = mg_find(ret, PERL_MAGIC_qr);
4251                             /* testing suggests mg only ends up non-NULL for
4252                                scalars who were upgraded and compiled in the
4253                                else block below. In turn, this is only
4254                                triggered in the "postponed utf8 string" tests
4255                                in t/op/pat.t  */
4256                         }
4257                     }
4258
4259                     if (mg) {
4260                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4261                         assert(rx);
4262                     }
4263                     if (rx) {
4264                         rx = reg_temp_copy(NULL, rx);
4265                     }
4266                     else {
4267                         U32 pm_flags = 0;
4268                         const I32 osize = PL_regsize;
4269
4270                         if (DO_UTF8(ret)) {
4271                             assert (SvUTF8(ret));
4272                         } else if (SvUTF8(ret)) {
4273                             /* Not doing UTF-8, despite what the SV says. Is
4274                                this only if we're trapped in use 'bytes'?  */
4275                             /* Make a copy of the octet sequence, but without
4276                                the flag on, as the compiler now honours the
4277                                SvUTF8 flag on ret.  */
4278                             STRLEN len;
4279                             const char *const p = SvPV(ret, len);
4280                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4281                         }
4282                         rx = CALLREGCOMP(ret, pm_flags);
4283                         if (!(SvFLAGS(ret)
4284                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4285                                  | SVs_GMG))) {
4286                             /* This isn't a first class regexp. Instead, it's
4287                                caching a regexp onto an existing, Perl visible
4288                                scalar.  */
4289                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4290                         }
4291                         PL_regsize = osize;
4292                     }
4293                     re_sv = rx;
4294                     re = (struct regexp *)SvANY(rx);
4295                 }
4296                 RXp_MATCH_COPIED_off(re);
4297                 re->subbeg = rex->subbeg;
4298                 re->sublen = rex->sublen;
4299                 rei = RXi_GET(re);
4300                 DEBUG_EXECUTE_r(
4301                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4302                         "Matching embedded");
4303                 );
4304                 startpoint = rei->program + 1;
4305                 ST.close_paren = 0; /* only used for GOSUB */
4306                 /* borrowed from regtry */
4307                 if (PL_reg_start_tmpl <= re->nparens) {
4308                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4309                     if(PL_reg_start_tmp)
4310                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4311                     else
4312                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4313                 }
4314
4315         eval_recurse_doit: /* Share code with GOSUB below this line */
4316                 /* run the pattern returned from (??{...}) */
4317                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4318                 REGCP_SET(ST.lastcp);
4319
4320                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4321
4322                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4323                 PL_reglastparen = &re->lastparen;
4324                 PL_reglastcloseparen = &re->lastcloseparen;
4325                 re->lastparen = 0;
4326                 re->lastcloseparen = 0;
4327
4328                 PL_reginput = locinput;
4329                 PL_regsize = 0;
4330
4331                 /* XXXX This is too dramatic a measure... */
4332                 PL_reg_maxiter = 0;
4333
4334                 ST.toggle_reg_flags = PL_reg_flags;
4335                 if (RX_UTF8(re_sv))
4336                     PL_reg_flags |= RF_utf8;
4337                 else
4338                     PL_reg_flags &= ~RF_utf8;
4339                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4340
4341                 ST.prev_rex = rex_sv;
4342                 ST.prev_curlyx = cur_curlyx;
4343                 SETREX(rex_sv,re_sv);
4344                 rex = re;
4345                 rexi = rei;
4346                 cur_curlyx = NULL;
4347                 ST.B = next;
4348                 ST.prev_eval = cur_eval;
4349                 cur_eval = st;
4350                 /* now continue from first node in postoned RE */
4351                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4352                 /* NOTREACHED */
4353             }
4354             /* logical is 1,   /(?(?{...})X|Y)/ */
4355             sw = cBOOL(SvTRUE(ret));
4356             logical = 0;
4357             break;
4358         }
4359
4360         case EVAL_AB: /* cleanup after a successful (??{A})B */
4361             /* note: this is called twice; first after popping B, then A */
4362             PL_reg_flags ^= ST.toggle_reg_flags;
4363             ReREFCNT_dec(rex_sv);
4364             SETREX(rex_sv,ST.prev_rex);
4365             rex = (struct regexp *)SvANY(rex_sv);
4366             rexi = RXi_GET(rex);
4367             regcpblow(ST.cp);
4368             cur_eval = ST.prev_eval;
4369             cur_curlyx = ST.prev_curlyx;
4370
4371             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4372             PL_reglastparen = &rex->lastparen;
4373             PL_reglastcloseparen = &rex->lastcloseparen;
4374             /* also update PL_regoffs */
4375             PL_regoffs = rex->offs;
4376
4377             /* XXXX This is too dramatic a measure... */
4378             PL_reg_maxiter = 0;
4379             if ( nochange_depth )
4380                 nochange_depth--;
4381             sayYES;
4382
4383
4384         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4385             /* note: this is called twice; first after popping B, then A */
4386             PL_reg_flags ^= ST.toggle_reg_flags;
4387             ReREFCNT_dec(rex_sv);
4388             SETREX(rex_sv,ST.prev_rex);
4389             rex = (struct regexp *)SvANY(rex_sv);
4390             rexi = RXi_GET(rex);
4391             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4392             PL_reglastparen = &rex->lastparen;
4393             PL_reglastcloseparen = &rex->lastcloseparen;
4394
4395             PL_reginput = locinput;
4396             REGCP_UNWIND(ST.lastcp);
4397             regcppop(rex);
4398             cur_eval = ST.prev_eval;
4399             cur_curlyx = ST.prev_curlyx;
4400             /* XXXX This is too dramatic a measure... */
4401             PL_reg_maxiter = 0;
4402             if ( nochange_depth )
4403                 nochange_depth--;
4404             sayNO_SILENT;
4405 #undef ST
4406
4407         case OPEN:
4408             n = ARG(scan);  /* which paren pair */
4409             PL_reg_start_tmp[n] = locinput;
4410             if (n > PL_regsize)
4411                 PL_regsize = n;
4412             lastopen = n;
4413             break;
4414         case CLOSE:
4415             n = ARG(scan);  /* which paren pair */
4416             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4417             PL_regoffs[n].end = locinput - PL_bostr;
4418             /*if (n > PL_regsize)
4419                 PL_regsize = n;*/
4420             if (n > *PL_reglastparen)
4421                 *PL_reglastparen = n;
4422             *PL_reglastcloseparen = n;
4423             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4424                 goto fake_end;
4425             }
4426             break;
4427         case ACCEPT:
4428             if (ARG(scan)){
4429                 regnode *cursor;
4430                 for (cursor=scan;
4431                      cursor && OP(cursor)!=END;
4432                      cursor=regnext(cursor))
4433                 {
4434                     if ( OP(cursor)==CLOSE ){
4435                         n = ARG(cursor);
4436                         if ( n <= lastopen ) {
4437                             PL_regoffs[n].start
4438                                 = PL_reg_start_tmp[n] - PL_bostr;
4439                             PL_regoffs[n].end = locinput - PL_bostr;
4440                             /*if (n > PL_regsize)
4441                             PL_regsize = n;*/
4442                             if (n > *PL_reglastparen)
4443                                 *PL_reglastparen = n;
4444                             *PL_reglastcloseparen = n;
4445                             if ( n == ARG(scan) || (cur_eval &&
4446                                 cur_eval->u.eval.close_paren == n))
4447                                 break;
4448                         }
4449                     }
4450                 }
4451             }
4452             goto fake_end;
4453             /*NOTREACHED*/
4454         case GROUPP:
4455             n = ARG(scan);  /* which paren pair */
4456             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4457             break;
4458         case NGROUPP:
4459             /* reg_check_named_buff_matched returns 0 for no match */
4460             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4461             break;
4462         case INSUBP:
4463             n = ARG(scan);
4464             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4465             break;
4466         case DEFINEP:
4467             sw = 0;
4468             break;
4469         case IFTHEN:
4470             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4471             if (sw)
4472                 next = NEXTOPER(NEXTOPER(scan));
4473             else {
4474                 next = scan + ARG(scan);
4475                 if (OP(next) == IFTHEN) /* Fake one. */
4476                     next = NEXTOPER(NEXTOPER(next));
4477             }
4478             break;
4479         case LOGICAL:
4480             logical = scan->flags;
4481             break;
4482
4483 /*******************************************************************
4484
4485 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4486 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4487 STAR/PLUS/CURLY/CURLYN are used instead.)
4488
4489 A*B is compiled as <CURLYX><A><WHILEM><B>
4490
4491 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4492 state, which contains the current count, initialised to -1. It also sets
4493 cur_curlyx to point to this state, with any previous value saved in the
4494 state block.
4495
4496 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4497 since the pattern may possibly match zero times (i.e. it's a while {} loop
4498 rather than a do {} while loop).
4499
4500 Each entry to WHILEM represents a successful match of A. The count in the
4501 CURLYX block is incremented, another WHILEM state is pushed, and execution
4502 passes to A or B depending on greediness and the current count.
4503
4504 For example, if matching against the string a1a2a3b (where the aN are
4505 substrings that match /A/), then the match progresses as follows: (the
4506 pushed states are interspersed with the bits of strings matched so far):
4507
4508     <CURLYX cnt=-1>
4509     <CURLYX cnt=0><WHILEM>
4510     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4511     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4512     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4513     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4514
4515 (Contrast this with something like CURLYM, which maintains only a single
4516 backtrack state:
4517
4518     <CURLYM cnt=0> a1
4519     a1 <CURLYM cnt=1> a2
4520     a1 a2 <CURLYM cnt=2> a3
4521     a1 a2 a3 <CURLYM cnt=3> b
4522 )
4523
4524 Each WHILEM state block marks a point to backtrack to upon partial failure
4525 of A or B, and also contains some minor state data related to that
4526 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4527 overall state, such as the count, and pointers to the A and B ops.
4528
4529 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4530 must always point to the *current* CURLYX block, the rules are:
4531
4532 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4533 and set cur_curlyx to point the new block.
4534
4535 When popping the CURLYX block after a successful or unsuccessful match,
4536 restore the previous cur_curlyx.
4537
4538 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4539 to the outer one saved in the CURLYX block.
4540
4541 When popping the WHILEM block after a successful or unsuccessful B match,
4542 restore the previous cur_curlyx.
4543
4544 Here's an example for the pattern (AI* BI)*BO
4545 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4546
4547 cur_
4548 curlyx backtrack stack
4549 ------ ---------------
4550 NULL
4551 CO     <CO prev=NULL> <WO>
4552 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4553 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4554 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4555
4556 At this point the pattern succeeds, and we work back down the stack to
4557 clean up, restoring as we go:
4558
4559 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4560 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4561 CO     <CO prev=NULL> <WO>
4562 NULL
4563
4564 *******************************************************************/
4565
4566 #define ST st->u.curlyx
4567
4568         case CURLYX:    /* start of /A*B/  (for complex A) */
4569         {
4570             /* No need to save/restore up to this paren */
4571             I32 parenfloor = scan->flags;
4572
4573             assert(next); /* keep Coverity happy */
4574             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4575                 next += ARG(next);
4576
4577             /* XXXX Probably it is better to teach regpush to support
4578                parenfloor > PL_regsize... */
4579             if (parenfloor > (I32)*PL_reglastparen)
4580                 parenfloor = *PL_reglastparen; /* Pessimization... */
4581
4582             ST.prev_curlyx= cur_curlyx;
4583             cur_curlyx = st;
4584             ST.cp = PL_savestack_ix;
4585
4586             /* these fields contain the state of the current curly.
4587              * they are accessed by subsequent WHILEMs */
4588             ST.parenfloor = parenfloor;
4589             ST.me = scan;
4590             ST.B = next;
4591             ST.minmod = minmod;
4592             minmod = 0;
4593             ST.count = -1;      /* this will be updated by WHILEM */
4594             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4595
4596             PL_reginput = locinput;
4597             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4598             /* NOTREACHED */
4599         }
4600
4601         case CURLYX_end: /* just finished matching all of A*B */
4602             cur_curlyx = ST.prev_curlyx;
4603             sayYES;
4604             /* NOTREACHED */
4605
4606         case CURLYX_end_fail: /* just failed to match all of A*B */
4607             regcpblow(ST.cp);
4608             cur_curlyx = ST.prev_curlyx;
4609             sayNO;
4610             /* NOTREACHED */
4611
4612
4613 #undef ST
4614 #define ST st->u.whilem
4615
4616         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4617         {
4618             /* see the discussion above about CURLYX/WHILEM */
4619             I32 n;
4620             int min = ARG1(cur_curlyx->u.curlyx.me);
4621             int max = ARG2(cur_curlyx->u.curlyx.me);
4622             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4623
4624             assert(cur_curlyx); /* keep Coverity happy */
4625             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4626             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4627             ST.cache_offset = 0;
4628             ST.cache_mask = 0;
4629
4630             PL_reginput = locinput;
4631
4632             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4633                   "%*s  whilem: matched %ld out of %d..%d\n",
4634                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4635             );
4636
4637             /* First just match a string of min A's. */
4638
4639             if (n < min) {
4640                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4641                 cur_curlyx->u.curlyx.lastloc = locinput;
4642                 REGCP_SET(ST.lastcp);
4643
4644                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4645                 /* NOTREACHED */
4646             }
4647
4648             /* If degenerate A matches "", assume A done. */
4649
4650             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4651                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4652                    "%*s  whilem: empty match detected, trying continuation...\n",
4653                    REPORT_CODE_OFF+depth*2, "")
4654                 );
4655                 goto do_whilem_B_max;
4656             }
4657
4658             /* super-linear cache processing */
4659
4660             if (scan->flags) {
4661
4662                 if (!PL_reg_maxiter) {
4663                     /* start the countdown: Postpone detection until we
4664                      * know the match is not *that* much linear. */
4665                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4666                     /* possible overflow for long strings and many CURLYX's */
4667                     if (PL_reg_maxiter < 0)
4668                         PL_reg_maxiter = I32_MAX;
4669                     PL_reg_leftiter = PL_reg_maxiter;
4670                 }
4671
4672                 if (PL_reg_leftiter-- == 0) {
4673                     /* initialise cache */
4674                     const I32 size = (PL_reg_maxiter + 7)/8;
4675                     if (PL_reg_poscache) {
4676                         if ((I32)PL_reg_poscache_size < size) {
4677                             Renew(PL_reg_poscache, size, char);
4678                             PL_reg_poscache_size = size;
4679                         }
4680                         Zero(PL_reg_poscache, size, char);
4681                     }
4682                     else {
4683                         PL_reg_poscache_size = size;
4684                         Newxz(PL_reg_poscache, size, char);
4685                     }
4686                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4687       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4688                               PL_colors[4], PL_colors[5])
4689                     );
4690                 }
4691
4692                 if (PL_reg_leftiter < 0) {
4693                     /* have we already failed at this position? */
4694                     I32 offset, mask;
4695                     offset  = (scan->flags & 0xf) - 1
4696                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4697                     mask    = 1 << (offset % 8);
4698                     offset /= 8;
4699                     if (PL_reg_poscache[offset] & mask) {
4700                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4701                             "%*s  whilem: (cache) already tried at this position...\n",
4702                             REPORT_CODE_OFF+depth*2, "")
4703                         );
4704                         sayNO; /* cache records failure */
4705                     }
4706                     ST.cache_offset = offset;
4707                     ST.cache_mask   = mask;
4708                 }
4709             }
4710
4711             /* Prefer B over A for minimal matching. */
4712
4713             if (cur_curlyx->u.curlyx.minmod) {
4714                 ST.save_curlyx = cur_curlyx;
4715                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4716                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4717                 REGCP_SET(ST.lastcp);
4718                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4719                 /* NOTREACHED */
4720             }
4721
4722             /* Prefer A over B for maximal matching. */
4723
4724             if (n < max) { /* More greed allowed? */
4725                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4726                 cur_curlyx->u.curlyx.lastloc = locinput;
4727                 REGCP_SET(ST.lastcp);
4728                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4729                 /* NOTREACHED */
4730             }
4731             goto do_whilem_B_max;
4732         }
4733         /* NOTREACHED */
4734
4735         case WHILEM_B_min: /* just matched B in a minimal match */
4736         case WHILEM_B_max: /* just matched B in a maximal match */
4737             cur_curlyx = ST.save_curlyx;
4738             sayYES;
4739             /* NOTREACHED */
4740
4741         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4742             cur_curlyx = ST.save_curlyx;
4743             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4744             cur_curlyx->u.curlyx.count--;
4745             CACHEsayNO;
4746             /* NOTREACHED */
4747
4748         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4749             /* FALL THROUGH */
4750         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4751             REGCP_UNWIND(ST.lastcp);
4752             regcppop(rex);
4753             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4754             cur_curlyx->u.curlyx.count--;
4755             CACHEsayNO;
4756             /* NOTREACHED */
4757
4758         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4759             REGCP_UNWIND(ST.lastcp);
4760             regcppop(rex);      /* Restore some previous $<digit>s? */
4761             PL_reginput = locinput;
4762             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4763                 "%*s  whilem: failed, trying continuation...\n",
4764                 REPORT_CODE_OFF+depth*2, "")
4765             );
4766           do_whilem_B_max:
4767             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4768                 && ckWARN(WARN_REGEXP)
4769                 && !(PL_reg_flags & RF_warned))
4770             {
4771                 PL_reg_flags |= RF_warned;
4772                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4773                      "Complex regular subexpression recursion",
4774                      REG_INFTY - 1);
4775             }
4776
4777             /* now try B */
4778             ST.save_curlyx = cur_curlyx;
4779             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4780             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4781             /* NOTREACHED */
4782
4783         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4784             cur_curlyx = ST.save_curlyx;
4785             REGCP_UNWIND(ST.lastcp);
4786             regcppop(rex);
4787
4788             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4789                 /* Maximum greed exceeded */
4790                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4791                     && ckWARN(WARN_REGEXP)
4792                     && !(PL_reg_flags & RF_warned))
4793                 {
4794                     PL_reg_flags |= RF_warned;
4795                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4796                         "%s limit (%d) exceeded",
4797                         "Complex regular subexpression recursion",
4798                         REG_INFTY - 1);
4799                 }
4800                 cur_curlyx->u.curlyx.count--;
4801                 CACHEsayNO;
4802             }
4803
4804             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4805                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4806             );
4807             /* Try grabbing another A and see if it helps. */
4808             PL_reginput = locinput;
4809             cur_curlyx->u.curlyx.lastloc = locinput;
4810             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4811             REGCP_SET(ST.lastcp);
4812             PUSH_STATE_GOTO(WHILEM_A_min,
4813                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4814             /* NOTREACHED */
4815
4816 #undef  ST
4817 #define ST st->u.branch
4818
4819         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4820             next = scan + ARG(scan);
4821             if (next == scan)
4822                 next = NULL;
4823             scan = NEXTOPER(scan);
4824             /* FALL THROUGH */
4825
4826         case BRANCH:        /*  /(...|A|...)/ */
4827             scan = NEXTOPER(scan); /* scan now points to inner node */
4828             ST.lastparen = *PL_reglastparen;
4829             ST.next_branch = next;
4830             REGCP_SET(ST.cp);
4831             PL_reginput = locinput;
4832
4833             /* Now go into the branch */
4834             if (has_cutgroup) {
4835                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4836             } else {
4837                 PUSH_STATE_GOTO(BRANCH_next, scan);
4838             }
4839             /* NOTREACHED */
4840         case CUTGROUP:
4841             PL_reginput = locinput;
4842             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4843                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4844             PUSH_STATE_GOTO(CUTGROUP_next,next);
4845             /* NOTREACHED */
4846         case CUTGROUP_next_fail:
4847             do_cutgroup = 1;
4848             no_final = 1;
4849             if (st->u.mark.mark_name)
4850                 sv_commit = st->u.mark.mark_name;
4851             sayNO;
4852             /* NOTREACHED */
4853         case BRANCH_next:
4854             sayYES;
4855             /* NOTREACHED */
4856         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4857             if (do_cutgroup) {
4858                 do_cutgroup = 0;
4859                 no_final = 0;
4860             }
4861             REGCP_UNWIND(ST.cp);
4862             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4863                 PL_regoffs[n].end = -1;
4864             *PL_reglastparen = n;
4865             /*dmq: *PL_reglastcloseparen = n; */
4866             scan = ST.next_branch;
4867             /* no more branches? */
4868             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4869                 DEBUG_EXECUTE_r({
4870                     PerlIO_printf( Perl_debug_log,
4871                         "%*s  %sBRANCH failed...%s\n",
4872                         REPORT_CODE_OFF+depth*2, "",
4873                         PL_colors[4],
4874                         PL_colors[5] );
4875                 });
4876                 sayNO_SILENT;
4877             }
4878             continue; /* execute next BRANCH[J] op */
4879             /* NOTREACHED */
4880
4881         case MINMOD:
4882             minmod = 1;
4883             break;
4884
4885 #undef  ST
4886 #define ST st->u.curlym
4887
4888         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4889
4890             /* This is an optimisation of CURLYX that enables us to push
4891              * only a single backtracking state, no matter how many matches
4892              * there are in {m,n}. It relies on the pattern being constant
4893              * length, with no parens to influence future backrefs
4894              */
4895
4896             ST.me = scan;
4897             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4898
4899             /* if paren positive, emulate an OPEN/CLOSE around A */
4900             if (ST.me->flags) {
4901                 U32 paren = ST.me->flags;
4902                 if (paren > PL_regsize)
4903                     PL_regsize = paren;
4904                 if (paren > *PL_reglastparen)
4905                     *PL_reglastparen = paren;
4906                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4907             }
4908             ST.A = scan;
4909             ST.B = next;
4910             ST.alen = 0;
4911             ST.count = 0;
4912             ST.minmod = minmod;
4913             minmod = 0;
4914             ST.c1 = CHRTEST_UNINIT;
4915             REGCP_SET(ST.cp);
4916
4917             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4918                 goto curlym_do_B;
4919
4920           curlym_do_A: /* execute the A in /A{m,n}B/  */
4921             PL_reginput = locinput;
4922             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4923             /* NOTREACHED */
4924
4925         case CURLYM_A: /* we've just matched an A */
4926             locinput = st->locinput;
4927             nextchr = UCHARAT(locinput);
4928
4929             ST.count++;
4930             /* after first match, determine A's length: u.curlym.alen */
4931             if (ST.count == 1) {
4932                 if (PL_reg_match_utf8) {
4933                     char *s = locinput;
4934                     while (s < PL_reginput) {
4935                         ST.alen++;
4936                         s += UTF8SKIP(s);
4937                     }
4938                 }
4939                 else {
4940                     ST.alen = PL_reginput - locinput;
4941                 }
4942                 if (ST.alen == 0)
4943                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4944             }
4945             DEBUG_EXECUTE_r(
4946                 PerlIO_printf(Perl_debug_log,
4947                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4948                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4949                           (IV) ST.count, (IV)ST.alen)
4950             );
4951
4952             locinput = PL_reginput;
4953
4954             if (cur_eval && cur_eval->u.eval.close_paren &&
4955                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4956                 goto fake_end;
4957
4958             {
4959                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4960                 if ( max == REG_INFTY || ST.count < max )
4961                     goto curlym_do_A; /* try to match another A */
4962             }
4963             goto curlym_do_B; /* try to match B */
4964
4965         case CURLYM_A_fail: /* just failed to match an A */
4966             REGCP_UNWIND(ST.cp);
4967
4968             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4969                 || (cur_eval && cur_eval->u.eval.close_paren &&
4970                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4971                 sayNO;
4972
4973           curlym_do_B: /* execute the B in /A{m,n}B/  */
4974             PL_reginput = locinput;
4975             if (ST.c1 == CHRTEST_UNINIT) {
4976                 /* calculate c1 and c2 for possible match of 1st char
4977                  * following curly */
4978                 ST.c1 = ST.c2 = CHRTEST_VOID;
4979                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4980                     regnode *text_node = ST.B;
4981                     if (! HAS_TEXT(text_node))
4982                         FIND_NEXT_IMPT(text_node);
4983                     /* this used to be
4984
4985                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4986
4987                         But the former is redundant in light of the latter.
4988
4989                         if this changes back then the macro for
4990                         IS_TEXT and friends need to change.
4991                      */
4992                     if (PL_regkind[OP(text_node)] == EXACT)
4993                     {
4994
4995                         ST.c1 = (U8)*STRING(text_node);
4996                         switch (OP(text_node)) {
4997                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
4998                             case EXACTFA:
4999                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5000                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5001                             default: ST.c2 = ST.c1;
5002                         }
5003                     }
5004                 }
5005             }
5006
5007             DEBUG_EXECUTE_r(
5008                 PerlIO_printf(Perl_debug_log,
5009                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5010                     (int)(REPORT_CODE_OFF+(depth*2)),
5011                     "", (IV)ST.count)
5012                 );
5013             if (ST.c1 != CHRTEST_VOID
5014                     && UCHARAT(PL_reginput) != ST.c1
5015                     && UCHARAT(PL_reginput) != ST.c2)
5016             {
5017                 /* simulate B failing */
5018                 DEBUG_OPTIMISE_r(
5019                     PerlIO_printf(Perl_debug_log,
5020                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5021                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5022                         (IV)ST.c1,(IV)ST.c2
5023                 ));
5024                 state_num = CURLYM_B_fail;
5025                 goto reenter_switch;
5026             }
5027
5028             if (ST.me->flags) {
5029                 /* mark current A as captured */
5030                 I32 paren = ST.me->flags;
5031                 if (ST.count) {
5032                     PL_regoffs[paren].start
5033                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5034                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5035                     /*dmq: *PL_reglastcloseparen = paren; */
5036                 }
5037                 else
5038                     PL_regoffs[paren].end = -1;
5039                 if (cur_eval && cur_eval->u.eval.close_paren &&
5040                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5041                 {
5042                     if (ST.count)
5043                         goto fake_end;
5044                     else
5045                         sayNO;
5046                 }
5047             }
5048
5049             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5050             /* NOTREACHED */
5051
5052         case CURLYM_B_fail: /* just failed to match a B */
5053             REGCP_UNWIND(ST.cp);
5054             if (ST.minmod) {
5055                 I32 max = ARG2(ST.me);
5056                 if (max != REG_INFTY && ST.count == max)
5057                     sayNO;
5058                 goto curlym_do_A; /* try to match a further A */
5059             }
5060             /* backtrack one A */
5061             if (ST.count == ARG1(ST.me) /* min */)
5062                 sayNO;
5063             ST.count--;
5064             locinput = HOPc(locinput, -ST.alen);
5065             goto curlym_do_B; /* try to match B */
5066
5067 #undef ST
5068 #define ST st->u.curly
5069
5070 #define CURLY_SETPAREN(paren, success) \
5071     if (paren) { \
5072         if (success) { \
5073             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5074             PL_regoffs[paren].end = locinput - PL_bostr; \
5075             *PL_reglastcloseparen = paren; \
5076         } \
5077         else \
5078             PL_regoffs[paren].end = -1; \
5079     }
5080
5081         case STAR:              /*  /A*B/ where A is width 1 */
5082             ST.paren = 0;
5083             ST.min = 0;
5084             ST.max = REG_INFTY;
5085             scan = NEXTOPER(scan);
5086             goto repeat;
5087         case PLUS:              /*  /A+B/ where A is width 1 */
5088             ST.paren = 0;
5089             ST.min = 1;
5090             ST.max = REG_INFTY;
5091             scan = NEXTOPER(scan);
5092             goto repeat;
5093         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5094             ST.paren = scan->flags;     /* Which paren to set */
5095             if (ST.paren > PL_regsize)
5096                 PL_regsize = ST.paren;
5097             if (ST.paren > *PL_reglastparen)
5098                 *PL_reglastparen = ST.paren;
5099             ST.min = ARG1(scan);  /* min to match */
5100             ST.max = ARG2(scan);  /* max to match */
5101             if (cur_eval && cur_eval->u.eval.close_paren &&
5102                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5103                 ST.min=1;
5104                 ST.max=1;
5105             }
5106             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5107             goto repeat;
5108         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5109             ST.paren = 0;
5110             ST.min = ARG1(scan);  /* min to match */
5111             ST.max = ARG2(scan);  /* max to match */
5112             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5113           repeat:
5114             /*
5115             * Lookahead to avoid useless match attempts
5116             * when we know what character comes next.
5117             *
5118             * Used to only do .*x and .*?x, but now it allows
5119             * for )'s, ('s and (?{ ... })'s to be in the way
5120             * of the quantifier and the EXACT-like node.  -- japhy
5121             */
5122
5123             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5124                 sayNO;
5125             if (HAS_TEXT(next) || JUMPABLE(next)) {
5126                 U8 *s;
5127                 regnode *text_node = next;
5128
5129                 if (! HAS_TEXT(text_node))
5130                     FIND_NEXT_IMPT(text_node);
5131
5132                 if (! HAS_TEXT(text_node))
5133                     ST.c1 = ST.c2 = CHRTEST_VOID;
5134                 else {
5135                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5136                         ST.c1 = ST.c2 = CHRTEST_VOID;
5137                         goto assume_ok_easy;
5138                     }
5139                     else
5140                         s = (U8*)STRING(text_node);
5141
5142                     /*  Currently we only get here when
5143
5144                         PL_rekind[OP(text_node)] == EXACT
5145
5146                         if this changes back then the macro for IS_TEXT and
5147                         friends need to change. */
5148                     if (!UTF_PATTERN) {
5149                         ST.c1 = *s;
5150                         switch (OP(text_node)) {
5151                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5152                             case EXACTFA:
5153                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5154                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5155                             default: ST.c2 = ST.c1; break;
5156                         }
5157                     }
5158                     else { /* UTF_PATTERN */
5159                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5160                              STRLEN ulen1, ulen2;
5161                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5162                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5163
5164                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5165                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5166 #ifdef EBCDIC
5167                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5168                                                     ckWARN(WARN_UTF8) ?
5169                                                     0 : UTF8_ALLOW_ANY);
5170                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5171                                                     ckWARN(WARN_UTF8) ?
5172                                                     0 : UTF8_ALLOW_ANY);
5173 #else
5174                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5175                                                     uniflags);
5176                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5177                                                     uniflags);
5178 #endif
5179                         }
5180                         else {
5181                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5182                                                      uniflags);
5183                         }
5184                     }
5185                 }
5186             }
5187             else
5188                 ST.c1 = ST.c2 = CHRTEST_VOID;
5189         assume_ok_easy:
5190
5191             ST.A = scan;
5192             ST.B = next;
5193             PL_reginput = locinput;
5194             if (minmod) {
5195                 minmod = 0;
5196                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5197                     sayNO;
5198                 ST.count = ST.min;
5199                 locinput = PL_reginput;
5200                 REGCP_SET(ST.cp);
5201                 if (ST.c1 == CHRTEST_VOID)
5202                     goto curly_try_B_min;
5203
5204                 ST.oldloc = locinput;
5205
5206                 /* set ST.maxpos to the furthest point along the
5207                  * string that could possibly match */
5208                 if  (ST.max == REG_INFTY) {
5209                     ST.maxpos = PL_regeol - 1;
5210                     if (utf8_target)
5211                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5212                             ST.maxpos--;
5213                 }
5214                 else if (utf8_target) {
5215                     int m = ST.max - ST.min;
5216                     for (ST.maxpos = locinput;
5217                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5218                         ST.maxpos += UTF8SKIP(ST.maxpos);
5219                 }
5220                 else {
5221                     ST.maxpos = locinput + ST.max - ST.min;
5222                     if (ST.maxpos >= PL_regeol)
5223                         ST.maxpos = PL_regeol - 1;
5224                 }
5225                 goto curly_try_B_min_known;
5226
5227             }
5228             else {
5229                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5230                 locinput = PL_reginput;
5231                 if (ST.count < ST.min)
5232                     sayNO;
5233                 if ((ST.count > ST.min)
5234                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5235                 {
5236                     /* A{m,n} must come at the end of the string, there's
5237                      * no point in backing off ... */
5238                     ST.min = ST.count;
5239                     /* ...except that $ and \Z can match before *and* after
5240                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5241                        We may back off by one in this case. */
5242                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5243                         ST.min--;
5244                 }
5245                 REGCP_SET(ST.cp);
5246                 goto curly_try_B_max;
5247             }
5248             /* NOTREACHED */
5249
5250
5251         case CURLY_B_min_known_fail:
5252             /* failed to find B in a non-greedy match where c1,c2 valid */
5253             if (ST.paren && ST.count)
5254                 PL_regoffs[ST.paren].end = -1;
5255
5256             PL_reginput = locinput;     /* Could be reset... */
5257             REGCP_UNWIND(ST.cp);
5258             /* Couldn't or didn't -- move forward. */
5259             ST.oldloc = locinput;
5260             if (utf8_target)
5261                 locinput += UTF8SKIP(locinput);
5262             else
5263                 locinput++;
5264             ST.count++;
5265           curly_try_B_min_known:
5266              /* find the next place where 'B' could work, then call B */
5267             {
5268                 int n;
5269                 if (utf8_target) {
5270                     n = (ST.oldloc == locinput) ? 0 : 1;
5271                     if (ST.c1 == ST.c2) {
5272                         STRLEN len;
5273                         /* set n to utf8_distance(oldloc, locinput) */
5274                         while (locinput <= ST.maxpos &&
5275                                utf8n_to_uvchr((U8*)locinput,
5276                                               UTF8_MAXBYTES, &len,
5277                                               uniflags) != (UV)ST.c1) {
5278                             locinput += len;
5279                             n++;
5280                         }
5281                     }
5282                     else {
5283                         /* set n to utf8_distance(oldloc, locinput) */
5284                         while (locinput <= ST.maxpos) {
5285                             STRLEN len;
5286                             const UV c = utf8n_to_uvchr((U8*)locinput,
5287                                                   UTF8_MAXBYTES, &len,
5288                                                   uniflags);
5289                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5290                                 break;
5291                             locinput += len;
5292                             n++;
5293                         }
5294                     }
5295                 }
5296                 else {
5297                     if (ST.c1 == ST.c2) {
5298                         while (locinput <= ST.maxpos &&
5299                                UCHARAT(locinput) != ST.c1)
5300                             locinput++;
5301                     }
5302                     else {
5303                         while (locinput <= ST.maxpos
5304                                && UCHARAT(locinput) != ST.c1
5305                                && UCHARAT(locinput) != ST.c2)
5306                             locinput++;
5307                     }
5308                     n = locinput - ST.oldloc;
5309                 }
5310                 if (locinput > ST.maxpos)
5311                     sayNO;
5312                 /* PL_reginput == oldloc now */
5313                 if (n) {
5314                     ST.count += n;
5315                     if (regrepeat(rex, ST.A, n, depth) < n)
5316                         sayNO;
5317                 }
5318                 PL_reginput = locinput;
5319                 CURLY_SETPAREN(ST.paren, ST.count);
5320                 if (cur_eval && cur_eval->u.eval.close_paren &&
5321                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5322                     goto fake_end;
5323                 }
5324                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5325             }
5326             /* NOTREACHED */
5327
5328
5329         case CURLY_B_min_fail:
5330             /* failed to find B in a non-greedy match where c1,c2 invalid */
5331             if (ST.paren && ST.count)
5332                 PL_regoffs[ST.paren].end = -1;
5333
5334             REGCP_UNWIND(ST.cp);
5335             /* failed -- move forward one */
5336             PL_reginput = locinput;
5337             if (regrepeat(rex, ST.A, 1, depth)) {
5338                 ST.count++;
5339                 locinput = PL_reginput;
5340                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5341                         ST.count > 0)) /* count overflow ? */
5342                 {
5343                   curly_try_B_min:
5344                     CURLY_SETPAREN(ST.paren, ST.count);
5345                     if (cur_eval && cur_eval->u.eval.close_paren &&
5346                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5347                         goto fake_end;
5348                     }
5349                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5350                 }
5351             }
5352             sayNO;
5353             /* NOTREACHED */
5354
5355
5356         curly_try_B_max:
5357             /* a successful greedy match: now try to match B */
5358             if (cur_eval && cur_eval->u.eval.close_paren &&
5359                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5360                 goto fake_end;
5361             }
5362             {
5363                 UV c = 0;
5364                 if (ST.c1 != CHRTEST_VOID)
5365                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5366                                            UTF8_MAXBYTES, 0, uniflags)
5367                                 : (UV) UCHARAT(PL_reginput);
5368                 /* If it could work, try it. */
5369                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5370                     CURLY_SETPAREN(ST.paren, ST.count);
5371                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5372                     /* NOTREACHED */
5373                 }
5374             }
5375             /* FALL THROUGH */
5376         case CURLY_B_max_fail:
5377             /* failed to find B in a greedy match */
5378             if (ST.paren && ST.count)
5379                 PL_regoffs[ST.paren].end = -1;
5380
5381             REGCP_UNWIND(ST.cp);
5382             /*  back up. */
5383             if (--ST.count < ST.min)
5384                 sayNO;
5385             PL_reginput = locinput = HOPc(locinput, -1);
5386             goto curly_try_B_max;
5387
5388 #undef ST
5389
5390         case END:
5391             fake_end:
5392             if (cur_eval) {
5393                 /* we've just finished A in /(??{A})B/; now continue with B */
5394                 I32 tmpix;
5395                 st->u.eval.toggle_reg_flags
5396                             = cur_eval->u.eval.toggle_reg_flags;
5397                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5398
5399                 st->u.eval.prev_rex = rex_sv;           /* inner */
5400                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5401                 rex = (struct regexp *)SvANY(rex_sv);
5402                 rexi = RXi_GET(rex);
5403                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5404                 (void)ReREFCNT_inc(rex_sv);
5405                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5406
5407                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5408                 PL_reglastparen = &rex->lastparen;
5409                 PL_reglastcloseparen = &rex->lastcloseparen;
5410
5411                 REGCP_SET(st->u.eval.lastcp);
5412                 PL_reginput = locinput;
5413
5414                 /* Restore parens of the outer rex without popping the
5415                  * savestack */
5416                 tmpix = PL_savestack_ix;
5417                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5418                 regcppop(rex);
5419                 PL_savestack_ix = tmpix;
5420
5421                 st->u.eval.prev_eval = cur_eval;
5422                 cur_eval = cur_eval->u.eval.prev_eval;
5423                 DEBUG_EXECUTE_r(
5424                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5425                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5426                 if ( nochange_depth )
5427                     nochange_depth--;
5428
5429                 PUSH_YES_STATE_GOTO(EVAL_AB,
5430                         st->u.eval.prev_eval->u.eval.B); /* match B */
5431             }
5432
5433             if (locinput < reginfo->till) {
5434                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5435                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5436                                       PL_colors[4],
5437                                       (long)(locinput - PL_reg_starttry),
5438                                       (long)(reginfo->till - PL_reg_starttry),
5439                                       PL_colors[5]));
5440
5441                 sayNO_SILENT;           /* Cannot match: too short. */
5442             }
5443             PL_reginput = locinput;     /* put where regtry can find it */
5444             sayYES;                     /* Success! */
5445
5446         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5447             DEBUG_EXECUTE_r(
5448             PerlIO_printf(Perl_debug_log,
5449                 "%*s  %ssubpattern success...%s\n",
5450                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5451             PL_reginput = locinput;     /* put where regtry can find it */
5452             sayYES;                     /* Success! */
5453
5454 #undef  ST
5455 #define ST st->u.ifmatch
5456
5457         case SUSPEND:   /* (?>A) */
5458             ST.wanted = 1;
5459             PL_reginput = locinput;
5460             goto do_ifmatch;
5461
5462         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5463             ST.wanted = 0;
5464             goto ifmatch_trivial_fail_test;
5465
5466         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5467             ST.wanted = 1;
5468           ifmatch_trivial_fail_test:
5469             if (scan->flags) {
5470                 char * const s = HOPBACKc(locinput, scan->flags);
5471                 if (!s) {
5472                     /* trivial fail */
5473                     if (logical) {
5474                         logical = 0;
5475                         sw = 1 - cBOOL(ST.wanted);
5476                     }
5477                     else if (ST.wanted)
5478                         sayNO;
5479                     next = scan + ARG(scan);
5480                     if (next == scan)
5481                         next = NULL;
5482                     break;
5483                 }
5484                 PL_reginput = s;
5485             }
5486             else
5487                 PL_reginput = locinput;
5488
5489           do_ifmatch:
5490             ST.me = scan;
5491             ST.logical = logical;
5492             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5493
5494             /* execute body of (?...A) */
5495             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5496             /* NOTREACHED */
5497
5498         case IFMATCH_A_fail: /* body of (?...A) failed */
5499             ST.wanted = !ST.wanted;
5500             /* FALL THROUGH */
5501
5502         case IFMATCH_A: /* body of (?...A) succeeded */
5503             if (ST.logical) {
5504                 sw = cBOOL(ST.wanted);
5505             }
5506             else if (!ST.wanted)
5507                 sayNO;
5508
5509             if (OP(ST.me) == SUSPEND)
5510                 locinput = PL_reginput;
5511             else {
5512                 locinput = PL_reginput = st->locinput;
5513                 nextchr = UCHARAT(locinput);
5514             }
5515             scan = ST.me + ARG(ST.me);
5516             if (scan == ST.me)
5517                 scan = NULL;
5518             continue; /* execute B */
5519
5520 #undef ST
5521
5522         case LONGJMP:
5523             next = scan + ARG(scan);
5524             if (next == scan)
5525                 next = NULL;
5526             break;
5527         case COMMIT:
5528             reginfo->cutpoint = PL_regeol;
5529             /* FALLTHROUGH */
5530         case PRUNE:
5531             PL_reginput = locinput;
5532             if (!scan->flags)
5533                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5534             PUSH_STATE_GOTO(COMMIT_next,next);
5535             /* NOTREACHED */
5536         case COMMIT_next_fail:
5537             no_final = 1;
5538             /* FALLTHROUGH */
5539         case OPFAIL:
5540             sayNO;
5541             /* NOTREACHED */
5542
5543 #define ST st->u.mark
5544         case MARKPOINT:
5545             ST.prev_mark = mark_state;
5546             ST.mark_name = sv_commit = sv_yes_mark
5547                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5548             mark_state = st;
5549             ST.mark_loc = PL_reginput = locinput;
5550             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5551             /* NOTREACHED */
5552         case MARKPOINT_next:
5553             mark_state = ST.prev_mark;
5554             sayYES;
5555             /* NOTREACHED */
5556         case MARKPOINT_next_fail:
5557             if (popmark && sv_eq(ST.mark_name,popmark))
5558             {
5559                 if (ST.mark_loc > startpoint)
5560                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5561                 popmark = NULL; /* we found our mark */
5562                 sv_commit = ST.mark_name;
5563
5564                 DEBUG_EXECUTE_r({
5565                         PerlIO_printf(Perl_debug_log,
5566                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5567                             REPORT_CODE_OFF+depth*2, "",
5568                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5569                 });
5570             }
5571             mark_state = ST.prev_mark;
5572             sv_yes_mark = mark_state ?
5573                 mark_state->u.mark.mark_name : NULL;
5574             sayNO;
5575             /* NOTREACHED */
5576         case SKIP:
5577             PL_reginput = locinput;
5578             if (scan->flags) {
5579                 /* (*SKIP) : if we fail we cut here*/
5580                 ST.mark_name = NULL;
5581                 ST.mark_loc = locinput;
5582                 PUSH_STATE_GOTO(SKIP_next,next);
5583             } else {
5584                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5585                    otherwise do nothing.  Meaning we need to scan
5586                  */
5587                 regmatch_state *cur = mark_state;
5588                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5589
5590                 while (cur) {
5591                     if ( sv_eq( cur->u.mark.mark_name,
5592                                 find ) )
5593                     {
5594                         ST.mark_name = find;
5595                         PUSH_STATE_GOTO( SKIP_next, next );
5596                     }
5597                     cur = cur->u.mark.prev_mark;
5598                 }
5599             }
5600             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5601             break;
5602         case SKIP_next_fail:
5603             if (ST.mark_name) {
5604                 /* (*CUT:NAME) - Set up to search for the name as we
5605                    collapse the stack*/
5606                 popmark = ST.mark_name;
5607             } else {
5608                 /* (*CUT) - No name, we cut here.*/
5609                 if (ST.mark_loc > startpoint)
5610                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5611                 /* but we set sv_commit to latest mark_name if there
5612                    is one so they can test to see how things lead to this
5613                    cut */
5614                 if (mark_state)
5615                     sv_commit=mark_state->u.mark.mark_name;
5616             }
5617             no_final = 1;
5618             sayNO;
5619             /* NOTREACHED */
5620 #undef ST
5621         case FOLDCHAR:
5622             n = ARG(scan);
5623             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5624                 locinput += ln;
5625             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5626                 sayNO;
5627             } else  {
5628                 U8 folded[UTF8_MAXBYTES_CASE+1];
5629                 STRLEN foldlen;
5630                 const char * const l = locinput;
5631                 char *e = PL_regeol;
5632                 to_uni_fold(n, folded, &foldlen);
5633
5634                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5635                                l, &e, 0,  utf8_target)) {
5636                         sayNO;
5637                 }
5638                 locinput = e;
5639             }
5640             nextchr = UCHARAT(locinput);
5641             break;
5642         case LNBREAK:
5643             if ((n=is_LNBREAK(locinput,utf8_target))) {
5644                 locinput += n;
5645                 nextchr = UCHARAT(locinput);
5646             } else
5647                 sayNO;
5648             break;
5649
5650 #define CASE_CLASS(nAmE)                              \
5651         case nAmE:                                    \
5652             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5653                 locinput += n;                        \
5654                 nextchr = UCHARAT(locinput);          \
5655             } else                                    \
5656                 sayNO;                                \
5657             break;                                    \
5658         case N##nAmE:                                 \
5659             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5660                 sayNO;                                \
5661             } else {                                  \
5662                 locinput += UTF8SKIP(locinput);       \
5663                 nextchr = UCHARAT(locinput);          \
5664             }                                         \
5665             break
5666
5667         CASE_CLASS(VERTWS);
5668         CASE_CLASS(HORIZWS);
5669 #undef CASE_CLASS
5670
5671         default:
5672             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5673                           PTR2UV(scan), OP(scan));
5674             Perl_croak(aTHX_ "regexp memory corruption");
5675
5676         } /* end switch */
5677
5678         /* switch break jumps here */
5679         scan = next; /* prepare to execute the next op and ... */
5680         continue;    /* ... jump back to the top, reusing st */
5681         /* NOTREACHED */
5682
5683       push_yes_state:
5684         /* push a state that backtracks on success */
5685         st->u.yes.prev_yes_state = yes_state;
5686         yes_state = st;
5687         /* FALL THROUGH */
5688       push_state:
5689         /* push a new regex state, then continue at scan  */
5690         {
5691             regmatch_state *newst;
5692
5693             DEBUG_STACK_r({
5694                 regmatch_state *cur = st;
5695                 regmatch_state *curyes = yes_state;
5696                 int curd = depth;
5697                 regmatch_slab *slab = PL_regmatch_slab;
5698                 for (;curd > -1;cur--,curd--) {
5699                     if (cur < SLAB_FIRST(slab)) {
5700                         slab = slab->prev;
5701                         cur = SLAB_LAST(slab);
5702                     }
5703                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5704                         REPORT_CODE_OFF + 2 + depth * 2,"",
5705                         curd, PL_reg_name[cur->resume_state],
5706                         (curyes == cur) ? "yes" : ""
5707                     );
5708                     if (curyes == cur)
5709                         curyes = cur->u.yes.prev_yes_state;
5710                 }
5711             } else
5712                 DEBUG_STATE_pp("push")
5713             );
5714             depth++;
5715             st->locinput = locinput;
5716             newst = st+1;
5717             if (newst >  SLAB_LAST(PL_regmatch_slab))
5718                 newst = S_push_slab(aTHX);
5719             PL_regmatch_state = newst;
5720
5721             locinput = PL_reginput;
5722             nextchr = UCHARAT(locinput);
5723             st = newst;
5724             continue;
5725             /* NOTREACHED */
5726         }
5727     }
5728
5729     /*
5730     * We get here only if there's trouble -- normally "case END" is
5731     * the terminating point.
5732     */
5733     Perl_croak(aTHX_ "corrupted regexp pointers");
5734     /*NOTREACHED*/
5735     sayNO;
5736
5737 yes:
5738     if (yes_state) {
5739         /* we have successfully completed a subexpression, but we must now
5740          * pop to the state marked by yes_state and continue from there */
5741         assert(st != yes_state);
5742 #ifdef DEBUGGING
5743         while (st != yes_state) {
5744             st--;
5745             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5746                 PL_regmatch_slab = PL_regmatch_slab->prev;
5747                 st = SLAB_LAST(PL_regmatch_slab);
5748             }
5749             DEBUG_STATE_r({
5750                 if (no_final) {
5751                     DEBUG_STATE_pp("pop (no final)");
5752                 } else {
5753                     DEBUG_STATE_pp("pop (yes)");
5754                 }
5755             });
5756             depth--;
5757         }
5758 #else
5759         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5760             || yes_state > SLAB_LAST(PL_regmatch_slab))
5761         {
5762             /* not in this slab, pop slab */
5763             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5764             PL_regmatch_slab = PL_regmatch_slab->prev;
5765             st = SLAB_LAST(PL_regmatch_slab);
5766         }
5767         depth -= (st - yes_state);
5768 #endif
5769         st = yes_state;
5770         yes_state = st->u.yes.prev_yes_state;
5771         PL_regmatch_state = st;
5772
5773         if (no_final) {
5774             locinput= st->locinput;
5775             nextchr = UCHARAT(locinput);
5776         }
5777         state_num = st->resume_state + no_final;
5778         goto reenter_switch;
5779     }
5780
5781     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5782                           PL_colors[4], PL_colors[5]));
5783
5784     if (PL_reg_eval_set) {
5785         /* each successfully executed (?{...}) block does the equivalent of
5786          *   local $^R = do {...}
5787          * When popping the save stack, all these locals would be undone;
5788          * bypass this by setting the outermost saved $^R to the latest
5789          * value */
5790         if (oreplsv != GvSV(PL_replgv))
5791             sv_setsv(oreplsv, GvSV(PL_replgv));
5792     }
5793     result = 1;
5794     goto final_exit;
5795
5796 no:
5797     DEBUG_EXECUTE_r(
5798         PerlIO_printf(Perl_debug_log,
5799             "%*s  %sfailed...%s\n",
5800             REPORT_CODE_OFF+depth*2, "",
5801             PL_colors[4], PL_colors[5])
5802         );
5803
5804 no_silent:
5805     if (no_final) {
5806         if (yes_state) {
5807             goto yes;
5808         } else {
5809             goto final_exit;
5810         }
5811     }
5812     if (depth) {
5813         /* there's a previous state to backtrack to */
5814         st--;
5815         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5816             PL_regmatch_slab = PL_regmatch_slab->prev;
5817             st = SLAB_LAST(PL_regmatch_slab);
5818         }
5819         PL_regmatch_state = st;
5820         locinput= st->locinput;
5821         nextchr = UCHARAT(locinput);
5822
5823         DEBUG_STATE_pp("pop");
5824         depth--;
5825         if (yes_state == st)
5826             yes_state = st->u.yes.prev_yes_state;
5827
5828         state_num = st->resume_state + 1; /* failure = success + 1 */
5829         goto reenter_switch;
5830     }
5831     result = 0;
5832
5833   final_exit:
5834     if (rex->intflags & PREGf_VERBARG_SEEN) {
5835         SV *sv_err = get_sv("REGERROR", 1);
5836         SV *sv_mrk = get_sv("REGMARK", 1);
5837         if (result) {
5838             sv_commit = &PL_sv_no;
5839             if (!sv_yes_mark)
5840                 sv_yes_mark = &PL_sv_yes;
5841         } else {
5842             if (!sv_commit)
5843                 sv_commit = &PL_sv_yes;
5844             sv_yes_mark = &PL_sv_no;
5845         }
5846         sv_setsv(sv_err, sv_commit);
5847         sv_setsv(sv_mrk, sv_yes_mark);
5848     }
5849
5850     /* clean up; in particular, free all slabs above current one */
5851     LEAVE_SCOPE(oldsave);
5852
5853     return result;
5854 }
5855
5856 /*
5857  - regrepeat - repeatedly match something simple, report how many
5858  */
5859 /*
5860  * [This routine now assumes that it will only match on things of length 1.
5861  * That was true before, but now we assume scan - reginput is the count,
5862  * rather than incrementing count on every character.  [Er, except utf8.]]
5863  */
5864 STATIC I32
5865 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5866 {
5867     dVAR;
5868     register char *scan;
5869     register I32 c;
5870     register char *loceol = PL_regeol;
5871     register I32 hardcount = 0;
5872     register bool utf8_target = PL_reg_match_utf8;
5873     UV utf8_flags;
5874 #ifndef DEBUGGING
5875     PERL_UNUSED_ARG(depth);
5876 #endif
5877
5878     PERL_ARGS_ASSERT_REGREPEAT;
5879
5880     scan = PL_reginput;
5881     if (max == REG_INFTY)
5882         max = I32_MAX;
5883     else if (max < loceol - scan)
5884         loceol = scan + max;
5885     switch (OP(p)) {
5886     case REG_ANY:
5887         if (utf8_target) {
5888             loceol = PL_regeol;
5889             while (scan < loceol && hardcount < max && *scan != '\n') {
5890                 scan += UTF8SKIP(scan);
5891                 hardcount++;
5892             }
5893         } else {
5894             while (scan < loceol && *scan != '\n')
5895                 scan++;
5896         }
5897         break;
5898     case SANY:
5899         if (utf8_target) {
5900             loceol = PL_regeol;
5901             while (scan < loceol && hardcount < max) {
5902                 scan += UTF8SKIP(scan);
5903                 hardcount++;
5904             }
5905         }
5906         else
5907             scan = loceol;
5908         break;
5909     case CANY:
5910         scan = loceol;
5911         break;
5912     case EXACT:
5913         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5914          * means they match only characters in the string that can be expressed
5915          * as a single byte.  For non-utf8 strings, that means a simple match.
5916          * For utf8 strings, the character matched must be an invariant, or
5917          * downgradable to a single byte.  The pattern's utf8ness is
5918          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5919          * it is, it's an invariant */
5920
5921         c = (U8)*STRING(p);
5922         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5923
5924         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5925             while (scan < loceol && UCHARAT(scan) == c) {
5926                 scan++;
5927             }
5928         }
5929         else {
5930
5931             /* Here, the string is utf8, and the pattern char is different
5932              * in utf8 than not, so can't compare them directly.  Outside the
5933              * loop, find find the two utf8 bytes that represent c, and then
5934              * look for those in sequence in the utf8 string */
5935             U8 high = UTF8_TWO_BYTE_HI(c);
5936             U8 low = UTF8_TWO_BYTE_LO(c);
5937             loceol = PL_regeol;
5938
5939             while (hardcount < max
5940                     && scan + 1 < loceol
5941                     && UCHARAT(scan) == high
5942                     && UCHARAT(scan + 1) == low)
5943             {
5944                 scan += 2;
5945                 hardcount++;
5946             }
5947         }
5948         break;
5949     case EXACTFA:
5950         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5951         goto do_exactf;
5952
5953     case EXACTFL:
5954         PL_reg_flags |= RF_tainted;
5955         utf8_flags = FOLDEQ_UTF8_LOCALE;
5956         goto do_exactf;
5957
5958     case EXACTF:
5959     case EXACTFU:
5960         utf8_flags = 0;
5961
5962         /* The comments for the EXACT case above apply as well to these fold
5963          * ones */
5964
5965     do_exactf:
5966         c = (U8)*STRING(p);
5967         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5968
5969         if (utf8_target) { /* Use full Unicode fold matching */
5970             char *tmpeol = loceol;
5971             while (hardcount < max
5972                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5973                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5974             {
5975                 scan = tmpeol;
5976                 tmpeol = loceol;
5977                 hardcount++;
5978             }
5979
5980             /* XXX Note that the above handles properly the German sharp s in
5981              * the pattern matching ss in the string.  But it doesn't handle
5982              * properly cases where the string contains say 'LIGATURE ff' and
5983              * the pattern is 'f+'.  This would require, say, a new function or
5984              * revised interface to foldEQ_utf8(), in which the maximum number
5985              * of characters to match could be passed and it would return how
5986              * many actually did.  This is just one of many cases where
5987              * multi-char folds don't work properly, and so the fix is being
5988              * deferred */
5989         }
5990         else {
5991             U8 folded;
5992
5993             /* Here, the string isn't utf8 and c is a single byte; and either
5994              * the pattern isn't utf8 or c is an invariant, so its utf8ness
5995              * doesn't affect c.  Can just do simple comparisons for exact or
5996              * fold matching. */
5997             switch (OP(p)) {
5998                 case EXACTF: folded = PL_fold[c]; break;
5999                 case EXACTFA:
6000                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6001                 case EXACTFL: folded = PL_fold_locale[c]; break;
6002                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6003             }
6004             while (scan < loceol &&
6005                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6006             {
6007                 scan++;
6008             }
6009         }
6010         break;
6011     case ANYOFV:
6012     case ANYOF:
6013         if (utf8_target || OP(p) == ANYOFV) {
6014             STRLEN inclasslen;
6015             loceol = PL_regeol;
6016             inclasslen = loceol - scan;
6017             while (hardcount < max
6018                    && ((inclasslen = loceol - scan) > 0)
6019                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6020             {
6021                 scan += inclasslen;
6022                 hardcount++;
6023             }
6024         } else {
6025             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6026                 scan++;
6027         }
6028         break;
6029     case ALNUMU:
6030         if (utf8_target) {
6031     utf8_wordchar:
6032             loceol = PL_regeol;
6033             LOAD_UTF8_CHARCLASS_ALNUM();
6034             while (hardcount < max && scan < loceol &&
6035                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6036             {
6037                 scan += UTF8SKIP(scan);
6038                 hardcount++;
6039             }
6040         } else {
6041             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6042                 scan++;
6043             }
6044         }
6045         break;
6046     case ALNUM:
6047         if (utf8_target)
6048             goto utf8_wordchar;
6049         while (scan < loceol && isALNUM((U8) *scan)) {
6050             scan++;
6051         }
6052         break;
6053     case ALNUMA:
6054         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6055             scan++;
6056         }
6057         break;
6058     case ALNUML:
6059         PL_reg_flags |= RF_tainted;
6060         if (utf8_target) {
6061             loceol = PL_regeol;
6062             while (hardcount < max && scan < loceol &&
6063                    isALNUM_LC_utf8((U8*)scan)) {
6064                 scan += UTF8SKIP(scan);
6065                 hardcount++;
6066             }
6067         } else {
6068             while (scan < loceol && isALNUM_LC(*scan))
6069                 scan++;
6070         }
6071         break;
6072     case NALNUMU:
6073         if (utf8_target) {
6074
6075     utf8_Nwordchar:
6076
6077             loceol = PL_regeol;
6078             LOAD_UTF8_CHARCLASS_ALNUM();
6079             while (hardcount < max && scan < loceol &&
6080                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6081             {
6082                 scan += UTF8SKIP(scan);
6083                 hardcount++;
6084             }
6085         } else {
6086             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6087                 scan++;
6088             }
6089         }
6090         break;
6091     case NALNUM:
6092         if (utf8_target)
6093             goto utf8_Nwordchar;
6094         while (scan < loceol && ! isALNUM((U8) *scan)) {
6095             scan++;
6096         }
6097         break;
6098     case NALNUMA:
6099         if (utf8_target) {
6100             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6101                 scan += UTF8SKIP(scan);
6102             }
6103         }
6104         else {
6105             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6106                 scan++;
6107             }
6108         }
6109         break;
6110     case NALNUML:
6111         PL_reg_flags |= RF_tainted;
6112         if (utf8_target) {
6113             loceol = PL_regeol;
6114             while (hardcount < max && scan < loceol &&
6115                    !isALNUM_LC_utf8((U8*)scan)) {
6116                 scan += UTF8SKIP(scan);
6117                 hardcount++;
6118             }
6119         } else {
6120             while (scan < loceol && !isALNUM_LC(*scan))
6121                 scan++;
6122         }
6123         break;
6124     case SPACEU:
6125         if (utf8_target) {
6126
6127     utf8_space:
6128
6129             loceol = PL_regeol;
6130             LOAD_UTF8_CHARCLASS_SPACE();
6131             while (hardcount < max && scan < loceol &&
6132                    (*scan == ' ' ||
6133                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6134             {
6135                 scan += UTF8SKIP(scan);
6136                 hardcount++;
6137             }
6138             break;
6139         }
6140         else {
6141             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6142                 scan++;
6143             }
6144             break;
6145         }
6146     case SPACE:
6147         if (utf8_target)
6148             goto utf8_space;
6149
6150         while (scan < loceol && isSPACE((U8) *scan)) {
6151             scan++;
6152         }
6153         break;
6154     case SPACEA:
6155         while (scan < loceol && isSPACE_A((U8) *scan)) {
6156             scan++;
6157         }
6158         break;
6159     case SPACEL:
6160         PL_reg_flags |= RF_tainted;
6161         if (utf8_target) {
6162             loceol = PL_regeol;
6163             while (hardcount < max && scan < loceol &&
6164                    isSPACE_LC_utf8((U8*)scan)) {
6165                 scan += UTF8SKIP(scan);
6166                 hardcount++;
6167             }
6168         } else {
6169             while (scan < loceol && isSPACE_LC(*scan))
6170                 scan++;
6171         }
6172         break;
6173     case NSPACEU:
6174         if (utf8_target) {
6175
6176     utf8_Nspace:
6177
6178             loceol = PL_regeol;
6179             LOAD_UTF8_CHARCLASS_SPACE();
6180             while (hardcount < max && scan < loceol &&
6181                    ! (*scan == ' ' ||
6182                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6183             {
6184                 scan += UTF8SKIP(scan);
6185                 hardcount++;
6186             }
6187             break;
6188         }
6189         else {
6190             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6191                 scan++;
6192             }
6193         }
6194         break;
6195     case NSPACE:
6196         if (utf8_target)
6197             goto utf8_Nspace;
6198
6199         while (scan < loceol && ! isSPACE((U8) *scan)) {
6200             scan++;
6201         }
6202         break;
6203     case NSPACEA:
6204         if (utf8_target) {
6205             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6206                 scan += UTF8SKIP(scan);
6207             }
6208         }
6209         else {
6210             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6211                 scan++;
6212             }
6213         }
6214         break;
6215     case NSPACEL:
6216         PL_reg_flags |= RF_tainted;
6217         if (utf8_target) {
6218             loceol = PL_regeol;
6219             while (hardcount < max && scan < loceol &&
6220                    !isSPACE_LC_utf8((U8*)scan)) {
6221                 scan += UTF8SKIP(scan);
6222                 hardcount++;
6223             }
6224         } else {
6225             while (scan < loceol && !isSPACE_LC(*scan))
6226                 scan++;
6227         }
6228         break;
6229     case DIGIT:
6230         if (utf8_target) {
6231             loceol = PL_regeol;
6232             LOAD_UTF8_CHARCLASS_DIGIT();
6233             while (hardcount < max && scan < loceol &&
6234                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6235                 scan += UTF8SKIP(scan);
6236                 hardcount++;
6237             }
6238         } else {
6239             while (scan < loceol && isDIGIT(*scan))
6240                 scan++;
6241         }
6242         break;
6243     case DIGITA:
6244         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6245             scan++;
6246         }
6247         break;
6248     case DIGITL:
6249         PL_reg_flags |= RF_tainted;
6250         if (utf8_target) {
6251             loceol = PL_regeol;
6252             while (hardcount < max && scan < loceol &&
6253                    isDIGIT_LC_utf8((U8*)scan)) {
6254                 scan += UTF8SKIP(scan);
6255                 hardcount++;
6256             }
6257         } else {
6258             while (scan < loceol && isDIGIT_LC(*scan))
6259                 scan++;
6260         }
6261         break;
6262     case NDIGIT:
6263         if (utf8_target) {
6264             loceol = PL_regeol;
6265             LOAD_UTF8_CHARCLASS_DIGIT();
6266             while (hardcount < max && scan < loceol &&
6267                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6268                 scan += UTF8SKIP(scan);
6269                 hardcount++;
6270             }
6271         } else {
6272             while (scan < loceol && !isDIGIT(*scan))
6273                 scan++;
6274         }
6275         break;
6276     case NDIGITA:
6277         if (utf8_target) {
6278             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6279                 scan += UTF8SKIP(scan);
6280             }
6281         }
6282         else {
6283             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6284                 scan++;
6285             }
6286         }
6287         break;
6288     case NDIGITL:
6289         PL_reg_flags |= RF_tainted;
6290         if (utf8_target) {
6291             loceol = PL_regeol;
6292             while (hardcount < max && scan < loceol &&
6293                    !isDIGIT_LC_utf8((U8*)scan)) {
6294                 scan += UTF8SKIP(scan);
6295                 hardcount++;
6296             }
6297         } else {
6298             while (scan < loceol && !isDIGIT_LC(*scan))
6299                 scan++;
6300         }
6301         break;
6302     case LNBREAK:
6303         if (utf8_target) {
6304             loceol = PL_regeol;
6305             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6306                 scan += c;
6307                 hardcount++;
6308             }
6309         } else {
6310             /*
6311               LNBREAK can match two latin chars, which is ok,
6312               because we have a null terminated string, but we
6313               have to use hardcount in this situation
6314             */
6315             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6316                 scan+=c;
6317                 hardcount++;
6318             }
6319         }
6320         break;
6321     case HORIZWS:
6322         if (utf8_target) {
6323             loceol = PL_regeol;
6324             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6325                 scan += c;
6326                 hardcount++;
6327             }
6328         } else {
6329             while (scan < loceol && is_HORIZWS_latin1(scan))
6330                 scan++;
6331         }
6332         break;
6333     case NHORIZWS:
6334         if (utf8_target) {
6335             loceol = PL_regeol;
6336             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6337                 scan += UTF8SKIP(scan);
6338                 hardcount++;
6339             }
6340         } else {
6341             while (scan < loceol && !is_HORIZWS_latin1(scan))
6342                 scan++;
6343
6344         }
6345         break;
6346     case VERTWS:
6347         if (utf8_target) {
6348             loceol = PL_regeol;
6349             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6350                 scan += c;
6351                 hardcount++;
6352             }
6353         } else {
6354             while (scan < loceol && is_VERTWS_latin1(scan))
6355                 scan++;
6356
6357         }
6358         break;
6359     case NVERTWS:
6360         if (utf8_target) {
6361             loceol = PL_regeol;
6362             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6363                 scan += UTF8SKIP(scan);
6364                 hardcount++;
6365             }
6366         } else {
6367             while (scan < loceol && !is_VERTWS_latin1(scan))
6368                 scan++;
6369
6370         }
6371         break;
6372
6373     default:            /* Called on something of 0 width. */
6374         break;          /* So match right here or not at all. */
6375     }
6376
6377     if (hardcount)
6378         c = hardcount;
6379     else
6380         c = scan - PL_reginput;
6381     PL_reginput = scan;
6382
6383     DEBUG_r({
6384         GET_RE_DEBUG_FLAGS_DECL;
6385         DEBUG_EXECUTE_r({
6386             SV * const prop = sv_newmortal();
6387             regprop(prog, prop, p);
6388             PerlIO_printf(Perl_debug_log,
6389                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6390                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6391         });
6392     });
6393
6394     return(c);
6395 }
6396
6397
6398 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6399 /*
6400 - regclass_swash - prepare the utf8 swash
6401 */
6402
6403 SV *
6404 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6405 {
6406     dVAR;
6407     SV *sw  = NULL;
6408     SV *si  = NULL;
6409     SV *alt = NULL;
6410     RXi_GET_DECL(prog,progi);
6411     const struct reg_data * const data = prog ? progi->data : NULL;
6412
6413     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6414
6415     assert(ANYOF_NONBITMAP(node));
6416
6417     if (data && data->count) {
6418         const U32 n = ARG(node);
6419
6420         if (data->what[n] == 's') {
6421             SV * const rv = MUTABLE_SV(data->data[n]);
6422             AV * const av = MUTABLE_AV(SvRV(rv));
6423             SV **const ary = AvARRAY(av);
6424             SV **a, **b;
6425
6426             /* See the end of regcomp.c:S_regclass() for
6427              * documentation of these array elements. */
6428
6429             si = *ary;
6430             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6431             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6432
6433             if (a)
6434                 sw = *a;
6435             else if (si && doinit) {
6436                 sw = swash_init("utf8", "", si, 1, 0);
6437                 (void)av_store(av, 1, sw);
6438             }
6439             if (b)
6440                 alt = *b;
6441         }
6442     }
6443
6444     if (listsvp)
6445         *listsvp = si;
6446     if (altsvp)
6447         *altsvp  = alt;
6448
6449     return sw;
6450 }
6451 #endif
6452
6453 /*
6454  - reginclass - determine if a character falls into a character class
6455
6456   n is the ANYOF regnode
6457   p is the target string
6458   lenp is pointer to the maximum number of bytes of how far to go in p
6459     (This is assumed wthout checking to always be at least the current
6460     character's size)
6461   utf8_target tells whether p is in UTF-8.
6462
6463   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6464   from a successful match, the value it points to will be updated to how many
6465   bytes in p were matched.  If there was no match, the value is undefined,
6466   possibly changed from the input.
6467
6468   Note that this can be a synthetic start class, a combination of various
6469   nodes, so things you think might be mutually exclusive, such as locale,
6470   aren't.  It can match both locale and non-locale
6471
6472  */
6473
6474 STATIC bool
6475 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6476 {
6477     dVAR;
6478     const char flags = ANYOF_FLAGS(n);
6479     bool match = FALSE;
6480     UV c = *p;
6481     STRLEN c_len = 0;
6482     STRLEN maxlen;
6483
6484     PERL_ARGS_ASSERT_REGINCLASS;
6485
6486     /* If c is not already the code point, get it */
6487     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6488         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6489                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6490                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6491                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6492                  * UTF8_ALLOW_FFFF */
6493         if (c_len == (STRLEN)-1)
6494             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6495     }
6496     else {
6497         c_len = 1;
6498     }
6499
6500     /* Use passed in max length, or one character if none passed in or less
6501      * than one character.  And assume will match just one character.  This is
6502      * overwritten later if matched more. */
6503     if (lenp) {
6504         maxlen = (*lenp > c_len) ? *lenp : c_len;
6505         *lenp = c_len;
6506
6507     }
6508     else {
6509         maxlen = c_len;
6510     }
6511
6512     /* If this character is potentially in the bitmap, check it */
6513     if (c < 256) {
6514         if (ANYOF_BITMAP_TEST(n, c))
6515             match = TRUE;
6516         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6517                 && ! utf8_target
6518                 && ! isASCII(c))
6519         {
6520             match = TRUE;
6521         }
6522
6523         else if (flags & ANYOF_LOCALE) {
6524             PL_reg_flags |= RF_tainted;
6525
6526             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6527                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6528             {
6529                 match = TRUE;
6530             }
6531             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6532                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6533                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6534                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6535                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6536                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6537                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6538                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6539                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6540                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6541                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6542                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6543                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6544                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6545                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6546                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6547                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6548                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6549                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6550                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6562                      ) /* How's that for a conditional? */
6563             ) {
6564                 match = TRUE;
6565             }
6566         }
6567     }
6568
6569     /* If the bitmap didn't (or couldn't) match, and something outside the
6570      * bitmap could match, try that.  Locale nodes specifiy completely the
6571      * behavior of code points in the bit map (otherwise, a utf8 target would
6572      * cause them to be treated as Unicode and not locale), except in
6573      * the very unlikely event when this node is a synthetic start class, which
6574      * could be a combination of locale and non-locale nodes.  So allow locale
6575      * to match for the synthetic start class, which will give a false
6576      * positive that will be resolved when the match is done again as not part
6577      * of the synthetic start class */
6578     if (!match) {
6579         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6580             match = TRUE;       /* Everything above 255 matches */
6581         }
6582         else if (ANYOF_NONBITMAP(n)
6583                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6584                      || (utf8_target
6585                          && (c >=256
6586                              || (! (flags & ANYOF_LOCALE))
6587                              || (flags & ANYOF_IS_SYNTHETIC)))))
6588         {
6589             AV *av;
6590             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6591
6592             if (sw) {
6593                 U8 * utf8_p;
6594                 if (utf8_target) {
6595                     utf8_p = (U8 *) p;
6596                 } else {
6597
6598                     /* Not utf8.  Convert as much of the string as available up
6599                      * to the limit of how far the (single) character in the
6600                      * pattern can possibly match (no need to go further).  If
6601                      * the node is a straight ANYOF or not folding, it can't
6602                      * match more than one.  Otherwise, It can match up to how
6603                      * far a single char can fold to.  Since not utf8, each
6604                      * character is a single byte, so the max it can be in
6605                      * bytes is the same as the max it can be in characters */
6606                     STRLEN len = (OP(n) == ANYOF
6607                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6608                                   ? 1
6609                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6610                                     ? maxlen
6611                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6612                     utf8_p = bytes_to_utf8(p, &len);
6613                 }
6614
6615                 if (swash_fetch(sw, utf8_p, TRUE))
6616                     match = TRUE;
6617                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6618
6619                     /* Here, we need to test if the fold of the target string
6620                      * matches.  The non-multi char folds have all been moved to
6621                      * the compilation phase, and the multi-char folds have
6622                      * been stored by regcomp into 'av'; we linearly check to
6623                      * see if any match the target string (folded).   We know
6624                      * that the originals were each one character, but we don't
6625                      * currently know how many characters/bytes each folded to,
6626                      * except we do know that there are small limits imposed by
6627                      * Unicode.  XXX A performance enhancement would be to have
6628                      * regcomp.c store the max number of chars/bytes that are
6629                      * in an av entry, as, say the 0th element.  Even better
6630                      * would be to have a hash of the few characters that can
6631                      * start a multi-char fold to the max number of chars of
6632                      * those folds.
6633                      *
6634                      * If there is a match, we will need to advance (if lenp is
6635                      * specified) the match pointer in the target string.  But
6636                      * what we are comparing here isn't that string directly,
6637                      * but its fold, whose length may differ from the original.
6638                      * As we go along in constructing the fold, therefore, we
6639                      * create a map so that we know how many bytes in the
6640                      * source to advance given that we have matched a certain
6641                      * number of bytes in the fold.  This map is stored in
6642                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6643                      * the fold of the first character that we are folding.
6644                      * Then map_fold_len_back[n] is set to the number of bytes
6645                      * in that first character.  Similarly let m be the
6646                      * corresponding number for the second character to be
6647                      * folded.  Then map_fold_len_back[n+m] is set to the
6648                      * number of bytes occupied by the first two source
6649                      * characters. ... */
6650                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6651                     U8 folded[UTF8_MAXBYTES_CASE+1];
6652                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6653                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6654                                                   chars */
6655
6656                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6657
6658                         /* Here, only need to fold the first char of the target
6659                          * string.  It the source wasn't utf8, is 1 byte long */
6660                         to_utf8_fold(utf8_p, folded, &foldlen);
6661                         total_foldlen = foldlen;
6662                         map_fold_len_back[foldlen] = (utf8_target)
6663                                                      ? UTF8SKIP(utf8_p)
6664                                                      : 1;
6665                     }
6666                     else {
6667
6668                         /* Here, need to fold more than the first char.  Do so
6669                          * up to the limits */
6670                         U8* source_ptr = utf8_p;    /* The source for the fold
6671                                                        is the regex target
6672                                                        string */
6673                         U8* folded_ptr = folded;
6674                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6675                                                        available byte in the
6676                                                        target string */
6677                         U8 i;
6678                         for (i = 0;
6679                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6680                              i++)
6681                         {
6682
6683                             /* Fold the next character */
6684                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6685                             STRLEN this_char_foldlen;
6686                             to_utf8_fold(source_ptr,
6687                                          this_char_folded,
6688                                          &this_char_foldlen);
6689
6690                             /* Bail if it would exceed the byte limit for
6691                              * folding a single char. */
6692                             if (this_char_foldlen + folded_ptr - folded >
6693                                                             UTF8_MAXBYTES_CASE)
6694                             {
6695                                 break;
6696                             }
6697
6698                             /* Add the fold of this character */
6699                             Copy(this_char_folded,
6700                                  folded_ptr,
6701                                  this_char_foldlen,
6702                                  U8);
6703                             source_ptr += UTF8SKIP(source_ptr);
6704                             folded_ptr += this_char_foldlen;
6705                             total_foldlen = folded_ptr - folded;
6706
6707                             /* Create map from the number of bytes in the fold
6708                              * back to the number of bytes in the source.  If
6709                              * the source isn't utf8, the byte count is just
6710                              * the number of characters so far */
6711                             map_fold_len_back[total_foldlen]
6712                                                       = (utf8_target)
6713                                                         ? source_ptr - utf8_p
6714                                                         : i + 1;
6715                         }
6716                         *folded_ptr = '\0';
6717                     }
6718
6719
6720                     /* Do the linear search to see if the fold is in the list
6721                      * of multi-char folds. */
6722                     if (av) {
6723                         I32 i;
6724                         for (i = 0; i <= av_len(av); i++) {
6725                             SV* const sv = *av_fetch(av, i, FALSE);
6726                             STRLEN len;
6727                             const char * const s = SvPV_const(sv, len);
6728
6729                             if (len <= total_foldlen && memEQ(s,
6730                                                                (char*)folded,
6731                                                                len))
6732                             {
6733
6734                                 /* Advance the target string ptr to account for
6735                                  * this fold, but have to translate from the
6736                                  * folded length to the corresponding source
6737                                  * length. */
6738                                 if (lenp) {
6739                                     *lenp = map_fold_len_back[len];
6740                                     assert(*lenp != 0); /* Otherwise will loop */
6741                                 }
6742                                 match = TRUE;
6743                                 break;
6744                             }
6745                         }
6746                     }
6747                 }
6748
6749                 /* If we allocated a string above, free it */
6750                 if (! utf8_target) Safefree(utf8_p);
6751             }
6752         }
6753     }
6754
6755     return (flags & ANYOF_INVERT) ? !match : match;
6756 }
6757
6758 STATIC U8 *
6759 S_reghop3(U8 *s, I32 off, const U8* lim)
6760 {
6761     dVAR;
6762
6763     PERL_ARGS_ASSERT_REGHOP3;
6764
6765     if (off >= 0) {
6766         while (off-- && s < lim) {
6767             /* XXX could check well-formedness here */
6768             s += UTF8SKIP(s);
6769         }
6770     }
6771     else {
6772         while (off++ && s > lim) {
6773             s--;
6774             if (UTF8_IS_CONTINUED(*s)) {
6775                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6776                     s--;
6777             }
6778             /* XXX could check well-formedness here */
6779         }
6780     }
6781     return s;
6782 }
6783
6784 #ifdef XXX_dmq
6785 /* there are a bunch of places where we use two reghop3's that should
6786    be replaced with this routine. but since thats not done yet
6787    we ifdef it out - dmq
6788 */
6789 STATIC U8 *
6790 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6791 {
6792     dVAR;
6793
6794     PERL_ARGS_ASSERT_REGHOP4;
6795
6796     if (off >= 0) {
6797         while (off-- && s < rlim) {
6798             /* XXX could check well-formedness here */
6799             s += UTF8SKIP(s);
6800         }
6801     }
6802     else {
6803         while (off++ && s > llim) {
6804             s--;
6805             if (UTF8_IS_CONTINUED(*s)) {
6806                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6807                     s--;
6808             }
6809             /* XXX could check well-formedness here */
6810         }
6811     }
6812     return s;
6813 }
6814 #endif
6815
6816 STATIC U8 *
6817 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6818 {
6819     dVAR;
6820
6821     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6822
6823     if (off >= 0) {
6824         while (off-- && s < lim) {
6825             /* XXX could check well-formedness here */
6826             s += UTF8SKIP(s);
6827         }
6828         if (off >= 0)
6829             return NULL;
6830     }
6831     else {
6832         while (off++ && s > lim) {
6833             s--;
6834             if (UTF8_IS_CONTINUED(*s)) {
6835                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6836                     s--;
6837             }
6838             /* XXX could check well-formedness here */
6839         }
6840         if (off <= 0)
6841             return NULL;
6842     }
6843     return s;
6844 }
6845
6846 static void
6847 restore_pos(pTHX_ void *arg)
6848 {
6849     dVAR;
6850     regexp * const rex = (regexp *)arg;
6851     if (PL_reg_eval_set) {
6852         if (PL_reg_oldsaved) {
6853             rex->subbeg = PL_reg_oldsaved;
6854             rex->sublen = PL_reg_oldsavedlen;
6855 #ifdef PERL_OLD_COPY_ON_WRITE
6856             rex->saved_copy = PL_nrs;
6857 #endif
6858             RXp_MATCH_COPIED_on(rex);
6859         }
6860         PL_reg_magic->mg_len = PL_reg_oldpos;
6861         PL_reg_eval_set = 0;
6862         PL_curpm = PL_reg_oldcurpm;
6863     }
6864 }
6865
6866 STATIC void
6867 S_to_utf8_substr(pTHX_ register regexp *prog)
6868 {
6869     int i = 1;
6870
6871     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6872
6873     do {
6874         if (prog->substrs->data[i].substr
6875             && !prog->substrs->data[i].utf8_substr) {
6876             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6877             prog->substrs->data[i].utf8_substr = sv;
6878             sv_utf8_upgrade(sv);
6879             if (SvVALID(prog->substrs->data[i].substr)) {
6880                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6881                 if (flags & FBMcf_TAIL) {
6882                     /* Trim the trailing \n that fbm_compile added last
6883                        time.  */
6884                     SvCUR_set(sv, SvCUR(sv) - 1);
6885                     /* Whilst this makes the SV technically "invalid" (as its
6886                        buffer is no longer followed by "\0") when fbm_compile()
6887                        adds the "\n" back, a "\0" is restored.  */
6888                 }
6889                 fbm_compile(sv, flags);
6890             }
6891             if (prog->substrs->data[i].substr == prog->check_substr)
6892                 prog->check_utf8 = sv;
6893         }
6894     } while (i--);
6895 }
6896
6897 STATIC void
6898 S_to_byte_substr(pTHX_ register regexp *prog)
6899 {
6900     dVAR;
6901     int i = 1;
6902
6903     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6904
6905     do {
6906         if (prog->substrs->data[i].utf8_substr
6907             && !prog->substrs->data[i].substr) {
6908             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6909             if (sv_utf8_downgrade(sv, TRUE)) {
6910                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6911                     const U8 flags
6912                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6913                     if (flags & FBMcf_TAIL) {
6914                         /* Trim the trailing \n that fbm_compile added last
6915                            time.  */
6916                         SvCUR_set(sv, SvCUR(sv) - 1);
6917                     }
6918                     fbm_compile(sv, flags);
6919                 }
6920             } else {
6921                 SvREFCNT_dec(sv);
6922                 sv = &PL_sv_undef;
6923             }
6924             prog->substrs->data[i].substr = sv;
6925             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6926                 prog->check_substr = sv;
6927         }
6928     } while (i--);
6929 }
6930
6931 /*
6932  * Local variables:
6933  * c-indentation-style: bsd
6934  * c-basic-offset: 4
6935  * indent-tabs-mode: t
6936  * End:
6937  *
6938  * ex: set ts=8 sts=4 sw=4 noet:
6939  */