regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which, funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 /*
  46  * pregcomp and pregexec -- regsub and regerror are not used in perl
  47  *
  48  *      Copyright (c) 1986 by University of Toronto.
  49  *      Written by Henry Spencer.  Not derived from licensed software.
  50  *
  51  *      Permission is granted to anyone to use this software for any
  52  *      purpose on any computer system, and to redistribute it freely,
  53  *      subject to the following restrictions:
  54  *
  55  *      1. The author is not responsible for the consequences of use of
  56  *              this software, no matter how awful, even if they arise
  57  *              from defects in it.
  58  *
  59  *      2. The origin of this software must not be misrepresented, either
  60  *              by explicit claim or by omission.
  61  *
  62  *      3. Altered versions must be plainly marked as such, and must not
  63  *              be misrepresented as being the original software.
  64  *
  65  ****    Alterations to Henry's code are...
  66  ****
  67  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  68  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  69  ****    by Larry Wall and others
  70  ****
  71  ****    You may distribute under the terms of either the GNU General Public
  72  ****    License or the Artistic License, as specified in the README file.
  73  *
  74  * Beware that some of this code is subtly aware of the way operator
  75  * precedence is structured in regular expressions.  Serious changes in
  76  * regular-expression syntax might require a total rethink.
  77  */
  78 #include "EXTERN.h"
  79 #define PERL_IN_REGEXEC_C
  80 #include "perl.h"
  81
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "inline_invlist.c"
  89 #include "unicode_constants.h"
  90
  91 #define RF_tainted      1       /* tainted information used? e.g. locale */
  92 #define RF_warned       2               /* warned about big count? */
  93
  94 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  95
     /* RF_utf8 and RF_tainted are bits in PL_reg_flags (both are OR-ed into it in
      * this file); RF_warned is presumably a bit in the same word -- TODO confirm.
      * UTF_PATTERN is true when the RF_utf8 bit is set. */
  96 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  97
  98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
 105  * call if there are no complications: i.e., if everything matchable is
 106  * straightforward in the bitmap */
     /* When ANYOF_FLAGS(p) is non-zero the full reginclass(prog,p,c,0,0) is used;
      * otherwise the byte *(c) is looked up directly with ANYOF_BITMAP_TEST. */
 107 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 108                                               : ANYOF_BITMAP_TEST(p,*(c)))
 109
 110 /*
 111  * Forwards.
 112  */
 113
     /* Length and distance helpers.  CHR_SVLEN uses the local 'utf8_target' and
      * CHR_DIST uses the global PL_reg_match_utf8 to choose between the UTF-8
      * aware routine and plain byte arithmetic. */
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : (a) - (b))
 116
     /* HOPc: move 'off' positions from 'pos' and yield a char*.  For a UTF-8 target
      * this goes through reghop3(), limited by PL_regeol when off >= 0 and by
      * PL_bostr otherwise; for a byte target it is plain pointer arithmetic with
      * no bounds check.
      * HOPBACKc: move 'off' positions backwards from 'pos'.  For a byte target the
      * result is NULL if it would lie before PL_bostr; for a UTF-8 target the
      * result is whatever reghopmaybe3() returns.
      * HOP3/HOP3c: like HOPc but with an explicit limit 'lim' for the UTF-8 case
      * (HOP3 yields U8*, HOP3c yields char*).
      * Every parameter and every replacement list is parenthesized so compound
      * arguments (e.g. "a - b" passed as 'off') and surrounding operators bind as
      * expected.  'pos' and 'off' may still be evaluated more than once, so the
      * arguments must not have side effects. */
 117 #define HOPc(pos,off) \
 118         ((char *)(PL_reg_match_utf8 \
 119             ? reghop3((U8*)(pos), off, (U8*)((off) >= 0 ? PL_regeol : PL_bostr)) \
 120             : (U8*)((pos) + (off))))
 121 #define HOPBACKc(pos, off) \
 122         ((char*)(PL_reg_match_utf8\
 123             ? reghopmaybe3((U8*)(pos), -(off), (U8*)PL_bostr) \
 124             : ((pos) - (off) >= PL_bostr)       \
 125                 ? (U8*)(pos) - (off)            \
 126                 : NULL))
 127
 128 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)((pos) + (off)))
 129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 130
 131
 132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
     /* Any negative nextchr counts as end-of-string, not just NEXTCHR_EOS. */
 133 #define NEXTCHR_IS_EOS (nextchr < 0)
 134
     /* Set nextchr to UCHARAT(locinput), or to NEXTCHR_EOS when locinput is not
      * before PL_regeol.  Uses the variables 'nextchr' and 'locinput' from the
      * scope in which the macro is expanded. */
 135 #define SET_nextchr \
 136     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 137
     /* Assign p to locinput and refresh nextchr to match.
      * NOTE(review): this expands to two separate statements, so it must not be
      * used as the unbraced body of an if/else/loop. */
 138 #define SET_locinput(p) \
 139     locinput = (p);  \
 140     SET_nextchr
 141
 142
 143 /* These are expanded by the CCC_TRY_XXX macros defined below */
     /* LOAD_UTF8_CHARCLASS(class,str): if PL_utf8_<class> is not yet set, call
      * is_utf8_<class>() on the sample string 'str' between ENTER and LEAVE, after
      * save_re_context().  It then asserts both that the call returned true (so
      * 'str' must be a member of the class) and that PL_utf8_<class> is now set. */
 144 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 145     if (!CAT2(PL_utf8_,class)) { \
 146         bool ok; \
 147         ENTER; save_re_context(); \
 148         ok=CAT2(is_utf8_,class)((const U8*)str); \
 149         PERL_UNUSED_VAR(ok); \
 150         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 151 /* Same as above but probes with " ", discards the result and asserts nothing */
 152 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 153     if (!CAT2(PL_utf8_,class)) { \
 154         bool throw_away PERL_UNUSED_DECL; \
 155         ENTER; save_re_context(); \
 156         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 157         LEAVE; } } STMT_END
 158
     /* Convenience wrappers supplying a sample character known to be in the class */
 159 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 160 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 161 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 162
     /* NOTE(review): the comment inside this macro says no asserts are done for
      * some of these, yet both lines use the asserting LOAD_UTF8_CHARCLASS rather
      * than LOAD_UTF8_CHARCLASS_NO_CHECK -- confirm which is intended.
      * The macro expands to two statements and its last line ends in a
      * backslash, so the line that follows it must stay blank. */
 163 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 164         /* No asserts are done for some of these, in case called on a   */  \
 165         /* Unicode version in which they map to nothing */                  \
 166         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 167         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 168
 169 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 170
 171 /* The actual code for CCC_TRY, which uses several variables from the routine
 172  * it's callable from.  It is designed to be the bulk of a case statement.
 173  * FUNC is the macro or function to call on non-utf8 targets that indicates if
 174  *      nextchr matches the class.
 175  * UTF8_TEST is the whole test string to use for utf8 targets
 176  * CLASS and STR are handed to LOAD_UTF8_CHARCLASS, which loads the swash for
 177  *      the class if it is not already present
 178  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 179  *      UTF8_TEST test.
 180  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 181  * utf8 and a variant, load the swash if necessary and test using the utf8
 182  * test.  Advance to the next character if test is ok, otherwise fail; If not
 183  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 184  * fails, or advance to the next character */
 185
 186 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 187     if (NEXTCHR_IS_EOS) {                                              \
 188         sayNO;                                                                \
 189     }                                                                         \
 190     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 191         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 192         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 193             sayNO;                                                            \
 194         }                                                                     \
 195     }                                                                         \
 196     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 197             sayNO;                                                            \
 198     }                                                                         \
 199     goto increment_locinput;
 200
 201 /* Handle the non-locale cases for a character class and its complement.  It
 202  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 203  * This is because that code fails when the test succeeds, so we want to have
 204  * the test fail so that the code succeeds.  The swash is stored in a
 205  * predictable PL_ place */
 206 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 207                            CLASS, STR)                                        \
 208     case NAME:                                                                \
 209         _CCC_TRY_CODE( !, FUNC,                                               \
 210                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 211                                             (U8*)locinput, TRUE)),            \
 212                           CLASS, STR)                                         \
 213     case NNAME:                                                               \
 214         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218
 219 /* Generate the case statements for both locale and non-locale character
 220  * classes in regmatch for classes that don't have special unicode semantics.
 221  * Locales don't use an immediate swash, but an intermediary special locale
 222  * function that is called on the pointer to the current place in the input
 223  * string.  That function will resolve to needing the same swash.  One might
 224  * think that because we don't know what the locale will match, we shouldn't
 225  * check with the swash loading function that it loaded properly; ie, that we
 226  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 227  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 228  * irrelevant here */
 229 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 230                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 231                 NAMEA, NNAMEA, FUNCA,                                         \
 232                 CLASS, STR)                                                   \
 233     case NAMEL:                                                               \
 234         PL_reg_flags |= RF_tainted;                                           \
 235         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 236     case NNAMEL:                                                              \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 239                        CLASS, STR)                                            \
 240     case NAMEA:                                                               \
 241         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 242             sayNO;                                                            \
 243         }                                                                     \
 244         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 245         locinput++;                                        \
 246         break;                                                                \
 247     case NNAMEA:                                                              \
 248         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 249             sayNO;                                                            \
 250         }                                                                     \
 251         goto increment_locinput;                                              \
 252     /* Generate the non-locale cases */                                       \
 253     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 254
 255 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 256  * statements to handle separate Unicode semantics nodes */
 257 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 258                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 259                   NAMEU, NNAMEU, FUNCU,                                        \
 260                   NAMEA, NNAMEA, FUNCA,                                        \
 261                   CLASS, STR)                                                  \
 262     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 263             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 264             NAMEA, NNAMEA, FUNCA,                                              \
 265             CLASS, STR)                                                        \
 266     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 267
 268 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 269
 270 /* for use after a quantifier and before an EXACT-like node -- japhy */
 271 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 272  *
 273  * NOTE that *nothing* that affects backtracking should be in here, specifically
 274  * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
 275  * node that is in between two EXACT like nodes when ascertaining what the required
 276  * "follow" character is. This should probably be moved to regex compile time
 277  * although it may be done at run time because of the REF possibility - more
 278  * investigation required. -- demerphq
 279 */
     /* Note: the CLOSE test reads 'cur_eval' from the scope where the macro is
      * expanded, and rn is evaluated many times, so pass a simple lvalue. */
 280 #define JUMPABLE(rn) (      \
 281     OP(rn) == OPEN ||       \
 282     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 283     OP(rn) == EVAL ||   \
 284     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 285     OP(rn) == PLUS || OP(rn) == MINMOD || \
 286     OP(rn) == KEEPS || \
 287     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 288 )
     /* True if the node's kind (per PL_regkind) is EXACT */
 289 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 290
     /* True if the node's kind is EXACT or REF */
 291 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 292
 293 #if 0
 294 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 295    we don't need this definition. */
     /* (This disabled variant also accepts the REF/NREF families, and folds the
        EXACTFU-type nodes into IS_TEXTF instead of defining IS_TEXTFU.) */
 296 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 297 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 299
 300 #else
 301 /* ... so we use this as it's faster. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 303 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 304 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 305 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 306
 307 #endif
 308
 309 /*
 310   Advance rn past JUMPABLE nodes to the mandatory following text node.  An IFMATCH
 311   with rn->flags == 0 (lookahead) is entered; a lookbehind (flags != 0) is skipped.
       SUSPEND and CURLY-kind nodes are entered via NEXTOPER(NEXTOPER(rn)), PLUS via
       NEXTOPER(rn); every other JUMPABLE node is stepped over using NEXT_OFF(rn).
       rn is modified in place, so it must be an assignable node pointer.
 312 */
 313 #define FIND_NEXT_IMPT(rn) STMT_START { \
 314     while (JUMPABLE(rn)) { \
 315         const OPCODE type = OP(rn); \
 316         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 317             rn = NEXTOPER(NEXTOPER(rn)); \
 318         else if (type == PLUS) \
 319             rn = NEXTOPER(rn); \
 320         else if (type == IFMATCH) \
 321             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 322         else rn += NEXT_OFF(rn); \
 323     } \
 324 } STMT_END
 325
 326
 327 static void restore_pos(pTHX_ void *arg);
 328
 329 #define REGCP_PAREN_ELEMS 3     /* per paren pair: end, start, start_tmp */
 330 #define REGCP_OTHER_ELEMS 3     /* PL_regsize, lastparen, lastcloseparen */
 331 #define REGCP_FRAME_ELEMS 1     /* the SAVEt_REGCONTEXT cookie word */
 332 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 333  * are needed for the regexp context stack bookkeeping. */
 334
     /* regcppush - save the capture state of rex on the savestack.
      *
      * For every paren p with parenfloor < p <= PL_regsize this pushes
      * rex->offs[p].end, .start and .start_tmp (in that order), followed by
      * PL_regsize, rex->lastparen and rex->lastcloseparen, and finally a cookie
      * word combining SAVEt_REGCONTEXT with the element count shifted left by
      * SAVE_TIGHT_SHIFT.  regcppop() reads all of this back in reverse order.
      *
      * Returns the value PL_savestack_ix had on entry, i.e. before any push.
      * Croaks if the element count is negative or does not survive the shift. */
 335 STATIC CHECKPOINT
 336 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 337 {
 338     dVAR;
 339     const int retval = PL_savestack_ix;
 340     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 341     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 342     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 343     I32 p;
 344     GET_RE_DEBUG_FLAGS_DECL;
 345
 346     PERL_ARGS_ASSERT_REGCPPUSH;
 347
 348     if (paren_elems_to_push < 0)
 349         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 350                    paren_elems_to_push);
 351
         /* Shifting back must give the same count, otherwise bits were lost and
          * the count cannot be stored in the cookie word. */
 352     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 354                    " out of range (%lu-%ld)",
 355                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 356
         /* One extra slot beyond total_elems for the cookie pushed last. */
 357     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 358
 359     DEBUG_BUFFERS_r(
 360         if ((int)PL_regsize > (int)parenfloor)
 361             PerlIO_printf(Perl_debug_log,
 362                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 363                 PTR2UV(rex),
 364                 PTR2UV(rex->offs)
 365             );
 366     );
 367     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 368 /* REGCP_PAREN_ELEMS are pushed per pair of parentheses. */
 369         SSPUSHINT(rex->offs[p].end);
 370         SSPUSHINT(rex->offs[p].start);
 371         SSPUSHINT(rex->offs[p].start_tmp);
 372         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 373             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 374             (UV)p,
 375             (IV)rex->offs[p].start,
 376             (IV)rex->offs[p].start_tmp,
 377             (IV)rex->offs[p].end
 378         ));
 379     }
 380 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 381     SSPUSHINT(PL_regsize);
 382     SSPUSHINT(rex->lastparen);
 383     SSPUSHINT(rex->lastcloseparen);
 384     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 385
 386     return retval;
 387 }
 388
 389 /* These are needed since we do not localize EVAL nodes: */
     /* REGCP_SET(cp): store the current PL_savestack_ix in cp, after an optional
      * debug trace.  Expands to two statements and has no trailing semicolon. */
 390 #define REGCP_SET(cp)                                           \
 391     DEBUG_STATE_r(                                              \
 392             PerlIO_printf(Perl_debug_log,                       \
 393                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 394                 (IV)PL_savestack_ix));                          \
 395     cp = PL_savestack_ix
 396
     /* REGCP_UNWIND(cp): call regcpblow(cp), tracing first (debug only) when cp
      * differs from the current PL_savestack_ix.  Also two statements. */
 397 #define REGCP_UNWIND(cp)                                        \
 398     DEBUG_STATE_r(                                              \
 399         if (cp != PL_savestack_ix)                              \
 400             PerlIO_printf(Perl_debug_log,                       \
 401                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 402                 (IV)(cp), (IV)PL_savestack_ix));                \
 403     regcpblow(cp)
 404
     /* UNWIND_PAREN(lp, lcp): set .end to -1 for every paren above lp (down from
      * rex->lastparen), then set rex->lastparen to the loop's final index and
      * rex->lastcloseparen to lcp.  Uses the variables 'n' and 'rex' from the
      * expanding scope and expands to three statements, so it must not be used
      * as an unbraced if/else/loop body. */
 405 #define UNWIND_PAREN(lp, lcp)               \
 406     for (n = rex->lastparen; n > lp; n--)   \
 407         rex->offs[n].end = -1;              \
 408     rex->lastparen = n;                     \
 409     rex->lastcloseparen = lcp;
 410
 411
     /* regcppop - restore the capture state that regcppush() saved.
      *
      * Pops the cookie word (asserting that it is a SAVEt_REGCONTEXT entry), then
      * rex->lastcloseparen, rex->lastparen and PL_regsize, then one
      * (start_tmp, start, end) triple per saved paren, walking paren numbers
      * downwards from the restored PL_regsize.  A saved 'end' is only written
      * back for parens <= the restored rex->lastparen.  Finally every paren above
      * rex->lastparen (up to rex->nparens) gets end = -1, and also start = -1 if
      * it lies above PL_regsize. */
 412 STATIC void
 413 S_regcppop(pTHX_ regexp *rex)
 414 {
 415     dVAR;
 416     UV i;
 417     U32 paren;
 418     GET_RE_DEBUG_FLAGS_DECL;
 419
 420     PERL_ARGS_ASSERT_REGCPPOP;
 421
 422     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 423     i = SSPOPUV;
 424     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 425     i >>= SAVE_TIGHT_SHIFT; /* Total elements pushed (paren + other). */
 426     rex->lastcloseparen = SSPOPINT;
 427     rex->lastparen = SSPOPINT;
 428     PL_regsize = SSPOPINT;
 429
         /* What remains in i is the number of paren elements still to pop. */
 430     i -= REGCP_OTHER_ELEMS;
 431     /* Now restore the parentheses context. */
 432     DEBUG_BUFFERS_r(
 433         if (i || rex->lastparen + 1 <= rex->nparens)
 434             PerlIO_printf(Perl_debug_log,
 435                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 436                 PTR2UV(rex),
 437                 PTR2UV(rex->offs)
 438             );
 439     );
 440     paren = PL_regsize;
 441     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 442         I32 tmps;
             /* Popped in the reverse of the order regcppush() pushed them. */
 443         rex->offs[paren].start_tmp = SSPOPINT;
 444         rex->offs[paren].start = SSPOPINT;
 445         tmps = SSPOPINT;
 446         if (paren <= rex->lastparen)
 447             rex->offs[paren].end = tmps;
 448         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 449             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 450             (UV)paren,
 451             (IV)rex->offs[paren].start,
 452             (IV)rex->offs[paren].start_tmp,
 453             (IV)rex->offs[paren].end,
 454             (paren > rex->lastparen ? "(skipped)" : ""));
 455         );
 456         paren--;
 457     }
 458 #if 1
 459     /* It would seem that the similar code in regtry()
 460      * already takes care of this, and in fact it is in
 461      * a better location too, since this code can be #if 0-ed out
 462      * but the code in regtry() is needed or otherwise tests
 463      * requiring null fields (pat.t#187 and split.t#{13,14}
 464      * (as of patchlevel 7877)  will fail.  Then again,
 465      * this code seems to be necessary or otherwise
 466      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 467      * --jhi updated by dapm */
         /* From here on i is reused as a paren number, not an element count. */
 468     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 469         if (i > PL_regsize)
 470             rex->offs[i].start = -1;
 471         rex->offs[i].end = -1;
 472         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 473             "    \\%"UVuf": %s   ..-1 undeffing\n",
 474             (UV)i,
 475             (i > PL_regsize) ? "-1" : "  "
 476         ));
 477     }
 478 #endif
 479 }
 480
 481 /* Re-apply the paren state and associated variables saved at savestack
 482  * position ix, while leaving the savestack itself exactly as it was. */
 483
 484 STATIC void
 485 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 486 {
 487     const I32 saved_top = PL_savestack_ix;  /* remember the real stack top */
 488     PL_savestack_ix = ix;                   /* make regcppop() read from ix */
 489     regcppop(rex);
 490     PL_savestack_ix = saved_top;            /* undo the pop: nothing is lost */
 491 }
 492
 493 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 494
 495 /*
 496  * pregexec and friends
 497  */
 498
 499 #ifndef PERL_IN_XSUB_RE
 500 /*
 501  - pregexec - match a regexp against a string
 502  */
 503 I32
 504 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 505          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 506 /* stringarg: where in the string matching is to start */
 507 /* strend:    points at the null that terminates the string */
 508 /* strbeg:    the true start of the string */
 509 /* minend:    the match must end at least minend bytes past stringarg */
 510 /* screamer:  the SV being matched; consulted only for the utf8 flag, pos()
 511  *            and the like -- the text itself is reached via the pointers */
 512 /* nosave:    if true, REXEC_COPY_STR is not passed to regexec_flags() */
 513 {
 514     const U32 exec_flags = nosave ? 0 : REXEC_COPY_STR;
 515
 516     PERL_ARGS_ASSERT_PREGEXEC;
 517     return regexec_flags(prog, stringarg, strend, strbeg, minend, screamer,
 518                          NULL, exec_flags);
 519 }
 520 #endif
 521
 522 /*
 523  * Need to implement the following flags for reg_anch:
 524  *
 525  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 526  * USE_INTUIT_ML
 527  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 528  * INTUIT_AUTORITATIVE_ML
 529  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 530  * INTUIT_ONCE_ML
 531  *
 532  * Another flag for this function: SECOND_TIME (so that float substrs
 533  * with giant delta may be not rechecked).
 534  */
 535
 536 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 537
 538 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 539    Otherwise, only SvCUR(sv) is used to get strbeg. */
 540
 541 /* XXXX We assume that strpos is strbeg unless sv. */
 542
 543 /* XXXX Some places assume that there is a fixed substring.
 544         An update may be needed if optimizer marks as "INTUITable"
 545         RExen without fixed substrings.  Similarly, it is assumed that
 546         lengths of all the strings are no more than minlen, thus they
 547         cannot come from lookahead.
 548         (Or minlen should take into account lookahead.)
 549   NOTE: Some of this comment is not correct. minlen does now take account
 550   of lookahead/behind. Further research is required. -- demerphq
 551
 552 */
 553
 554 /* A failure to find a constant substring means that there is no need to make
 555    an expensive call to the REx engine, thus we celebrate a failure.  Similarly,
 556    finding a substring too deep into the string means that fewer calls to
 557    regtry() should be needed.
 558
 559    REx compiler's optimizer found 4 possible hints:
 560         a) Anchored substring;
 561         b) Fixed substring;
 562         c) Whether we are anchored (beginning-of-line or \G);
 563         d) First node (of those at offset 0) which may distinguish positions;
 564    We use a)b)d) and multiline-part of c), and try to find a position in the
 565    string which does not contradict any of them.
 566  */
 567
 568 /* Most of the decisions we make here should have been made at compile time.
 569    The nodes of the REx which we used for the search should have been
 570    deleted from the finite automaton. */
 571
 572 char *
 573 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 574                      char *strend, const U32 flags, re_scream_pos_data *data)
 575 {
 576     dVAR;
 577     struct regexp *const prog = (struct regexp *)SvANY(rx);
 578     I32 start_shift = 0;
 579     /* Should be nonnegative! */
 580     I32 end_shift   = 0;
 581     char *s;
 582     SV *check;
 583     char *strbeg;
 584     char *t;
 585     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 586     I32 ml_anch;
 587     char *other_last = NULL;    /* other substr checked before this */
 588     char *check_at = NULL;              /* check substr found at this pos */
 589     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 590     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 591     RXi_GET_DECL(prog,progi);
 592 #ifdef DEBUGGING
 593     const char * const i_strpos = strpos;
 594 #endif
 595     GET_RE_DEBUG_FLAGS_DECL;
 596
 597     PERL_ARGS_ASSERT_RE_INTUIT_START;
 598     PERL_UNUSED_ARG(flags);
 599     PERL_UNUSED_ARG(data);
 600
 601     RX_MATCH_UTF8_set(rx,utf8_target);
 602
 603     if (RX_UTF8(rx)) {
 604         PL_reg_flags |= RF_utf8;
 605     }
 606     DEBUG_EXECUTE_r(
 607         debug_start_match(rx, utf8_target, strpos, strend,
 608             sv ? "Guessing start of match in sv for"
 609                : "Guessing start of match in string for");
 610               );
 611
 612     /* CHR_DIST() would be more correct here but it makes things slow. */
 613     if (prog->minlen > strend - strpos) {
 614         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 615                               "String too short... [re_intuit_start]\n"));
 616         goto fail;
 617     }
 618
 619     /* XXX we need to pass strbeg as a separate arg: the following is
 620      * guesswork and can be wrong... */
 621     if (sv && SvPOK(sv)) {
 622         char * p   = SvPVX(sv);
 623         STRLEN cur = SvCUR(sv);
 624         if (p <= strpos && strpos < p + cur) {
 625             strbeg = p;
 626             assert(p <= strend && strend <= p + cur);
 627         }
 628         else
 629             strbeg = strend - cur;
 630     }
 631     else
 632         strbeg = strpos;
 633
 634     PL_regeol = strend;
 635     if (utf8_target) {
 636         if (!prog->check_utf8 && prog->check_substr)
 637             to_utf8_substr(prog);
 638         check = prog->check_utf8;
 639     } else {
 640         if (!prog->check_substr && prog->check_utf8) {
 641             if (! to_byte_substr(prog)) {
 642                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 643                                         non_utf8_target_but_utf8_required));
 644                 goto fail;
 645             }
 646         }
 647         check = prog->check_substr;
 648     }
 649     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 650         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 651                      || ( (prog->extflags & RXf_ANCH_BOL)
 652                           && !multiline ) );    /* Check after \n? */
 653
 654         if (!ml_anch) {
 655           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 656                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 657                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 658                && sv && !SvROK(sv)
 659                && (strpos != strbeg)) {
 660               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 661               goto fail;
 662           }
 663           if (prog->check_offset_min == prog->check_offset_max &&
 664               !(prog->extflags & RXf_CANY_SEEN)) {
 665             /* Substring at constant offset from beg-of-str... */
 666             I32 slen;
 667
 668             s = HOP3c(strpos, prog->check_offset_min, strend);
 669
 670             if (SvTAIL(check)) {
 671                 slen = SvCUR(check);    /* >= 1 */
 672
 673                 if ( strend - s > slen || strend - s < slen - 1
 674                      || (strend - s == slen && strend[-1] != '\n')) {
 675                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 676                     goto fail_finish;
 677                 }
 678                 /* Now should match s[0..slen-2] */
 679                 slen--;
 680                 if (slen && (*SvPVX_const(check) != *s
 681                              || (slen > 1
 682                                  && memNE(SvPVX_const(check), s, slen)))) {
 683                   report_neq:
 684                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 685                     goto fail_finish;
 686                 }
 687             }
 688             else if (*SvPVX_const(check) != *s
 689                      || ((slen = SvCUR(check)) > 1
 690                          && memNE(SvPVX_const(check), s, slen)))
 691                 goto report_neq;
 692             check_at = s;
 693             goto success_at_start;
 694           }
 695         }
 696         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 697         s = strpos;
 698         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 699         end_shift = prog->check_end_shift;
 700
 701         if (!ml_anch) {
 702             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 703                                          - (SvTAIL(check) != 0);
 704             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 705
 706             if (end_shift < eshift)
 707                 end_shift = eshift;
 708         }
 709     }
 710     else {                              /* Can match at random position */
 711         ml_anch = 0;
 712         s = strpos;
 713         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 714         end_shift = prog->check_end_shift;
 715
 716         /* end shift should be non negative here */
 717     }
 718
 719 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 720     if (end_shift < 0)
 721         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 722                    (IV)end_shift, RX_PRECOMP(prog));
 723 #endif
 724
 725   restart:
 726     /* Find a possible match in the region s..strend by looking for
 727        the "check" substring in the region corrected by start/end_shift. */
 728
 729     {
 730         I32 srch_start_shift = start_shift;
 731         I32 srch_end_shift = end_shift;
 732         U8* start_point;
 733         U8* end_point;
 734         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 735             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 736             srch_start_shift = strbeg - s;
 737         }
 738     DEBUG_OPTIMISE_MORE_r({
 739         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 740             (IV)prog->check_offset_min,
 741             (IV)srch_start_shift,
 742             (IV)srch_end_shift,
 743             (IV)prog->check_end_shift);
 744     });
 745
 746         if (prog->extflags & RXf_CANY_SEEN) {
 747             start_point= (U8*)(s + srch_start_shift);
 748             end_point= (U8*)(strend - srch_end_shift);
 749         } else {
 750             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 751             end_point= HOP3(strend, -srch_end_shift, strbeg);
 752         }
 753         DEBUG_OPTIMISE_MORE_r({
 754             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 755                 (int)(end_point - start_point),
 756                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 757                 start_point);
 758         });
 759
 760         s = fbm_instr( start_point, end_point,
 761                       check, multiline ? FBMrf_MULTILINE : 0);
 762     }
 763     /* Update the count-of-usability, remove useless subpatterns,
 764         unshift s.  */
 765
 766     DEBUG_EXECUTE_r({
 767         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 768             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 769         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 770                           (s ? "Found" : "Did not find"),
 771             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 772                 ? "anchored" : "floating"),
 773             quoted,
 774             RE_SV_TAIL(check),
 775             (s ? " at offset " : "...\n") );
 776     });
 777
 778     if (!s)
 779         goto fail_finish;
 780     /* Finish the diagnostic message */
 781     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 782
 783     /* XXX dmq: first branch is for positive lookbehind...
 784        Our check string is offset from the beginning of the pattern.
 785        So we need to do any stclass tests offset forward from that
 786        point. I think. :-(
 787      */
 788
 789
 790
 791     check_at=s;
 792
 793
 794     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 795        Start with the other substr.
 796        XXXX no SCREAM optimization yet - and a very coarse implementation
 797        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 798                 *always* match.  Probably should be marked during compile...
 799        Probably it is right to do no SCREAM here...
 800      */
 801
 802     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 803                 : (prog->float_substr && prog->anchored_substr))
 804     {
 805         /* Take into account the "other" substring. */
 806         /* XXXX May be hopelessly wrong for UTF... */
 807         if (!other_last)
 808             other_last = strpos;
 809         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 810           do_other_anchored:
 811             {
 812                 char * const last = HOP3c(s, -start_shift, strbeg);
 813                 char *last1, *last2;
 814                 char * const saved_s = s;
 815                 SV* must;
 816
 817                 t = s - prog->check_offset_max;
 818                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 819                     && (!utf8_target
 820                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 821                             && t > strpos)))
 822                     NOOP;
 823                 else
 824                     t = strpos;
 825                 t = HOP3c(t, prog->anchored_offset, strend);
 826                 if (t < other_last)     /* These positions already checked */
 827                     t = other_last;
 828                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 829                 if (last < last1)
 830                     last1 = last;
 831                 /* XXXX It is not documented what units *_offsets are in.
 832                    We assume bytes, but this is clearly wrong.
 833                    Meaning this code needs to be carefully reviewed for errors.
 834                    dmq.
 835                   */
 836
 837                 /* On end-of-str: see comment below. */
 838                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 839                 if (must == &PL_sv_undef) {
 840                     s = (char*)NULL;
 841                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 842                 }
 843                 else
 844                     s = fbm_instr(
 845                         (unsigned char*)t,
 846                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 847                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 848                         must,
 849                         multiline ? FBMrf_MULTILINE : 0
 850                     );
 851                 DEBUG_EXECUTE_r({
 852                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 853                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 854                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 855                         (s ? "Found" : "Contradicts"),
 856                         quoted, RE_SV_TAIL(must));
 857                 });
 858
 859
 860                 if (!s) {
 861                     if (last1 >= last2) {
 862                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 863                                                 ", giving up...\n"));
 864                         goto fail_finish;
 865                     }
 866                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 867                         ", trying floating at offset %ld...\n",
 868                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 869                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 870                     s = HOP3c(last, 1, strend);
 871                     goto restart;
 872                 }
 873                 else {
 874                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 875                           (long)(s - i_strpos)));
 876                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 877                     other_last = HOP3c(s, 1, strend);
 878                     s = saved_s;
 879                     if (t == strpos)
 880                         goto try_at_start;
 881                     goto try_at_offset;
 882                 }
 883             }
 884         }
 885         else {          /* Take into account the floating substring. */
 886             char *last, *last1;
 887             char * const saved_s = s;
 888             SV* must;
 889
 890             t = HOP3c(s, -start_shift, strbeg);
 891             last1 = last =
 892                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 893             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 894                 last = HOP3c(t, prog->float_max_offset, strend);
 895             s = HOP3c(t, prog->float_min_offset, strend);
 896             if (s < other_last)
 897                 s = other_last;
 898  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 899             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 900             /* fbm_instr() takes into account exact value of end-of-str
 901                if the check is SvTAIL(ed).  Since false positives are OK,
 902                and end-of-str is not later than strend we are OK. */
 903             if (must == &PL_sv_undef) {
 904                 s = (char*)NULL;
 905                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 906             }
 907             else
 908                 s = fbm_instr((unsigned char*)s,
 909                               (unsigned char*)last + SvCUR(must)
 910                                   - (SvTAIL(must)!=0),
 911                               must, multiline ? FBMrf_MULTILINE : 0);
 912             DEBUG_EXECUTE_r({
 913                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 914                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 915                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 916                     (s ? "Found" : "Contradicts"),
 917                     quoted, RE_SV_TAIL(must));
 918             });
 919             if (!s) {
 920                 if (last1 == last) {
 921                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 922                                             ", giving up...\n"));
 923                     goto fail_finish;
 924                 }
 925                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 926                     ", trying anchored starting at offset %ld...\n",
 927                     (long)(saved_s + 1 - i_strpos)));
 928                 other_last = last;
 929                 s = HOP3c(t, 1, strend);
 930                 goto restart;
 931             }
 932             else {
 933                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 934                       (long)(s - i_strpos)));
 935                 other_last = s; /* Fix this later. --Hugo */
 936                 s = saved_s;
 937                 if (t == strpos)
 938                     goto try_at_start;
 939                 goto try_at_offset;
 940             }
 941         }
 942     }
 943
 944
 945     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 946
 947     DEBUG_OPTIMISE_MORE_r(
 948         PerlIO_printf(Perl_debug_log,
 949             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 950             (IV)prog->check_offset_min,
 951             (IV)prog->check_offset_max,
 952             (IV)(s-strpos),
 953             (IV)(t-strpos),
 954             (IV)(t-s),
 955             (IV)(strend-strpos)
 956         )
 957     );
 958
 959     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 960         && (!utf8_target
 961             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 962                  && t > strpos)))
 963     {
 964         /* Fixed substring is found far enough so that the match
 965            cannot start at strpos. */
 966       try_at_offset:
 967         if (ml_anch && t[-1] != '\n') {
 968             /* Eventually fbm_*() should handle this, but often
 969                anchored_offset is not 0, so this check will not be wasted. */
 970             /* XXXX In the code below we prefer to look for "^" even in
 971                presence of anchored substrings.  And we search even
 972                beyond the found float position.  These pessimizations
 973                are historical artefacts only.  */
 974           find_anchor:
 975             while (t < strend - prog->minlen) {
 976                 if (*t == '\n') {
 977                     if (t < check_at - prog->check_offset_min) {
 978                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 979                             /* Since we moved from the found position,
 980                                we definitely contradict the found anchored
 981                                substr.  Due to the above check we do not
 982                                contradict "check" substr.
 983                                Thus we can arrive here only if check substr
 984                                is float.  Redo checking for "other"=="fixed".
 985                              */
 986                             strpos = t + 1;
 987                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 988                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 989                             goto do_other_anchored;
 990                         }
 991                         /* We don't contradict the found floating substring. */
 992                         /* XXXX Why not check for STCLASS? */
 993                         s = t + 1;
 994                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 995                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 996                         goto set_useful;
 997                     }
 998                     /* Position contradicts check-string */
 999                     /* XXXX probably better to look for check-string
1000                        than for "\n", so one should lower the limit for t? */
1001                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1002                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1003                     other_last = strpos = s = t + 1;
1004                     goto restart;
1005                 }
1006                 t++;
1007             }
1008             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1009                         PL_colors[0], PL_colors[1]));
1010             goto fail_finish;
1011         }
1012         else {
1013             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1014                         PL_colors[0], PL_colors[1]));
1015         }
1016         s = t;
1017       set_useful:
1018         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1019     }
1020     else {
1021         /* The found string does not prohibit matching at strpos,
1022            - no optimization of calling REx engine can be performed,
1023            unless it was an MBOL and we are not after MBOL,
1024            or a future STCLASS check will fail this. */
1025       try_at_start:
1026         /* Even in this situation we may use MBOL flag if strpos is offset
1027            wrt the start of the string. */
1028         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1029             && (strpos != strbeg) && strpos[-1] != '\n'
1030             /* May be due to an implicit anchor of m{.*foo}  */
1031             && !(prog->intflags & PREGf_IMPLICIT))
1032         {
1033             t = strpos;
1034             goto find_anchor;
1035         }
1036         DEBUG_EXECUTE_r( if (ml_anch)
1037             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1038                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1039         );
1040       success_at_start:
1041         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1042             && (utf8_target ? (
1043                 prog->check_utf8                /* Could be deleted already */
1044                 && --BmUSEFUL(prog->check_utf8) < 0
1045                 && (prog->check_utf8 == prog->float_utf8)
1046             ) : (
1047                 prog->check_substr              /* Could be deleted already */
1048                 && --BmUSEFUL(prog->check_substr) < 0
1049                 && (prog->check_substr == prog->float_substr)
1050             )))
1051         {
1052             /* If flags & SOMETHING - do not do it many times on the same match */
1053             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 1054             /* XXX Does the destruction order have to change with utf8_target? */
1055             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1056             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1057             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1058             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1059             check = NULL;                       /* abort */
1060             s = strpos;
1061             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1062                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1063             if (prog->intflags & PREGf_IMPLICIT)
1064                 prog->extflags &= ~RXf_ANCH_MBOL;
1065             /* XXXX This is a remnant of the old implementation.  It
1066                     looks wasteful, since now INTUIT can use many
1067                     other heuristics. */
1068             prog->extflags &= ~RXf_USE_INTUIT;
1069             /* XXXX What other flags might need to be cleared in this branch? */
1070         }
1071         else
1072             s = strpos;
1073     }
1074
1075     /* Last resort... */
1076     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1077     /* trie stclasses are too expensive to use here, we are better off to
1078        leave it to regmatch itself */
1079     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1080         /* minlen == 0 is possible if regstclass is \b or \B,
1081            and the fixed substr is ''$.
1082            Since minlen is already taken into account, s+1 is before strend;
 1083            accidentally, minlen >= 1 guarantees no false positives at s + 1
1084            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1085            regstclass does not come from lookahead...  */
1086         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1087            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1088         const U8* const str = (U8*)STRING(progi->regstclass);
1089         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1090                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1091                     : 1);
1092         char * endpos;
1093         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1094             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1095         else if (prog->float_substr || prog->float_utf8)
1096             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1097         else
1098             endpos= strend;
1099
1100         if (checked_upto < s)
1101            checked_upto = s;
1102         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104
1105         t = s;
1106         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1107         if (s) {
1108             checked_upto = s;
1109         } else {
1110 #ifdef DEBUGGING
1111             const char *what = NULL;
1112 #endif
1113             if (endpos == strend) {
1114                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1115                                 "Could not match STCLASS...\n") );
1116                 goto fail;
1117             }
1118             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                                    "This position contradicts STCLASS...\n") );
1120             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1121                 goto fail;
1122             checked_upto = HOPBACKc(endpos, start_shift);
1123             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1124                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1125             /* Contradict one of substrings */
1126             if (prog->anchored_substr || prog->anchored_utf8) {
1127                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1128                     DEBUG_EXECUTE_r( what = "anchored" );
1129                   hop_and_restart:
1130                     s = HOP3c(t, 1, strend);
1131                     if (s + start_shift + end_shift > strend) {
1132                         /* XXXX Should be taken into account earlier? */
1133                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1134                                                "Could not match STCLASS...\n") );
1135                         goto fail;
1136                     }
1137                     if (!check)
1138                         goto giveup;
1139                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1140                                 "Looking for %s substr starting at offset %ld...\n",
1141                                  what, (long)(s + start_shift - i_strpos)) );
1142                     goto restart;
1143                 }
1144                 /* Have both, check_string is floating */
1145                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1146                     goto retry_floating_check;
1147                 /* Recheck anchored substring, but not floating... */
1148                 s = check_at;
1149                 if (!check)
1150                     goto giveup;
1151                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1152                           "Looking for anchored substr starting at offset %ld...\n",
1153                           (long)(other_last - i_strpos)) );
1154                 goto do_other_anchored;
1155             }
1156             /* Another way we could have checked stclass at the
1157                current position only: */
1158             if (ml_anch) {
1159                 s = t = t + 1;
1160                 if (!check)
1161                     goto giveup;
1162                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1163                           "Looking for /%s^%s/m starting at offset %ld...\n",
1164                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1165                 goto try_at_offset;
1166             }
1167             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1168                 goto fail;
1169             /* Check is floating substring. */
1170           retry_floating_check:
1171             t = check_at - start_shift;
1172             DEBUG_EXECUTE_r( what = "floating" );
1173             goto hop_and_restart;
1174         }
1175         if (t != s) {
1176             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1177                         "By STCLASS: moving %ld --> %ld\n",
1178                                   (long)(t - i_strpos), (long)(s - i_strpos))
1179                    );
1180         }
1181         else {
1182             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1183                                   "Does not contradict STCLASS...\n");
1184                    );
1185         }
1186     }
1187   giveup:
1188     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1189                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1190                           PL_colors[5], (long)(s - i_strpos)) );
1191     return s;
1192
1193   fail_finish:                          /* Substring not found */
1194     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1195         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1196   fail:
1197     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1198                           PL_colors[4], PL_colors[5]));
1199     return NULL;
1200 }
1201
 /* DECL_TRIE_TYPE(scan)
  *
  * Expands to the declaration of a const, anonymous-enum local named
  * `trie_type`, initialised from `scan->flags` and from a variable called
  * `utf8_target` that must be in scope where the macro is expanded:
  *
  *   scan->flags == EXACT,  utf8_target true   -> trie_utf8
  *   scan->flags == EXACT,  utf8_target false  -> trie_plain
  *   otherwise,             utf8_target true   -> trie_utf8_fold
  *   otherwise,             utf8_target false  -> trie_latin_utf8_fold
  *
  * The expansion carries no trailing semicolon; the caller supplies it.
  * `scan` is pasted unparenthesised in front of `->`, so pass a plain
  * pointer expression.
  */
 1202 #define DECL_TRIE_TYPE(scan) \
 1203     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
 1204                     trie_type = ((scan->flags == EXACT) \
 1205                               ? (utf8_target ? trie_utf8 : trie_plain) \
 1206                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1207
 /* REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,
  *                      uvc, charid, foldlen, foldbuf, uniflags)
  *
  * Fetches the next character value into `uvc`, sets `len`, and maps the
  * value to `charid`.  `uscan`, `len`, `uvc`, `charid` and `foldlen` are
  * assigned to, so they must be lvalues; several arguments are evaluated
  * more than once, so none should have side effects.
  *
  * Behaviour per `trie_type` (the switch has no default case):
  *
  *   trie_utf8_fold
  *       foldlen > 0:  decode the next value from `uscan` with
  *                     utf8n_to_uvuni(), subtract the decoded length from
  *                     `foldlen`, advance `uscan` by it, then set len = 0.
  *       otherwise:    call to_utf8_fold() on `uc` (output buffer
  *                     `foldbuf`, length stored in `foldlen`), set
  *                     len = UTF8SKIP(uc), then subtract UNISKIP(uvc) from
  *                     `foldlen` and point `uscan` that many bytes into
  *                     `foldbuf`.
  *                     NOTE(review): presumably the returned uvc is the
  *                     first code point of the fold and the remainder is
  *                     left for later calls -- confirm against
  *                     to_utf8_fold().
  *   trie_latin_utf8_fold
  *       Same two branches, except that the second one sets len = 1 and
  *       folds the single byte *uc with _to_fold_latin1().
  *   trie_utf8
  *       Decode from `uc` with utf8n_to_uvuni(), which also stores `len`.
  *   trie_plain
  *       uvc = *uc, len = 1.
  *
  * Mapping to charid:
  *   uvc < 256   -> trie->charmap[uvc]
  *   otherwise   -> 0, unless `widecharmap` is non-NULL and hv_fetch()
  *                  finds an entry keyed by the sizeof(UV) raw bytes of
  *                  `uvc`; then charid is (U16)SvIV() of that entry.
  *
  * `skiplen` is a local of the macro's own STMT_START/STMT_END block.
  */
 1208 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
 1209 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
 1210     STRLEN skiplen;                                                                 \
 1211     switch (trie_type) {                                                            \
 1212     case trie_utf8_fold:                                                            \
 1213         if ( foldlen>0 ) {                                                          \
 1214             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
 1215             foldlen -= len;                                                         \
 1216             uscan += len;                                                           \
 1217             len=0;                                                                  \
 1218         } else {                                                                    \
 1219             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
 1220             len = UTF8SKIP(uc);                                                     \
 1221             skiplen = UNISKIP( uvc );                                               \
 1222             foldlen -= skiplen;                                                     \
 1223             uscan = foldbuf + skiplen;                                              \
 1224         }                                                                           \
 1225         break;                                                                      \
 1226     case trie_latin_utf8_fold:                                                      \
 1227         if ( foldlen>0 ) {                                                          \
 1228             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
 1229             foldlen -= len;                                                         \
 1230             uscan += len;                                                           \
 1231             len=0;                                                                  \
 1232         } else {                                                                    \
 1233             len = 1;                                                                \
 1234             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
 1235             skiplen = UNISKIP( uvc );                                               \
 1236             foldlen -= skiplen;                                                     \
 1237             uscan = foldbuf + skiplen;                                              \
 1238         }                                                                           \
 1239         break;                                                                      \
 1240     case trie_utf8:                                                                 \
 1241         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
 1242         break;                                                                      \
 1243     case trie_plain:                                                                \
 1244         uvc = (UV)*uc;                                                              \
 1245         len = 1;                                                                    \
 1246     }                                                                               \
 1247     if (uvc < 256) {                                                                \
 1248         charid = trie->charmap[ uvc ];                                              \
 1249     }                                                                               \
 1250     else {                                                                          \
 1251         charid = 0;                                                                 \
 1252         if (widecharmap) {                                                          \
 1253             SV** const svpp = hv_fetch(widecharmap,                                 \
 1254                         (char*)&uvc, sizeof(UV), 0);                                \
 1255             if (svpp)                                                               \
 1256                 charid = (U16)SvIV(*svpp);                                          \
 1257         }                                                                           \
 1258     }                                                                               \
 1259 } STMT_END
1260
 /* REXEC_FBC_EXACTISH_SCAN(CoNd)
  *
  * Steps `s` forward one position at a time while s <= e.  At each
  * position three tests are made in this short-circuit order:
  *   1. the caller-supplied expression CoNd;
  *   2. ln == 1, or folder(s, pat_string, ln) returns true;
  *   3. reginfo is NULL, or regtry(reginfo, &s) returns true.
  * When all three hold, control jumps to the label `got_it`; otherwise `s`
  * is incremented.  If the loop runs out, `s` is left greater than `e`.
  *
  * Not self-contained: `s`, `e`, `ln`, `folder`, `pat_string`, `reginfo`
  * and the `got_it` label must all exist at the expansion site.
  */
 1261 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
 1262 STMT_START {                                              \
 1263     while (s <= e) {                                      \
 1264         if ( (CoNd)                                       \
 1265              && (ln == 1 || folder(s, pat_string, ln))    \
 1266              && (!reginfo || regtry(reginfo, &s)) )       \
 1267             goto got_it;                                  \
 1268         s++;                                              \
 1269     }                                                     \
 1270 } STMT_END
1271
 /* REXEC_FBC_UTF8_SCAN(CoDe)
  *
  * Runs the statement fragment CoDe at successive positions of `s`,
  * advancing by `uskip` bytes each time.  `uskip` is assigned
  * UTF8SKIP(s) inside the loop condition, so it is already set when CoDe
  * executes.  The `s < strend` test comes first, so UTF8SKIP(s) is never
  * evaluated with `s` at or beyond `strend`; the loop also stops as soon
  * as s + uskip would pass `strend`.
  *
  * CoDe is pasted verbatim into the loop body.  `s`, `strend` and `uskip`
  * come from the expansion site.
  */
 1272 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
 1273 STMT_START {                                          \
 1274     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
 1275         CoDe                                          \
 1276         s += uskip;                                   \
 1277     }                                                 \
 1278 } STMT_END
1279
/* Run CoDe at every byte position from s up to (but not including) strend.
 * The advance is done at the bottom of the body, after CoDe, which is pasted
 * in as statements and may leave the loop itself. */
#define REXEC_FBC_SCAN(CoDe)                              \
STMT_START {                                              \
    for ( ; s < strend; ) {                               \
        CoDe                                              \
        ++s;                                              \
    }                                                     \
} STMT_END
1287
/* Scan with REXEC_FBC_UTF8_SCAN for a position where CoNd holds.  tmp gates
 * the attempt: where CoNd fails it is reset to 1; where CoNd holds and tmp is
 * set, a match is tried (always "successful" when reginfo is NULL) and on
 * success control jumps to got_it.  After a position where CoNd held without
 * reaching got_it, tmp takes the value of doevery. */
#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)                       \
REXEC_FBC_UTF8_SCAN(                                          \
    if (!(CoNd)) {                                            \
        tmp = 1;                                              \
    }                                                         \
    else if (tmp && (!reginfo || regtry(reginfo, &s))) {      \
        goto got_it;                                          \
    }                                                         \
    else {                                                    \
        tmp = doevery;                                        \
    }                                                         \
)
1299
/* Byte-at-a-time counterpart of the UTF-8 class scan, built on
 * REXEC_FBC_SCAN.  Where CoNd fails, tmp is reset to 1.  Where CoNd holds and
 * tmp is set, a match is tried (always "successful" when reginfo is NULL) and
 * on success control jumps to got_it; otherwise tmp takes the value of
 * doevery. */
#define REXEC_FBC_CLASS_SCAN(CoNd)                            \
REXEC_FBC_SCAN(                                               \
    if (!(CoNd)) {                                            \
        tmp = 1;                                              \
    }                                                         \
    else if (tmp && (!reginfo || regtry(reginfo, &s))) {      \
        goto got_it;                                          \
    }                                                         \
    else {                                                    \
        tmp = doevery;                                        \
    }                                                         \
)
1311
/* Attempt a match at s: jump to got_it if reginfo is NULL or regtry()
 * succeeds.  Expands to a bare "if" with no trailing semicolon, so the user
 * supplies the ';'. */
#define REXEC_FBC_TRYIT                     \
if (!reginfo || regtry(reginfo, &s))        \
    goto got_it
1315
/* Class scan that picks its flavour from utf8_target: CoNd with the
 * byte-wise scanner when the target is not UTF-8, CoNdUtF8 with the UTF-8
 * scanner when it is. */
#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
    if (!utf8_target) {                                        \
        REXEC_FBC_CLASS_SCAN(CoNd);                            \
    }                                                          \
    else {                                                     \
        REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
    }
1323
/* Like REXEC_FBC_CSCAN, except that on the UTF-8 path the UtFpReLoAd
 * statement is executed once before the scan starts.  The non-UTF-8 path
 * never evaluates UtFpReLoAd. */
#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
    if (!utf8_target) {                                        \
        REXEC_FBC_CLASS_SCAN(CoNd);                            \
    }                                                          \
    else {                                                     \
        UtFpReLoAd;                                            \
        REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
    }
1332
/* Like REXEC_FBC_CSCAN, but first sets RF_tainted in PL_reg_flags,
 * regardless of which scan path is then taken.  Note that this expands to
 * two statements, so it must not be used as the unbraced body of an if or
 * loop. */
#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
    PL_reg_flags |= RF_tainted;                                \
    if (!utf8_target) {                                        \
        REXEC_FBC_CLASS_SCAN(CoNd);                            \
    }                                                          \
    else {                                                     \
        REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
    }
1341
/* Shorthand for dump_exec_pos() that fills in PL_regeol, PL_bostr and
 * PL_reg_starttry as the third to fifth arguments, so callers pass only li,
 * s and doutf8. */
#define DUMP_EXEC_POS(li,s,doutf8)                                         \
    dump_exec_pos((li), (s), PL_regeol, PL_bostr, PL_reg_starttry, (doutf8))
1344
1345
/* UTF-8 scanning body for the boundary checks whose test (TEST_NON_UTF8) can
 * be applied directly to a single byte.  tmp is first set to the "wordness"
 * of the byte before s ('\n' is used at PL_bostr); then, at each position
 * visited by REXEC_FBC_UTF8_SCAN, IF_SUCCESS runs when the wordness of *s
 * differs from tmp (and tmp is flipped), and IF_FAIL runs otherwise.
 *
 * tmp is normalized to 0/1 with cBOOL() because it is compared against the
 * result of '!', which is always 0 or 1; without that, a test returning a
 * true value other than 1 would cause the first boundary to be missed. */
#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                        \
        tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
        tmp = cBOOL(TEST_NON_UTF8(tmp));                                       \
        REXEC_FBC_UTF8_SCAN(                                                   \
            if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
                tmp = !tmp;                                                    \
                IF_SUCCESS;                                                    \
            }                                                                  \
            else {                                                             \
                IF_FAIL;                                                       \
            }                                                                  \
        );
1358
/* UTF-8 scanning body for the boundary checks whose tests need the full code
 * point.  tmp is first loaded with the character before s ('\n' at PL_bostr,
 * otherwise the character found by hopping back one from s), then replaced
 * by the result of TeSt1_UtF8, which is expected to test tmp.
 * LOAD_UTF8_CHARCLASS_ALNUM() is called before the scan.  At each position
 * visited by REXEC_FBC_UTF8_SCAN, IF_SUCCESS runs when TeSt2_UtF8 (expected
 * to test the character at s) disagrees with tmp, and tmp is flipped;
 * IF_FAIL runs otherwise.
 *
 * The TeSt1_UtF8 result is normalized to 0/1 with cBOOL() because tmp is
 * compared against the result of '!', which is always 0 or 1; a test that
 * returned a true value other than 1 would otherwise hide the first
 * boundary. */
#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL)                 \
        if (s == PL_bostr) {                                                   \
            tmp = '\n';                                                        \
        }                                                                      \
        else {                                                                 \
            U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
            tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
        }                                                                      \
        tmp = cBOOL(TeSt1_UtF8);                                               \
        LOAD_UTF8_CHARCLASS_ALNUM();                                           \
        REXEC_FBC_UTF8_SCAN(                                                   \
            if (tmp == ! (TeSt2_UtF8)) {                                       \
                tmp = !tmp;                                                    \
                IF_SUCCESS;                                                    \
            }                                                                  \
            else {                                                             \
                IF_FAIL;                                                       \
            }                                                                  \
        );
1378
/* BOUND and NBOUND share all their scanning logic and differ only in when
 * REXEC_FBC_TRYIT is invoked: on a boundary for BOUND, on a non-boundary for
 * NBOUND.  That is expressed by handing REXEC_FBC_TRYIT to either the
 * "success" or the "fail" slot and leaving the other slot as PLACEHOLDER.
 *
 * FBC_BOUND: try a match at each boundary; the UTF-8 path uses UTF8_LOAD
 * with the two UTF-8 tests. */
#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8)                       \
    FBC_BOUND_COMMON(                                                          \
        UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
        TEST_NON_UTF8,                                                         \
        REXEC_FBC_TRYIT,                                                       \
        PLACEHOLDER)
1385
/* Try a match at each boundary, using UTF8_NOLOAD on the UTF-8 path.  The
 * TEST1_UTF8 and TEST2_UTF8 arguments are accepted but not used. */
#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8)                \
    FBC_BOUND_COMMON(                                                          \
        UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),              \
        TEST_NON_UTF8,                                                         \
        REXEC_FBC_TRYIT,                                                       \
        PLACEHOLDER)
1388
/* Try a match at each non-boundary: REXEC_FBC_TRYIT goes in the "fail" slot
 * and PLACEHOLDER in the "success" slot.  The UTF-8 path uses UTF8_LOAD with
 * the two UTF-8 tests. */
#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8)                      \
    FBC_BOUND_COMMON(                                                          \
        UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),       \
        TEST_NON_UTF8,                                                         \
        PLACEHOLDER,                                                           \
        REXEC_FBC_TRYIT)
1391
/* Try a match at each non-boundary, using UTF8_NOLOAD on the UTF-8 path.
 * The TEST1_UTF8 and TEST2_UTF8 arguments are accepted but not used. */
#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8)               \
    FBC_BOUND_COMMON(                                                          \
        UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),              \
        TEST_NON_UTF8,                                                         \
        PLACEHOLDER,                                                           \
        REXEC_FBC_TRYIT)
1394
1395
/* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
 * be passed in completely with the variable name being tested, which isn't
 * such a clean interface, but this is easier to read than it was before.  We
 * are looking for the boundary (or non-boundary) between a word and non-word
 * character.  The utf8 and non-utf8 cases have the same logic, but the details
 * must be different.  Find the "wordness" of the character just prior to this
 * one, and compare it with the wordness of this one.  If they differ, we have
 * a boundary.  At the beginning of the string, pretend that the previous
 * character was a new-line.
 *
 * UTF8_CODE is the complete statement sequence to run when utf8_target is
 * set.  Otherwise the string is scanned a byte at a time with TEST_NON_UTF8,
 * running IF_SUCCESS at each boundary and IF_FAIL elsewhere.  After either
 * scan, if prog->minlen is 0 and tmp is set, one more match is attempted at
 * the final value of s.
 *
 * tmp is normalized to 0/1 with cBOOL() because it is compared against the
 * result of '!', which is always 0 or 1; without that, a test returning a
 * true value other than 1 would cause the first boundary to be missed. */
#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
    if (utf8_target) {                                                         \
                UTF8_CODE                                                      \
    }                                                                          \
    else {  /* Not utf8 */                                                     \
        tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
        tmp = cBOOL(TEST_NON_UTF8(tmp));                                       \
        REXEC_FBC_SCAN(                                                        \
            if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
                tmp = !tmp;                                                    \
                IF_SUCCESS;                                                    \
            }                                                                  \
            else {                                                             \
                IF_FAIL;                                                       \
            }                                                                  \
        );                                                                     \
    }                                                                          \
    if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
        goto got_it;
1424
1425 /* We know what class REx starts with.  Try to find this position... */
1426 /* if reginfo is NULL, it's a dry run */
1427 /* annoyingly all the vars in this routine have different names from their counterparts
1428    in regmatch. /grrr */
1429
1430 STATIC char *
1431 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1432     const char *strend, regmatch_info *reginfo)
1433 {
1434         dVAR;
1435         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1436         char *pat_string;   /* The pattern's exactish string */
1437         char *pat_end;      /* ptr to end char of pat_string */
1438         re_fold_t folder;       /* Function for computing non-utf8 folds */
1439         const U8 *fold_array;   /* array for folding ords < 256 */
1440         STRLEN ln;
1441         STRLEN lnc;
1442         STRLEN uskip;
1443         U8 c1;
1444         U8 c2;
1445         char *e;
1446         I32 tmp = 1;    /* Scratch variable? */
1447         const bool utf8_target = PL_reg_match_utf8;
1448         UV utf8_fold_flags = 0;
1449         RXi_GET_DECL(prog,progi);
1450
1451         PERL_ARGS_ASSERT_FIND_BYCLASS;
1452
1453         /* We know what class it must start with. */
1454         switch (OP(c)) {
1455         case ANYOFV:
1456         case ANYOF:
1457             if (utf8_target || OP(c) == ANYOFV) {
1458                 STRLEN inclasslen = strend - s;
1459                 REXEC_FBC_UTF8_CLASS_SCAN(
1460                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1461             }
1462             else {
1463                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1464             }
1465             break;
1466         case CANY:
1467             REXEC_FBC_SCAN(
1468                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1469                     goto got_it;
1470                 else
1471                     tmp = doevery;
1472             );
1473             break;
1474
1475         case EXACTFA:
1476             if (UTF_PATTERN || utf8_target) {
1477                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1478                 goto do_exactf_utf8;
1479             }
1480             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1481             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1482             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1483
1484         case EXACTF:
1485             if (utf8_target) {
1486
1487                 /* regcomp.c already folded this if pattern is in UTF-8 */
1488                 utf8_fold_flags = 0;
1489                 goto do_exactf_utf8;
1490             }
1491             fold_array = PL_fold;
1492             folder = foldEQ;
1493             goto do_exactf_non_utf8;
1494
1495         case EXACTFL:
1496             if (UTF_PATTERN || utf8_target) {
1497                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1498                 goto do_exactf_utf8;
1499             }
1500             fold_array = PL_fold_locale;
1501             folder = foldEQ_locale;
1502             goto do_exactf_non_utf8;
1503
1504         case EXACTFU_SS:
1505             if (UTF_PATTERN) {
1506                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1507             }
1508             goto do_exactf_utf8;
1509
1510         case EXACTFU_TRICKYFOLD:
1511         case EXACTFU:
1512             if (UTF_PATTERN || utf8_target) {
1513                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1514                 goto do_exactf_utf8;
1515             }
1516
1517             /* Any 'ss' in the pattern should have been replaced by regcomp,
1518              * so we don't have to worry here about this single special case
1519              * in the Latin1 range */
1520             fold_array = PL_fold_latin1;
1521             folder = foldEQ_latin1;
1522
1523             /* FALL THROUGH */
1524
1525         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1526                                are no glitches with fold-length differences
1527                                between the target string and pattern */
1528
1529             /* The idea in the non-utf8 EXACTF* cases is to first find the
1530              * first character of the EXACTF* node and then, if necessary,
1531              * case-insensitively compare the full text of the node.  c1 is the
1532              * first character.  c2 is its fold.  This logic will not work for
1533              * Unicode semantics and the german sharp ss, which hence should
1534              * not be compiled into a node that gets here. */
1535             pat_string = STRING(c);
1536             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1537
1538             /* We know that we have to match at least 'ln' bytes (which is the
1539              * same as characters, since not utf8).  If we have to match 3
1540              * characters, and there are only 2 available, we know without
1541              * trying that it will fail; so don't start a match past the
1542              * required minimum number from the far end */
1543             e = HOP3c(strend, -((I32)ln), s);
1544
1545             if (!reginfo && e < s) {
1546                 e = s;                  /* Due to minlen logic of intuit() */
1547             }
1548
1549             c1 = *pat_string;
1550             c2 = fold_array[c1];
1551             if (c1 == c2) { /* If char and fold are the same */
1552                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1553             }
1554             else {
1555                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1556             }
1557             break;
1558
1559         do_exactf_utf8:
1560         {
1561             unsigned expansion;
1562
1563
1564             /* If one of the operands is in utf8, we can't use the simpler
1565              * folding above, due to the fact that many different characters
1566              * can have the same fold, or portion of a fold, or different-
1567              * length fold */
1568             pat_string = STRING(c);
1569             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1570             pat_end = pat_string + ln;
1571             lnc = (UTF_PATTERN) /* length to match in characters */
1572                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1573                     : ln;
1574
1575             /* We have 'lnc' characters to match in the pattern, but because of
1576              * multi-character folding, each character in the target can match
1577              * up to 3 characters (Unicode guarantees it will never exceed
1578              * this) if it is utf8-encoded; and up to 2 if not (based on the
1579              * fact that the Latin 1 folds are already determined, and the
1580              * only multi-char fold in that range is the sharp-s folding to
1581              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1582              * string character.  Adjust lnc accordingly, rounding up, so that
1583              * if we need to match at least 4+1/3 chars, that really is 5. */
1584             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1585             lnc = (lnc + expansion - 1) / expansion;
1586
1587             /* As in the non-UTF8 case, if we have to match 3 characters, and
1588              * only 2 are left, it's guaranteed to fail, so don't start a
1589              * match that would require us to go beyond the end of the string
1590              */
1591             e = HOP3c(strend, -((I32)lnc), s);
1592
1593             if (!reginfo && e < s) {
1594                 e = s;                  /* Due to minlen logic of intuit() */
1595             }
1596
1597             /* XXX Note that we could recalculate e to stop the loop earlier,
1598              * as the worst case expansion above will rarely be met, and as we
1599              * go along we would usually find that e moves further to the left.
1600              * This would happen only after we reached the point in the loop
1601              * where if there were no expansion we should fail.  Unclear if
1602              * worth the expense */
1603
1604             while (s <= e) {
1605                 char *my_strend= (char *)strend;
1606                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1607                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1608                     && (!reginfo || regtry(reginfo, &s)) )
1609                 {
1610                     goto got_it;
1611                 }
1612                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1613             }
1614             break;
1615         }
1616         case BOUNDL:
1617             PL_reg_flags |= RF_tainted;
1618             FBC_BOUND(isALNUM_LC,
1619                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1620                       isALNUM_LC_utf8((U8*)s));
1621             break;
1622         case NBOUNDL:
1623             PL_reg_flags |= RF_tainted;
1624             FBC_NBOUND(isALNUM_LC,
1625                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1626                        isALNUM_LC_utf8((U8*)s));
1627             break;
1628         case BOUND:
1629             FBC_BOUND(isWORDCHAR,
1630                       isALNUM_uni(tmp),
1631                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1632             break;
1633         case BOUNDA:
1634             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1635                              isWORDCHAR_A(tmp),
1636                              isWORDCHAR_A((U8*)s));
1637             break;
1638         case NBOUND:
1639             FBC_NBOUND(isWORDCHAR,
1640                        isALNUM_uni(tmp),
1641                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1642             break;
1643         case NBOUNDA:
1644             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1645                               isWORDCHAR_A(tmp),
1646                               isWORDCHAR_A((U8*)s));
1647             break;
1648         case BOUNDU:
1649             FBC_BOUND(isWORDCHAR_L1,
1650                       isALNUM_uni(tmp),
1651                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1652             break;
1653         case NBOUNDU:
1654             FBC_NBOUND(isWORDCHAR_L1,
1655                        isALNUM_uni(tmp),
1656                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657             break;
1658         case ALNUML:
1659             REXEC_FBC_CSCAN_TAINT(
1660                 isALNUM_LC_utf8((U8*)s),
1661                 isALNUM_LC(*s)
1662             );
1663             break;
1664         case ALNUMU:
1665             REXEC_FBC_CSCAN_PRELOAD(
1666                 LOAD_UTF8_CHARCLASS_ALNUM(),
1667                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1668                 isWORDCHAR_L1((U8) *s)
1669             );
1670             break;
1671         case ALNUM:
1672             REXEC_FBC_CSCAN_PRELOAD(
1673                 LOAD_UTF8_CHARCLASS_ALNUM(),
1674                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1675                 isWORDCHAR((U8) *s)
1676             );
1677             break;
1678         case ALNUMA:
1679             /* Don't need to worry about utf8, as it can match only a single
1680              * byte invariant character */
1681             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1682             break;
1683         case NALNUMU:
1684             REXEC_FBC_CSCAN_PRELOAD(
1685                 LOAD_UTF8_CHARCLASS_ALNUM(),
1686                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1687                 ! isWORDCHAR_L1((U8) *s)
1688             );
1689             break;
1690         case NALNUM:
1691             REXEC_FBC_CSCAN_PRELOAD(
1692                 LOAD_UTF8_CHARCLASS_ALNUM(),
1693                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1694                 ! isALNUM(*s)
1695             );
1696             break;
1697         case NALNUMA:
1698             REXEC_FBC_CSCAN(
1699                 !isWORDCHAR_A(*s),
1700                 !isWORDCHAR_A(*s)
1701             );
1702             break;
1703         case NALNUML:
1704             REXEC_FBC_CSCAN_TAINT(
1705                 !isALNUM_LC_utf8((U8*)s),
1706                 !isALNUM_LC(*s)
1707             );
1708             break;
1709         case SPACEU:
1710             REXEC_FBC_CSCAN_PRELOAD(
1711                 LOAD_UTF8_CHARCLASS_SPACE(),
1712                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1713                 isSPACE_L1((U8) *s)
1714             );
1715             break;
1716         case SPACE:
1717             REXEC_FBC_CSCAN_PRELOAD(
1718                 LOAD_UTF8_CHARCLASS_SPACE(),
1719                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1720                 isSPACE((U8) *s)
1721             );
1722             break;
1723         case SPACEA:
1724             /* Don't need to worry about utf8, as it can match only a single
1725              * byte invariant character */
1726             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1727             break;
1728         case SPACEL:
1729             REXEC_FBC_CSCAN_TAINT(
1730                 isSPACE_LC_utf8((U8*)s),
1731                 isSPACE_LC(*s)
1732             );
1733             break;
1734         case NSPACEU:
1735             REXEC_FBC_CSCAN_PRELOAD(
1736                 LOAD_UTF8_CHARCLASS_SPACE(),
1737                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1738                 ! isSPACE_L1((U8) *s)
1739             );
1740             break;
1741         case NSPACE:
1742             REXEC_FBC_CSCAN_PRELOAD(
1743                 LOAD_UTF8_CHARCLASS_SPACE(),
1744                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1745                 ! isSPACE((U8) *s)
1746             );
1747             break;
1748         case NSPACEA:
1749             REXEC_FBC_CSCAN(
1750                 !isSPACE_A(*s),
1751                 !isSPACE_A(*s)
1752             );
1753             break;
1754         case NSPACEL:
1755             REXEC_FBC_CSCAN_TAINT(
1756                 !isSPACE_LC_utf8((U8*)s),
1757                 !isSPACE_LC(*s)
1758             );
1759             break;
1760         case DIGIT:
1761             REXEC_FBC_CSCAN_PRELOAD(
1762                 LOAD_UTF8_CHARCLASS_DIGIT(),
1763                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1764                 isDIGIT(*s)
1765             );
1766             break;
1767         case DIGITA:
1768             /* Don't need to worry about utf8, as it can match only a single
1769              * byte invariant character */
1770             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1771             break;
1772         case DIGITL:
1773             REXEC_FBC_CSCAN_TAINT(
1774                 isDIGIT_LC_utf8((U8*)s),
1775                 isDIGIT_LC(*s)
1776             );
1777             break;
1778         case NDIGIT:
1779             REXEC_FBC_CSCAN_PRELOAD(
1780                 LOAD_UTF8_CHARCLASS_DIGIT(),
1781                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1782                 !isDIGIT(*s)
1783             );
1784             break;
1785         case NDIGITA:
1786             REXEC_FBC_CSCAN(
1787                 !isDIGIT_A(*s),
1788                 !isDIGIT_A(*s)
1789             );
1790             break;
1791         case NDIGITL:
1792             REXEC_FBC_CSCAN_TAINT(
1793                 !isDIGIT_LC_utf8((U8*)s),
1794                 !isDIGIT_LC(*s)
1795             );
1796             break;
1797         case LNBREAK:
1798             REXEC_FBC_CSCAN(
1799                 is_LNBREAK_utf8_safe(s, strend),
1800                 is_LNBREAK_latin1_safe(s, strend)
1801             );
1802             break;
1803         case VERTWS:
1804             REXEC_FBC_CSCAN(
1805                 is_VERTWS_utf8_safe(s, strend),
1806                 is_VERTWS_latin1_safe(s, strend)
1807             );
1808             break;
1809         case NVERTWS:
1810             REXEC_FBC_CSCAN(
1811                 !is_VERTWS_utf8_safe(s, strend),
1812                 !is_VERTWS_latin1_safe(s, strend)
1813             );
1814             break;
1815         case HORIZWS:
1816             REXEC_FBC_CSCAN(
1817                 is_HORIZWS_utf8_safe(s, strend),
1818                 is_HORIZWS_latin1_safe(s, strend)
1819             );
1820             break;
1821         case NHORIZWS:
1822             REXEC_FBC_CSCAN(
1823                 !is_HORIZWS_utf8_safe(s, strend),
1824                 !is_HORIZWS_latin1_safe(s, strend)
1825             );
1826             break;
1827         case POSIXA:
1828             /* Don't need to worry about utf8, as it can match only a single
1829             * byte invariant character.  The flag in this node type is the
1830             * class number to pass to _generic_isCC() to build a mask for
1831             * searching in PL_charclass[] */
1832             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1833             break;
1834         case NPOSIXA:
1835             REXEC_FBC_CSCAN(
1836                 !_generic_isCC_A(*s, FLAGS(c)),
1837                 !_generic_isCC_A(*s, FLAGS(c))
1838             );
1839             break;
1840
1841         case AHOCORASICKC:
1842         case AHOCORASICK:
1843             {
1844                 DECL_TRIE_TYPE(c);
1845                 /* what trie are we using right now */
1846                 reg_ac_data *aho
1847                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1848                 reg_trie_data *trie
1849                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1850                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1851
1852                 const char *last_start = strend - trie->minlen;
1853 #ifdef DEBUGGING
1854                 const char *real_start = s;
1855 #endif
1856                 STRLEN maxlen = trie->maxlen;
1857                 SV *sv_points;
1858                 U8 **points; /* map of where we were in the input string
1859                                 when reading a given char. For ASCII this
1860                                 is unnecessary overhead as the relationship
1861                                 is always 1:1, but for Unicode, especially
1862                                 case folded Unicode this is not true. */
1863                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1864                 U8 *bitmap=NULL;
1865
1866
1867                 GET_RE_DEBUG_FLAGS_DECL;
1868
1869                 /* We can't just allocate points here. We need to wrap it in
1870                  * an SV so it gets freed properly if there is a croak while
1871                  * running the match */
1872                 ENTER;
1873                 SAVETMPS;
1874                 sv_points=newSV(maxlen * sizeof(U8 *));
1875                 SvCUR_set(sv_points,
1876                     maxlen * sizeof(U8 *));
1877                 SvPOK_on(sv_points);
1878                 sv_2mortal(sv_points);
1879                 points=(U8**)SvPV_nolen(sv_points );
1880                 if ( trie_type != trie_utf8_fold
1881                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882                 {
1883                     if (trie->bitmap)
1884                         bitmap=(U8*)trie->bitmap;
1885                     else
1886                         bitmap=(U8*)ANYOF_BITMAP(c);
1887                 }
1888                 /* this is the Aho-Corasick algorithm modified a touch
1889                    to include special handling for long "unknown char"
1890                    sequences. The basic idea being that we use AC as long
1891                    as we are dealing with a possible matching char, when
1892                    we encounter an unknown char (and we have not encountered
1893                    an accepting state) we scan forward until we find a legal
1894                    starting char.
1895                    AC matching is basically that of trie matching, except
1896                    that when we encounter a failing transition, we fall back
1897                    to the current states "fail state", and try the current char
1898                    again, a process we repeat until we reach the root state,
1899                    state 1, or a legal transition. If we fail on the root state
1900                    then we can either terminate if we have reached an accepting
1901                    state previously, or restart the entire process from the beginning
1902                    if we have not.
1903
1904                  */
1905                 while (s <= last_start) {
1906                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1907                     U8 *uc = (U8*)s;
1908                     U16 charid = 0;
1909                     U32 base = 1;
1910                     U32 state = 1;
1911                     UV uvc = 0;
1912                     STRLEN len = 0;
1913                     STRLEN foldlen = 0;
1914                     U8 *uscan = (U8*)NULL;
1915                     U8 *leftmost = NULL;
1916 #ifdef DEBUGGING
1917                     U32 accepted_word= 0;
1918 #endif
1919                     U32 pointpos = 0;
1920
1921                     while ( state && uc <= (U8*)strend ) {
1922                         int failed=0;
1923                         U32 word = aho->states[ state ].wordnum;
1924
1925                         if( state==1 ) {
1926                             if ( bitmap ) {
1927                                 DEBUG_TRIE_EXECUTE_r(
1928                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1930                                             (char *)uc, utf8_target );
1931                                         PerlIO_printf( Perl_debug_log,
1932                                             " Scanning for legal start char...\n");
1933                                     }
1934                                 );
1935                                 if (utf8_target) {
1936                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1937                                         uc += UTF8SKIP(uc);
1938                                     }
1939                                 } else {
1940                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1941                                         uc++;
1942                                     }
1943                                 }
1944                                 s= (char *)uc;
1945                             }
1946                             if (uc >(U8*)last_start) break;
1947                         }
1948
1949                         if ( word ) {
1950                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1951                             if (!leftmost || lpos < leftmost) {
1952                                 DEBUG_r(accepted_word=word);
1953                                 leftmost= lpos;
1954                             }
1955                             if (base==0) break;
1956
1957                         }
1958                         points[pointpos++ % maxlen]= uc;
1959                         if (foldlen || uc < (U8*)strend) {
1960                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1961                                              widecharmap, uc,
1962                                              uscan, len, uvc, charid, foldlen,
1963                                              foldbuf, uniflags);
1964                             DEBUG_TRIE_EXECUTE_r({
1965                                 dump_exec_pos( (char *)uc, c, strend,
1966                                             real_start, s, utf8_target);
1967                                 PerlIO_printf(Perl_debug_log,
1968                                     " Charid:%3u CP:%4"UVxf" ",
1969                                      charid, uvc);
1970                             });
1971                         }
1972                         else {
1973                             len = 0;
1974                             charid = 0;
1975                         }
1976
1977
1978                         do {
1979 #ifdef DEBUGGING
1980                             word = aho->states[ state ].wordnum;
1981 #endif
1982                             base = aho->states[ state ].trans.base;
1983
1984                             DEBUG_TRIE_EXECUTE_r({
1985                                 if (failed)
1986                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1987                                         s,   utf8_target );
1988                                 PerlIO_printf( Perl_debug_log,
1989                                     "%sState: %4"UVxf", word=%"UVxf,
1990                                     failed ? " Fail transition to " : "",
1991                                     (UV)state, (UV)word);
1992                             });
1993                             if ( base ) {
1994                                 U32 tmp;
1995                                 I32 offset;
1996                                 if (charid &&
1997                                      ( ((offset = base + charid
1998                                         - 1 - trie->uniquecharcount)) >= 0)
1999                                      && ((U32)offset < trie->lasttrans)
2000                                      && trie->trans[offset].check == state
2001                                      && (tmp=trie->trans[offset].next))
2002                                 {
2003                                     DEBUG_TRIE_EXECUTE_r(
2004                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2005                                     state = tmp;
2006                                     break;
2007                                 }
2008                                 else {
2009                                     DEBUG_TRIE_EXECUTE_r(
2010                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2011                                     failed = 1;
2012                                     state = aho->fail[state];
2013                                 }
2014                             }
2015                             else {
2016                                 /* we must be accepting here */
2017                                 DEBUG_TRIE_EXECUTE_r(
2018                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2019                                 failed = 1;
2020                                 break;
2021                             }
2022                         } while(state);
2023                         uc += len;
2024                         if (failed) {
2025                             if (leftmost)
2026                                 break;
2027                             if (!state) state = 1;
2028                         }
2029                     }
2030                     if ( aho->states[ state ].wordnum ) {
2031                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2032                         if (!leftmost || lpos < leftmost) {
2033                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2034                             leftmost = lpos;
2035                         }
2036                     }
2037                     if (leftmost) {
2038                         s = (char*)leftmost;
2039                         DEBUG_TRIE_EXECUTE_r({
2040                             PerlIO_printf(
2041                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2042                                 (UV)accepted_word, (IV)(s - real_start)
2043                             );
2044                         });
2045                         if (!reginfo || regtry(reginfo, &s)) {
2046                             FREETMPS;
2047                             LEAVE;
2048                             goto got_it;
2049                         }
2050                         s = HOPc(s,1);
2051                         DEBUG_TRIE_EXECUTE_r({
2052                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2053                         });
2054                     } else {
2055                         DEBUG_TRIE_EXECUTE_r(
2056                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2057                         break;
2058                     }
2059                 }
2060                 FREETMPS;
2061                 LEAVE;
2062             }
2063             break;
2064         default:
2065             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2066             break;
2067         }
2068         return 0;
2069       got_it:
2070         return s;
2071 }
2072
2073
2074 /*
2075  - regexec_flags - match a regexp against a string
2076  */
2077 I32
2078 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2079               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2080 /* stringarg: the point in the string at which to begin matching */
2081 /* strend:    pointer to null at end of string */
2082 /* strbeg:    real beginning of string */
2083 /* minend:    end of match must be >= minend bytes after stringarg. */
2084 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2085  *            itself is accessed via the pointers above */
2086 /* data:      May be used for some additional optimizations.
2087               Currently it's only used, with a U32 cast, for transmitting
2088               the ganch offset when doing a /g match. This will change */
2089 /* flags:     For optimizations. */
2090
2091 {
2092     dVAR;
2093     struct regexp *const prog = (struct regexp *)SvANY(rx);
2094     /*register*/ char *s;
2095     regnode *c;
2096     /*register*/ char *startpos = stringarg;
2097     I32 minlen;         /* must match at least this many chars */
2098     I32 dontbother = 0; /* how many characters not to try at end */
2099     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2100     I32 scream_pos = -1;                /* Internal iterator of scream. */
2101     char *scream_olds = NULL;
2102     const bool utf8_target = cBOOL(DO_UTF8(sv));
2103     I32 multiline;
2104     RXi_GET_DECL(prog,progi);
2105     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2106     regexp_paren_pair *swap = NULL;
2107     GET_RE_DEBUG_FLAGS_DECL;
2108
2109     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2110     PERL_UNUSED_ARG(data);
2111
2112     /* Be paranoid... */
2113     if (prog == NULL || startpos == NULL) {
2114         Perl_croak(aTHX_ "NULL regexp parameter");
2115         return 0;
2116     }
2117
2118     multiline = prog->extflags & RXf_PMf_MULTILINE;
2119     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2120
2121     RX_MATCH_UTF8_set(rx, utf8_target);
2122     DEBUG_EXECUTE_r(
2123         debug_start_match(rx, utf8_target, startpos, strend,
2124         "Matching");
2125     );
2126
2127     minlen = prog->minlen;
2128
2129     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2130         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2131                               "String too short [regexec_flags]...\n"));
2132         goto phooey;
2133     }
2134
2135
2136     /* Check validity of program. */
2137     if (UCHARAT(progi->program) != REG_MAGIC) {
2138         Perl_croak(aTHX_ "corrupted regexp program");
2139     }
2140
2141     PL_reg_flags = 0;
2142     PL_reg_state.re_state_eval_setup_done = FALSE;
2143     PL_reg_maxiter = 0;
2144
2145     if (RX_UTF8(rx))
2146         PL_reg_flags |= RF_utf8;
2147
2148     /* Mark beginning of line for ^ and lookbehind. */
2149     reginfo.bol = startpos; /* XXX not used ??? */
2150     PL_bostr  = strbeg;
2151     reginfo.sv = sv;
2152
2153     /* Mark end of line for $ (and such) */
2154     PL_regeol = strend;
2155
2156     /* see how far we have to get to not match where we matched before */
2157     reginfo.till = startpos+minend;
2158
2159     /* If there is a "must appear" string, look for it. */
2160     s = startpos;
2161
2162     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2163         MAGIC *mg;
2164         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2165             reginfo.ganch = startpos + prog->gofs;
2166             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2168         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2169                   && SvMAGIC(sv)
2170                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2171                   && mg->mg_len >= 0) {
2172             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2173             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2174                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2175
2176             if (prog->extflags & RXf_ANCH_GPOS) {
2177                 if (s > reginfo.ganch)
2178                     goto phooey;
2179                 s = reginfo.ganch - prog->gofs;
2180                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2181                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2182                 if (s < strbeg)
2183                     goto phooey;
2184             }
2185         }
2186         else if (data) {
2187             reginfo.ganch = strbeg + PTR2UV(data);
2188             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2190
2191         } else {                                /* pos() not defined */
2192             reginfo.ganch = strbeg;
2193             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194                  "GPOS: reginfo.ganch = strbeg\n"));
2195         }
2196     }
2197     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2198         /* We have to be careful. If the previous successful match
2199            was from this regex we don't want a subsequent partially
2200            successful match to clobber the old results.
2201            So when we detect this possibility we add a swap buffer
2202            to the re, and switch the buffer each match. If we fail
2203            we switch it back, otherwise we leave it swapped.
2204         */
2205         swap = prog->offs;
2206         /* do we need a save destructor here for eval dies? */
2207         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2208         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2209             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2210             PTR2UV(prog),
2211             PTR2UV(swap),
2212             PTR2UV(prog->offs)
2213         ));
2214     }
2215     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2216         re_scream_pos_data d;
2217
2218         d.scream_olds = &scream_olds;
2219         d.scream_pos = &scream_pos;
2220         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2221         if (!s) {
2222             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2223             goto phooey;        /* not present */
2224         }
2225     }
2226
2227
2228
2229     /* Simplest case:  anchored match need be tried only once. */
2230     /*  [unless only anchor is BOL and multiline is set] */
2231     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2232         if (s == startpos && regtry(&reginfo, &startpos))
2233             goto got_it;
2234         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2235                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2236         {
2237             char *end;
2238
2239             if (minlen)
2240                 dontbother = minlen - 1;
2241             end = HOP3c(strend, -dontbother, strbeg) - 1;
2242             /* for multiline we only have to try after newlines */
2243             if (prog->check_substr || prog->check_utf8) {
2244                 /* because of the goto we can not easily reuse the macros for bifurcating the
2245                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2246                 if (utf8_target) {
2247                     if (s == startpos)
2248                         goto after_try_utf8;
2249                     while (1) {
2250                         if (regtry(&reginfo, &s)) {
2251                             goto got_it;
2252                         }
2253                       after_try_utf8:
2254                         if (s > end) {
2255                             goto phooey;
2256                         }
2257                         if (prog->extflags & RXf_USE_INTUIT) {
2258                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2259                             if (!s) {
2260                                 goto phooey;
2261                             }
2262                         }
2263                         else {
2264                             s += UTF8SKIP(s);
2265                         }
2266                     }
2267                 } /* end search for check string in unicode */
2268                 else {
2269                     if (s == startpos) {
2270                         goto after_try_latin;
2271                     }
2272                     while (1) {
2273                         if (regtry(&reginfo, &s)) {
2274                             goto got_it;
2275                         }
2276                       after_try_latin:
2277                         if (s > end) {
2278                             goto phooey;
2279                         }
2280                         if (prog->extflags & RXf_USE_INTUIT) {
2281                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2282                             if (!s) {
2283                                 goto phooey;
2284                             }
2285                         }
2286                         else {
2287                             s++;
2288                         }
2289                     }
2290                 } /* end search for check string in latin*/
2291             } /* end search for check string */
2292             else { /* search for newline */
2293                 if (s > startpos) {
2294                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2295                     s--;
2296                 }
2297                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2298                 while (s <= end) { /* note it could be possible to match at the end of the string */
2299                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2300                         if (regtry(&reginfo, &s))
2301                             goto got_it;
2302                     }
2303                 }
2304             } /* end search for newline */
2305         } /* end anchored/multiline check string search */
2306         goto phooey;
2307     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2308     {
2309         /* the warning about reginfo.ganch being used without initialization
2310            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2311            and we only enter this block when the same bit is set. */
2312         char *tmp_s = reginfo.ganch - prog->gofs;
2313
2314         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2315             goto got_it;
2316         goto phooey;
2317     }
2318
2319     /* Messy cases:  unanchored match. */
2320     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2321         /* we have /x+whatever/ */
2322         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2323         char ch;
2324 #ifdef DEBUGGING
2325         int did_match = 0;
2326 #endif
2327         if (utf8_target) {
2328             if (! prog->anchored_utf8) {
2329                 to_utf8_substr(prog);
2330             }
2331             ch = SvPVX_const(prog->anchored_utf8)[0];
2332             REXEC_FBC_SCAN(
2333                 if (*s == ch) {
2334                     DEBUG_EXECUTE_r( did_match = 1 );
2335                     if (regtry(&reginfo, &s)) goto got_it;
2336                     s += UTF8SKIP(s);
2337                     while (s < strend && *s == ch)
2338                         s += UTF8SKIP(s);
2339                 }
2340             );
2341
2342         }
2343         else {
2344             if (! prog->anchored_substr) {
2345                 if (! to_byte_substr(prog)) {
2346                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2347                                             non_utf8_target_but_utf8_required));
2348                     goto phooey;
2349                 }
2350             }
2351             ch = SvPVX_const(prog->anchored_substr)[0];
2352             REXEC_FBC_SCAN(
2353                 if (*s == ch) {
2354                     DEBUG_EXECUTE_r( did_match = 1 );
2355                     if (regtry(&reginfo, &s)) goto got_it;
2356                     s++;
2357                     while (s < strend && *s == ch)
2358                         s++;
2359                 }
2360             );
2361         }
2362         DEBUG_EXECUTE_r(if (!did_match)
2363                 PerlIO_printf(Perl_debug_log,
2364                                   "Did not find anchored character...\n")
2365                );
2366     }
2367     else if (prog->anchored_substr != NULL
2368               || prog->anchored_utf8 != NULL
2369               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2370                   && prog->float_max_offset < strend - s)) {
2371         SV *must;
2372         I32 back_max;
2373         I32 back_min;
2374         char *last;
2375         char *last1;            /* Last position checked before */
2376 #ifdef DEBUGGING
2377         int did_match = 0;
2378 #endif
2379         if (prog->anchored_substr || prog->anchored_utf8) {
2380             if (utf8_target) {
2381                 if (! prog->anchored_utf8) {
2382                     to_utf8_substr(prog);
2383                 }
2384                 must = prog->anchored_utf8;
2385             }
2386             else {
2387                 if (! prog->anchored_substr) {
2388                     if (! to_byte_substr(prog)) {
2389                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2390                                             non_utf8_target_but_utf8_required));
2391                         goto phooey;
2392                     }
2393                 }
2394                 must = prog->anchored_substr;
2395             }
2396             back_max = back_min = prog->anchored_offset;
2397         } else {
2398             if (utf8_target) {
2399                 if (! prog->float_utf8) {
2400                     to_utf8_substr(prog);
2401                 }
2402                 must = prog->float_utf8;
2403             }
2404             else {
2405                 if (! prog->float_substr) {
2406                     if (! to_byte_substr(prog)) {
2407                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2408                                             non_utf8_target_but_utf8_required));
2409                         goto phooey;
2410                     }
2411                 }
2412                 must = prog->float_substr;
2413             }
2414             back_max = prog->float_max_offset;
2415             back_min = prog->float_min_offset;
2416         }
2417
2418         if (back_min<0) {
2419             last = strend;
2420         } else {
2421             last = HOP3c(strend,        /* Cannot start after this */
2422                   -(I32)(CHR_SVLEN(must)
2423                          - (SvTAIL(must) != 0) + back_min), strbeg);
2424         }
2425         if (s > PL_bostr)
2426             last1 = HOPc(s, -1);
2427         else
2428             last1 = s - 1;      /* bogus */
2429
2430         /* XXXX check_substr already used to find "s", can optimize if
2431            check_substr==must. */
2432         scream_pos = -1;
2433         dontbother = end_shift;
2434         strend = HOPc(strend, -dontbother);
2435         while ( (s <= last) &&
2436                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2437                                   (unsigned char*)strend, must,
2438                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2439             DEBUG_EXECUTE_r( did_match = 1 );
2440             if (HOPc(s, -back_max) > last1) {
2441                 last1 = HOPc(s, -back_min);
2442                 s = HOPc(s, -back_max);
2443             }
2444             else {
2445                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2446
2447                 last1 = HOPc(s, -back_min);
2448                 s = t;
2449             }
2450             if (utf8_target) {
2451                 while (s <= last1) {
2452                     if (regtry(&reginfo, &s))
2453                         goto got_it;
2454                     if (s >= last1) {
2455                         s++; /* to break out of outer loop */
2456                         break;
2457                     }
2458                     s += UTF8SKIP(s);
2459                 }
2460             }
2461             else {
2462                 while (s <= last1) {
2463                     if (regtry(&reginfo, &s))
2464                         goto got_it;
2465                     s++;
2466                 }
2467             }
2468         }
2469         DEBUG_EXECUTE_r(if (!did_match) {
2470             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2471                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2472             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2473                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2474                                ? "anchored" : "floating"),
2475                 quoted, RE_SV_TAIL(must));
2476         });
2477         goto phooey;
2478     }
2479     else if ( (c = progi->regstclass) ) {
2480         if (minlen) {
2481             const OPCODE op = OP(progi->regstclass);
2482             /* don't bother with what can't match */
2483             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2484                 strend = HOPc(strend, -(minlen - 1));
2485         }
2486         DEBUG_EXECUTE_r({
2487             SV * const prop = sv_newmortal();
2488             regprop(prog, prop, c);
2489             {
2490                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2491                     s,strend-s,60);
2492                 PerlIO_printf(Perl_debug_log,
2493                     "Matching stclass %.*s against %s (%d bytes)\n",
2494                     (int)SvCUR(prop), SvPVX_const(prop),
2495                      quoted, (int)(strend - s));
2496             }
2497         });
2498         if (find_byclass(prog, c, s, strend, &reginfo))
2499             goto got_it;
2500         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2501     }
2502     else {
2503         dontbother = 0;
2504         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2505             /* Trim the end. */
2506             char *last= NULL;
2507             SV* float_real;
2508             STRLEN len;
2509             const char *little;
2510
2511             if (utf8_target) {
2512                 if (! prog->float_utf8) {
2513                     to_utf8_substr(prog);
2514                 }
2515                 float_real = prog->float_utf8;
2516             }
2517             else {
2518                 if (! prog->float_substr) {
2519                     if (! to_byte_substr(prog)) {
2520                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2521                                             non_utf8_target_but_utf8_required));
2522                         goto phooey;
2523                     }
2524                 }
2525                 float_real = prog->float_substr;
2526             }
2527
2528             little = SvPV_const(float_real, len);
2529             if (SvTAIL(float_real)) {
2530                     /* This means that float_real contains an artificial \n on
2531                      * the end due to the presence of something like this:
2532                      * /foo$/ where we can match both "foo" and "foo\n" at the
2533                      * end of the string.  So we have to compare the end of the
2534                      * string first against the float_real without the \n and
2535                      * then against the full float_real with the \n.  We
2536                      * have to watch out for cases where the string might be
2537                      * smaller than the float_real or the float_real without
2538                      * the \n. */
2539                     char *checkpos= strend - len;
2540                     DEBUG_OPTIMISE_r(
2541                         PerlIO_printf(Perl_debug_log,
2542                             "%sChecking for float_real.%s\n",
2543                             PL_colors[4], PL_colors[5]));
2544                     if (checkpos + 1 < strbeg) {
2545                         /* can't match, even if we remove the trailing \n
2546                          * string is too short to match */
2547                         DEBUG_EXECUTE_r(
2548                             PerlIO_printf(Perl_debug_log,
2549                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2550                                 PL_colors[4], PL_colors[5]));
2551                         goto phooey;
2552                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2553                         /* can match, the end of the string matches without the
2554                          * "\n" */
2555                         last = checkpos + 1;
2556                     } else if (checkpos < strbeg) {
2557                         /* can't match, string is too short when the "\n" is
2558                          * included */
2559                         DEBUG_EXECUTE_r(
2560                             PerlIO_printf(Perl_debug_log,
2561                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2562                                 PL_colors[4], PL_colors[5]));
2563                         goto phooey;
2564                     } else if (!multiline) {
2565                         /* non multiline match, so compare with the "\n" at the
2566                          * end of the string */
2567                         if (memEQ(checkpos, little, len)) {
2568                             last= checkpos;
2569                         } else {
2570                             DEBUG_EXECUTE_r(
2571                                 PerlIO_printf(Perl_debug_log,
2572                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2573                                     PL_colors[4], PL_colors[5]));
2574                             goto phooey;
2575                         }
2576                     } else {
2577                         /* multiline match, so we have to search for a place
2578                          * where the full string is located */
2579                         goto find_last;
2580                     }
2581             } else {
2582                   find_last:
2583                     if (len)
2584                         last = rninstr(s, strend, little, little + len);
2585                     else
2586                         last = strend;  /* matching "$" */
2587             }
2588             if (!last) {
2589                 /* at one point this block contained a comment which was
2590                  * probably incorrect, which said that this was a "should not
2591                  * happen" case.  Even if it was true when it was written I am
2592                  * pretty sure it is not anymore, so I have removed the comment
2593                  * and replaced it with this one. Yves */
2594                 DEBUG_EXECUTE_r(
2595                     PerlIO_printf(Perl_debug_log,
2596                         "String does not contain required substring, cannot match.\n"
2597                     ));
2598                 goto phooey;
2599             }
2600             dontbother = strend - last + prog->float_min_offset;
2601         }
2602         if (minlen && (dontbother < minlen))
2603             dontbother = minlen - 1;
2604         strend -= dontbother;              /* this one's always in bytes! */
2605         /* We don't know much -- general case. */
2606         if (utf8_target) {
2607             for (;;) {
2608                 if (regtry(&reginfo, &s))
2609                     goto got_it;
2610                 if (s >= strend)
2611                     break;
2612                 s += UTF8SKIP(s);
2613             };
2614         }
2615         else {
2616             do {
2617                 if (regtry(&reginfo, &s))
2618                     goto got_it;
2619             } while (s++ < strend);
2620         }
2621     }
2622
2623     /* Failure. */
2624     goto phooey;
2625
2626 got_it:
2627     DEBUG_BUFFERS_r(
2628         if (swap)
2629             PerlIO_printf(Perl_debug_log,
2630                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2631                 PTR2UV(prog),
2632                 PTR2UV(swap)
2633             );
2634     );
2635     Safefree(swap);
2636     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2637
2638     if (PL_reg_state.re_state_eval_setup_done)
2639         restore_pos(aTHX_ prog);
2640     if (RXp_PAREN_NAMES(prog))
2641         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2642
2643     /* make sure $`, $&, $', and $digit will work later */
2644     if ( !(flags & REXEC_NOT_FIRST) ) {
2645         if (flags & REXEC_COPY_STR) {
2646 #ifdef PERL_OLD_COPY_ON_WRITE
2647             if ((SvIsCOW(sv)
2648                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2649                 if (DEBUG_C_TEST) {
2650                     PerlIO_printf(Perl_debug_log,
2651                                   "Copy on write: regexp capture, type %d\n",
2652                                   (int) SvTYPE(sv));
2653                 }
2654                 RX_MATCH_COPY_FREE(rx);
2655                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2656                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2657                 assert (SvPOKp(prog->saved_copy));
2658                 prog->sublen  = PL_regeol - strbeg;
2659                 prog->suboffset = 0;
2660                 prog->subcoffset = 0;
2661             } else
2662 #endif
2663             {
2664                 I32 min = 0;
2665                 I32 max = PL_regeol - strbeg;
2666                 I32 sublen;
2667
2668                 if (    (flags & REXEC_COPY_SKIP_POST)
2669                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2670                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2671                 ) { /* don't copy $' part of string */
2672                     U32 n = 0;
2673                     max = -1;
2674                     /* calculate the right-most part of the string covered
2675                      * by a capture. Due to look-ahead, this may be to
2676                      * the right of $&, so we have to scan all captures */
2677                     while (n <= prog->lastparen) {
2678                         if (prog->offs[n].end > max)
2679                             max = prog->offs[n].end;
2680                         n++;
2681                     }
2682                     if (max == -1)
2683                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2684                                 ? prog->offs[0].start
2685                                 : 0;
2686                     assert(max >= 0 && max <= PL_regeol - strbeg);
2687                 }
2688
2689                 if (    (flags & REXEC_COPY_SKIP_PRE)
2690                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2691                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2692                 ) { /* don't copy $` part of string */
2693                     U32 n = 0;
2694                     min = max;
2695                     /* calculate the left-most part of the string covered
2696                      * by a capture. Due to look-behind, this may be to
2697                      * the left of $&, so we have to scan all captures */
2698                     while (min && n <= prog->lastparen) {
2699                         if (   prog->offs[n].start != -1
2700                             && prog->offs[n].start < min)
2701                         {
2702                             min = prog->offs[n].start;
2703                         }
2704                         n++;
2705                     }
2706                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2707                         && min >  prog->offs[0].end
2708                     )
2709                         min = prog->offs[0].end;
2710
2711                 }
2712
2713                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2714                 sublen = max - min;
2715
2716                 if (RX_MATCH_COPIED(rx)) {
2717                     if (sublen > prog->sublen)
2718                         prog->subbeg =
2719                                 (char*)saferealloc(prog->subbeg, sublen+1);
2720                 }
2721                 else
2722                     prog->subbeg = (char*)safemalloc(sublen+1);
2723                 Copy(strbeg + min, prog->subbeg, sublen, char);
2724                 prog->subbeg[sublen] = '\0';
2725                 prog->suboffset = min;
2726                 prog->sublen = sublen;
2727                 RX_MATCH_COPIED_on(rx);
2728             }
2729             prog->subcoffset = prog->suboffset;
2730             if (prog->suboffset && utf8_target) {
2731                 /* Convert byte offset to chars.
2732                  * XXX ideally should only compute this if @-/@+
2733                  * has been seen, a la PL_sawampersand ??? */
2734
2735                 /* If there's a direct correspondence between the
2736                  * string which we're matching and the original SV,
2737                  * then we can use the utf8 len cache associated with
2738                  * the SV. In particular, it means that under //g,
2739                  * sv_pos_b2u() will use the previously cached
2740                  * position to speed up working out the new length of
2741                  * subcoffset, rather than counting from the start of
2742                  * the string each time. This stops
2743                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2744                  * from going quadratic */
2745                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2746                     sv_pos_b2u(sv, &(prog->subcoffset));
2747                 else
2748                     prog->subcoffset = utf8_length((U8*)strbeg,
2749                                         (U8*)(strbeg+prog->suboffset));
2750             }
2751         }
2752         else {
2753             RX_MATCH_COPY_FREE(rx);
2754             prog->subbeg = strbeg;
2755             prog->suboffset = 0;
2756             prog->subcoffset = 0;
2757             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2758         }
2759     }
2760
2761     return 1;
2762
2763 phooey:
2764     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2765                           PL_colors[4], PL_colors[5]));
2766     if (PL_reg_state.re_state_eval_setup_done)
2767         restore_pos(aTHX_ prog);
2768     if (swap) {
2769         /* we failed :-( roll it back */
2770         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2771             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2772             PTR2UV(prog),
2773             PTR2UV(prog->offs),
2774             PTR2UV(swap)
2775         ));
2776         Safefree(prog->offs);
2777         prog->offs = swap;
2778     }
2779     return 0;
2780 }
2781
2782 /* Make PL_reg_curpm refer to the regex Re2, handling ref counting; does
2783  * nothing unless re_state_eval_setup_done is set.  Do inc before dec, in case
2784  * old and new rex are the same.  NB: expands to a bare 'if' and uses Re2 twice */
2785 #define SET_reg_curpm(Re2) \
2786     if (PL_reg_state.re_state_eval_setup_done) {    \
2787         (void)ReREFCNT_inc(Re2);                    \
2788         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2789         PM_SETRE((PL_reg_curpm), (Re2));            \
2790     }
2791
2792
2793 /*
2794  - regtry - try match at specific point
2795  */
2796 STATIC I32                      /* 0 failure, 1 success */
2797 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2798 {
2799     dVAR;
2800     CHECKPOINT lastcp;
2801     REGEXP *const rx = reginfo->prog;
2802     regexp *const prog = (struct regexp *)SvANY(rx);
2803     I32 result;
2804     RXi_GET_DECL(prog,progi);
2805     GET_RE_DEBUG_FLAGS_DECL;
2806
2807     PERL_ARGS_ASSERT_REGTRY;
2808
2809     reginfo->cutpoint=NULL;
2810
2811     if ((prog->extflags & RXf_EVAL_SEEN)
2812         && !PL_reg_state.re_state_eval_setup_done)
2813     {
2814         MAGIC *mg;
2815
2816         PL_reg_state.re_state_eval_setup_done = TRUE;
2817         if (reginfo->sv) {
2818             /* Make $_ available to executed code. */
2819             if (reginfo->sv != DEFSV) {
2820                 SAVE_DEFSV;
2821                 DEFSV_set(reginfo->sv);
2822             }
2823
2824             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2825                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2826                 /* prepare for quick setting of pos */
2827 #ifdef PERL_OLD_COPY_ON_WRITE
2828                 if (SvIsCOW(reginfo->sv))
2829                     sv_force_normal_flags(reginfo->sv, 0);
2830 #endif
2831                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2832                                  &PL_vtbl_mglob, NULL, 0);
2833                 mg->mg_len = -1;
2834             }
2835             PL_reg_magic    = mg;
2836             PL_reg_oldpos   = mg->mg_len;
2837             SAVEDESTRUCTOR_X(restore_pos, prog);
2838         }
2839         if (!PL_reg_curpm) {
2840             Newxz(PL_reg_curpm, 1, PMOP);
2841 #ifdef USE_ITHREADS
2842             {
2843                 SV* const repointer = &PL_sv_undef;
2844                 /* this regexp is also owned by the new PL_reg_curpm, which
2845                    will try to free it.  */
2846                 av_push(PL_regex_padav, repointer);
2847                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2848                 PL_regex_pad = AvARRAY(PL_regex_padav);
2849             }
2850 #endif
2851         }
2852         SET_reg_curpm(rx);
2853         PL_reg_oldcurpm = PL_curpm;
2854         PL_curpm = PL_reg_curpm;
2855         if (RXp_MATCH_COPIED(prog)) {
2856             /*  Here is a serious problem: we cannot rewrite subbeg,
2857                 since it may be needed if this match fails.  Thus
2858                 $` inside (?{}) could fail... */
2859             PL_reg_oldsaved = prog->subbeg;
2860             PL_reg_oldsavedlen = prog->sublen;
2861             PL_reg_oldsavedoffset = prog->suboffset;
2862             PL_reg_oldsavedcoffset = prog->suboffset;
2863 #ifdef PERL_OLD_COPY_ON_WRITE
2864             PL_nrs = prog->saved_copy;
2865 #endif
2866             RXp_MATCH_COPIED_off(prog);
2867         }
2868         else
2869             PL_reg_oldsaved = NULL;
2870         prog->subbeg = PL_bostr;
2871         prog->suboffset = 0;
2872         prog->subcoffset = 0;
2873         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2874     }
2875 #ifdef DEBUGGING
2876     PL_reg_starttry = *startposp;
2877 #endif
2878     prog->offs[0].start = *startposp - PL_bostr;
2879     prog->lastparen = 0;
2880     prog->lastcloseparen = 0;
2881     PL_regsize = 0;
2882
2883     /* XXXX What this code is doing here?!!!  There should be no need
2884        to do this again and again, prog->lastparen should take care of
2885        this!  --ilya*/
2886
2887     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2888      * Actually, the code in regcppop() (which Ilya may be meaning by
2889      * prog->lastparen), is not needed at all by the test suite
2890      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2891      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2892      * Meanwhile, this code *is* needed for the
2893      * above-mentioned test suite tests to succeed.  The common theme
2894      * on those tests seems to be returning null fields from matches.
2895      * --jhi updated by dapm */
2896 #if 1
2897     if (prog->nparens) {
2898         regexp_paren_pair *pp = prog->offs;
2899         I32 i;
2900         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2901             ++pp;
2902             pp->start = -1;
2903             pp->end = -1;
2904         }
2905     }
2906 #endif
2907     REGCP_SET(lastcp);
2908     result = regmatch(reginfo, *startposp, progi->program + 1);
2909     if (result != -1) {
2910         prog->offs[0].end = result;
2911         return 1;
2912     }
2913     if (reginfo->cutpoint)
2914         *startposp= reginfo->cutpoint;
2915     REGCP_UNWIND(lastcp);
2916     return 0;
2917 }
2918
2919 /* shorthands: jump to the 'yes', 'no' and 'no_silent' labels respectively */
2920 #define sayYES goto yes
2921 #define sayNO goto no
2922 #define sayNO_SILENT goto no_silent
2923 /* CACHEsayNO: OR ST.cache_mask (when non-zero) into the PL_reg_poscache
2924  * slot ST.cache_offset, then sayNO.  We don't use STMT_START/END here because
2925  * it leads to "unreachable code" warnings, which are bogus, but distracting. */
2926 #define CACHEsayNO \
2927     if (ST.cache_mask) \
2928        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2929     sayNO
2930
2931 /* this is used to determine how far from the left messages like
2932    'failed...' are printed. It should be set such that messages
2933    are in line with the regop output that created them.
2934 */
2935 #define REPORT_CODE_OFF 32
2936
2937
2938 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2939 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2940 /* first and last regmatch_state slot of the slab s */
2941 #define SLAB_FIRST(s) (&(s)->states[0])
2942 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2943
2944 /* move on to the next slab, allocating and chaining one if there is none
2945  * yet, and return the first slot in it */
2946 STATIC regmatch_state *
2947 S_push_slab(pTHX)
2948 {
2949 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2950     dMY_CXT;
2951 #endif
2952     regmatch_slab *fresh = PL_regmatch_slab->next;
2953     if (fresh == NULL) {
2954         Newx(fresh, 1, regmatch_slab);
2955         fresh->next = NULL;
2956         fresh->prev = PL_regmatch_slab;
2957         PL_regmatch_slab->next = fresh;
2958     }
2959     PL_regmatch_slab = fresh;
2960     return SLAB_FIRST(fresh);
2961 }
2962
2963
2964 /* push a new state then goto it: records input in pushinput, node in scan
2965  * and state in st->resume_state, then jumps to the push_state label */
2966 #define PUSH_STATE_GOTO(state, node, input) \
2967     pushinput = input; \
2968     scan = node; \
2969     st->resume_state = state; \
2970     goto push_state;
2971
2972 /* push a new state with success backtracking, then goto it: as above, but
2973  * jumps to the push_yes_state label instead */
2974 #define PUSH_YES_STATE_GOTO(state, node, input) \
2975     pushinput = input; \
2976     scan = node; \
2977     st->resume_state = state; \
2978     goto push_yes_state;
2979
2980
2981
2982
2983 /*
2984
2985 regmatch() - main matching routine
2986
2987 This is basically one big switch statement in a loop. We execute an op,
2988 set 'next' to point to the next op, and continue. If we come to a point which
2989 we may need to backtrack to on failure such as (A|B|C), we push a
2990 backtrack state onto the backtrack stack. On failure, we pop the top
2991 state, and re-enter the loop at the state indicated. If there are no more
2992 states to pop, we return failure.
2993
2994 Sometimes we also need to backtrack on success; for example /A+/, where
2995 after successfully matching one A, we need to go back and try to
2996 match another one; similarly for lookahead assertions: if the assertion
2997 completes successfully, we backtrack to the state just before the assertion
2998 and then carry on.  In these cases, the pushed state is marked as
2999 'backtrack on success too'. This marking is in fact done by a chain of
3000 pointers, each pointing to the previous 'yes' state. On success, we pop to
3001 the nearest yes state, discarding any intermediate failure-only states.
3002 Sometimes a yes state is pushed just to force some cleanup code to be
3003 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3004 it to free the inner regex.
3005
3006 Note that failure backtracking rewinds the cursor position, while
3007 success backtracking leaves it alone.
3008
3009 A pattern is complete when the END op is executed, while a subpattern
3010 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3011 ops trigger the "pop to last yes state if any, otherwise return true"
3012 behaviour.
3013
3014 A common convention in this function is to use A and B to refer to the two
3015 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3016 the subpattern to be matched possibly multiple times, while B is the entire
3017 rest of the pattern. Variable and state names reflect this convention.
3018
3019 The states in the main switch are the union of ops and failure/success of
3020 substates associated with that op.  For example, IFMATCH is the op
3021 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3022 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3023 successfully matched A and IFMATCH_A_fail is a state saying that we have
3024 just failed to match A. Resume states always come in pairs. The backtrack
3025 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3026 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3027 on success or failure.
3028
3029 The struct that holds a backtracking state is actually a big union, with
3030 one variant for each major type of op. The variable st points to the
3031 top-most backtrack struct. To make the code clearer, within each
3032 block of code we #define ST to alias the relevant union.
3033
3034 Here's a concrete example of a (vastly oversimplified) IFMATCH
3035 implementation:
3036
3037     switch (state) {
3038     ....
3039
3040 #define ST st->u.ifmatch
3041
3042     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3043         ST.foo = ...; // some state we wish to save
3044         ...
3045         // push a yes backtrack state with a resume value of
3046         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3047         // first node of A:
3048         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3049         // NOTREACHED
3050
3051     case IFMATCH_A: // we have successfully executed A; now continue with B
3052         next = B;
3053         bar = ST.foo; // do something with the preserved value
3054         break;
3055
3056     case IFMATCH_A_fail: // A failed, so the assertion failed
3057         ...;   // do some housekeeping, then ...
3058         sayNO; // propagate the failure
3059
3060 #undef ST
3061
3062     ...
3063     }
3064
3065 For any old-timers reading this who are familiar with the old recursive
3066 approach, the code above is equivalent to:
3067
3068     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3069     {
3070         int foo = ...
3071         ...
3072         if (regmatch(A)) {
3073             next = B;
3074             bar = foo;
3075             break;
3076         }
3077         ...;   // do some housekeeping, then ...
3078         sayNO; // propagate the failure
3079     }
3080
3081 The topmost backtrack state, pointed to by st, is usually free. If you
3082 want to claim it, populate any ST.foo fields in it with values you wish to
3083 save, then do one of
3084
3085         PUSH_STATE_GOTO(resume_state, node, newinput);
3086         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3087
3088 which sets that backtrack state's resume value to 'resume_state', pushes a
3089 new free entry to the top of the backtrack stack, then goes to 'node'.
3090 On backtracking, the free slot is popped, and the saved state becomes the
3091 new free state. An ST.foo field in this new top state can be temporarily
3092 accessed to retrieve values, but once the main loop is re-entered, it
3093 becomes available for reuse.
3094
3095 Note that the depth of the backtrack stack constantly increases during the
3096 left-to-right execution of the pattern, rather than going up and down with
3097 the pattern nesting. For example the stack is at its maximum at Z at the
3098 end of the pattern, rather than at X in the following:
3099
3100     /(((X)+)+)+....(Y)+....Z/
3101
3102 The only exceptions to this are lookahead/behind assertions and the cut,
3103 (?>A), which pop all the backtrack states associated with A before
3104 continuing.
3105
3106 Backtrack state structs are allocated in slabs of about 4K in size.
3107 PL_regmatch_state and st always point to the currently active state,
3108 and PL_regmatch_slab points to the slab currently containing
3109 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3110 allocated, and is never freed until interpreter destruction. When the slab
3111 is full, a new one is allocated and chained to the end. At exit from
3112 regmatch(), slabs allocated since entry are freed.
3113
3114 */
3115 /* Debug trace of the state st (inside DEBUG_STATE_r): the exec position, the
3116  * tag pp, st's resume-state name, and [Y] / [M] if st is yes_state / mark_state */
3117 #define DEBUG_STATE_pp(pp)                                  \
3118     DEBUG_STATE_r({                                         \
3119         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3120         PerlIO_printf(Perl_debug_log,                       \
3121             "    %*s"pp" %s%s%s%s%s\n",                     \
3122             depth*2, "",                                    \
3123             PL_reg_name[st->resume_state],                     \
3124             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3125             ((st==yes_state) ? "Y" : ""),                   \
3126             ((st==mark_state) ? "M" : ""),                  \
3127             ((st==yes_state||st==mark_state) ? "]" : "")    \
3128         );                                                  \
3129     });
3130
3131 /* index of node x relative to 'prog' in the expanding scope; -1 for a NULL x */
3132 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3133
3134 #ifdef DEBUGGING
3135 /* Debug helper: print "<blurb> REx <pattern> against <string>"; then, if either is UTF-8, say which */
3136 STATIC void
3137 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3138     const char *start, const char *end, const char *blurb)
3139 {
3140     const bool pat_is_utf8 = RX_UTF8(prog) ? 1 : 0;
3141
3142     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3143
3144     if (!PL_colorset) {
3145         reginitcolors();
3146     }
3147     {
3148         /* display text for the pattern and for the start..end target range */
3149         RE_PV_QUOTED_DECL(pat_txt, pat_is_utf8, PERL_DEBUG_PAD_ZERO(0),
3150             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3151         RE_PV_QUOTED_DECL(str_txt, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3152             start, end - start, 60);
3153         const char * const joiner = (pat_is_utf8 && utf8_target) ? " and " : "";
3154
3155         PerlIO_printf(Perl_debug_log, "%s%s REx%s %s against %s\n",
3156             PL_colors[4], blurb, PL_colors[5], pat_txt, str_txt);
3157
3158         if (pat_is_utf8 || utf8_target) {
3159             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3160                 pat_is_utf8 ? "pattern" : "", joiner,
3161                 utf8_target ? "string" : "");
3162         }
3163     }
3164 }
3165 /* Debug helper: print locinput's offset and a short window of text around it */
3166 STATIC void
3167 S_dump_exec_pos(pTHX_ const char *locinput,
3168                       const regnode *scan,
3169                       const char *loc_regeol,
3170                       const char *loc_bostr,
3171                       const char *loc_reg_starttry,
3172                       const bool utf8_target)
3173 {
3174     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3175     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3176     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3177     /* The part of the string before starttry has one color
3178        (pref0_len chars), between starttry and current
3179        position another one (pref_len - pref0_len chars),
3180        after the current position the third one.
3181        We assume that pref0_len <= pref_len, otherwise we
3182        decrease pref0_len.  */
3183     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3184         ? (5 + taill) - l : locinput - loc_bostr;
3185     int pref0_len;
3186
3187     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3188     /* widen the prefix while it would start on a UTF-8 continuation byte */
3189     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3190         pref_len++;
3191     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3192     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3193         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3194               ? (5 + taill) - pref_len : loc_regeol - locinput);
3195     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3196         l--;    /* likewise, shrink the tail while it would end mid-character */
3197     if (pref0_len < 0)
3198         pref0_len = 0;
3199     if (pref0_len > pref_len)
3200         pref0_len = pref_len;
3201     {
3202         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3203         /* three segments: before starttry, starttry..locinput, after locinput */
3204         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3205             (locinput - pref_len),pref0_len, 60, 4, 5);
3206
3207         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3208                     (locinput - pref_len + pref0_len),
3209                     pref_len - pref0_len, 60, 2, 3);
3210
3211         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3212                     locinput, loc_regeol - locinput, 10, 0, 1);
3213
3214         const STRLEN tlen=len0+len1+len2;
3215         PerlIO_printf(Perl_debug_log,
3216                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3217                     (IV)(locinput - loc_bostr),
3218                     len0, s0,
3219                     len1, s1,
3220                     (docolor ? "" : "> <"),
3221                     len2, s2,
3222                     (int)(tlen > 19 ? 0 :  19 - tlen),
3223                     "");
3224     }
3225 }
3226
3227 #endif
3228
3229 /* reg_check_named_buff_matched()
3230  * Reports whether any of the capture buffers that share a name has matched.
3231  * The candidate buffer numbers are read as an I32 array from the PV of the
3232  * SV stored at rexi->data->data[ARG(scan)]; that SV's IV gives the count.
3233  * The name itself plays no part here: only the numbers are consulted.
3234  * A buffer counts as matched when its number does not exceed rex->lastparen
3235  * and its end offset is not -1.  Returns the first such buffer number in
3236  * array order, or 0 if none of the buffers matched.
3237  */
3238 STATIC I32
3239 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3240 {
3241     I32 idx = 0;
3242     RXi_GET_DECL(rex,rexi);
3243     SV *name_data = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3244     I32 *parens = (I32*)SvPVX(name_data);
3245
3246     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3247
3248     while (idx < SvIVX(name_data)) {
3249         const I32 paren = parens[idx++];
3250         /* not opened during this match, or never closed: try the next one */
3251         if ((I32)rex->lastparen < paren || rex->offs[paren].end == -1)
3252             continue;
3253         return paren;
3254     }
3255     return 0;
3256 }
3257
3258
3259 /* detach and free every slab chained after the current one - called during LEAVE_SCOPE */
3260
3261 STATIC void
3262 S_clear_backtrack_stack(pTHX_ void *p)
3263 {
3264     regmatch_slab *doomed = PL_regmatch_slab->next;
3265     PERL_UNUSED_ARG(p);
3266
3267     if (doomed) {
3268         PL_regmatch_slab->next = NULL;
3269         do {
3270             regmatch_slab * const following = doomed->next;
3271             Safefree(doomed);
3272             doomed = following;
3273         } while (doomed);
3274     }
3275 }
3276 static bool
3277 S_setup_EXACTISH_ST_c1_c2(pTHX_ regnode *text_node, I32 *c1, I32 *c2)
3278 {
3279     /* This sets up a relatively quick check for the initial part of what must
3280      * match after a CURLY-type operation condition (the "B" in A*B), where B
3281      * starts with an EXACTish node, <text_node>.  If this check is not met,
3282      * the caller knows that it should continue with the loop.  If the check is
3283      * met, the caller must see if all of B is met, before making the decision.
3284      *
3285      * This function sets *<c1> and *<c2> to be the first code point of B.  If
3286      * there are two possible such code points (as when the text_node is
3287      * folded), *<c2> is set to the second.  If there are more than two (which
3288      * happens for some folds), or there is some other complication, these
3289      * parameters are set to CHRTEST_VOID, to indicate not to do a quick check:
3290      * just try all of B after every time through the loop.
3291      *
3292      * If the routine determines that there is no possible way for there to be
3293      * a match, it returns FALSE.
3294      * */
3295
3296     const bool utf8_target = PL_reg_match_utf8;
3297     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3298     dVAR;
3299
3300     /* First byte from the EXACTish node */
3301     U8 *pat_byte = (U8*)STRING(text_node);
3302
3303     if (! UTF_PATTERN) {    /* Not UTF-8: the code point is the byte */
3304         *c1 = *pat_byte;
3305         if (OP(text_node) == EXACT) {
3306             *c2 = *c1;
3307         }
3308         else if (utf8_target
3309                  && HAS_NONLATIN1_FOLD_CLOSURE(*c1)
3310                  && (OP(text_node) != EXACTFA || ! isASCII(*c1)))
3311         {
3312             /* Here, there could be something above Latin1 in the target which
3313              * folds to this character in the pattern, which means there are
3314              * more than two possible beginnings of B. */
3315             *c1 = *c2 = CHRTEST_VOID;
3316         }
3317         else { /* Here nothing above Latin1 can fold to the pattern character */
3318             switch (OP(text_node)) {
3319
3320                 case EXACTFL:   /* /l rules */
3321                     *c2 = PL_fold_locale[*c1];
3322                     break;
3323
3324                 case EXACTFU_SS: /* This requires special handling: Don't
3325                                     shortcut */
3326                     *c1 = *c2 = CHRTEST_VOID;
3327                     break;
3328
3329                 case EXACTF:
3330                     if (! utf8_target) {    /* /d rules */
3331                         *c2 = PL_fold[*c1];
3332                         break;
3333                     }
3334                     /* FALLTHROUGH */
3335                     /* /u rules for all these.  This happens to work for
3336                      * EXACTFA in the ASCII range as nothing in Latin1 folds to
3337                      * ASCII */
3338                 case EXACTFA:
3339                 case EXACTFU_TRICKYFOLD:
3340                 case EXACTFU:
3341                     *c2 = PL_fold_latin1[*c1];
3342                     break;
3343
3344                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3345             }
3346         }
3347     }
3348     else { /* UTF_PATTERN */
3349         if (OP(text_node) == EXACT) {
3350             *c2 = *c1 = utf8n_to_uvchr(pat_byte, UTF8_MAXBYTES, 0, uniflags);
3351             if (*c1 < 0) {  /* Overflowed what we can handle */
3352                 *c1 = *c2 = CHRTEST_VOID;
3353             }
3354             else if (*c1 > 255 && ! utf8_target) {
3355                 return FALSE; /* Can't possibly match */
3356             }
3357         }
3358         else {
3359             if (UTF8_IS_ABOVE_LATIN1(*pat_byte)) {
3360
3361                 /* A multi-character fold is complicated, probably has more
3362                  * than two possibilities */
3363                 if (is_MULTI_CHAR_FOLD_utf8_safe((char*) pat_byte,
3364                                         (char*) pat_byte + STR_LEN(text_node)))
3365                 {
3366                     *c1 = *c2 = CHRTEST_VOID;
3367                 }
3368                 else {  /* Not a multi-char fold */
3369
3370                     /* Load the folds hash, if not already done */
3371                     SV** listp;
3372                     if (! PL_utf8_foldclosures) {
3373                         if (! PL_utf8_tofold) {
3374                             U8 dummy[UTF8_MAXBYTES+1];
3375                             STRLEN dummy_len;
3376
3377                             /* Force loading this by folding an above-Latin1
3378                              * char */
3379                             to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
3380                             assert(PL_utf8_tofold); /* Verify that worked */
3381                         }
3382                         PL_utf8_foldclosures =
3383                                          _swash_inversion_hash(PL_utf8_tofold);
3384                     }
3385
3386                     /* The fold closures data structure is a hash with the keys
3387                     * being every character that is folded to, like 'k', and
3388                     * the values each an array of everything that folds to its
3389                     * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
3390                     if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3391                                              (char *) pat_byte,
3392                                              UTF8SKIP(pat_byte),
3393                                              FALSE))))
3394                     {
3395                         /* Not found in the hash, therefore there are no folds
3396                          * containing it, so there is only a single char
3397                          * possible for beginning B */
3398                         *c2 = *c1 = utf8n_to_uvchr(pat_byte, STR_LEN(text_node),
3399                                                                   0, uniflags);
3400                         if (*c1 < 0) {  /* Overflowed what we can handle */
3401                             *c1 = *c2 = CHRTEST_VOID;
3402                         }
3403                     }
3404                     else {
3405                         AV* list = (AV*) *listp;
3406                         if (av_len(list) != 1) {    /* If there aren't exactly
3407                                                        two folds to this, have
3408                                                        to test B completely */
3409                             *c1 = *c2 = CHRTEST_VOID;
3410                         }
3411                         else {  /* There are two.  Set *c1 and *c2 to them */
3412                             SV** c_p = av_fetch(list, 0, FALSE);
3413                             if (c_p == NULL) {
3414                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3415                             }
3416                             *c1 = SvUV(*c_p);
3417                             c_p = av_fetch(list, 1, FALSE);
3418                             if (c_p == NULL) {
3419                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3420                             }
3421                             *c2 = SvUV(*c_p);
3422                         }
3423                     }
3424                 }
3425             }
3426             else {
3427                 /* Get the character represented by the UTF-8-encoded byte */
3428                 U8 c = (UTF8_IS_INVARIANT(*pat_byte))
3429                         ? *pat_byte
3430                         : TWO_BYTE_UTF8_TO_UNI(*pat_byte, *(pat_byte+1));
3431
3432                 if (HAS_NONLATIN1_FOLD_CLOSURE(c)
3433                     && (OP(text_node) != EXACTFA || ! isASCII(c)))
3434                 {   /* Something above Latin1 folds to this; hence there are
3435                        more than 2 possibilities for B to begin with */
3436                     *c1 = *c2 = CHRTEST_VOID;
3437                 }
3438                 else {
3439                     *c1 = c;
3440                     *c2 = (OP(text_node) == EXACTFL)
3441                            ? PL_fold_locale[*c1]
3442                            : PL_fold_latin1[*c1];
3443                 }
3444             }
3445         }
3446     }
3447
3448     return TRUE;
3449 }
3450
3451 /* returns -1 on failure, $+[0] on success */
3452 STATIC I32
3453 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3454 {
3455 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3456     dMY_CXT;
3457 #endif
3458     dVAR;
3459     const bool utf8_target = PL_reg_match_utf8;
3460     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3461     REGEXP *rex_sv = reginfo->prog;
3462     regexp *rex = (struct regexp *)SvANY(rex_sv);
3463     RXi_GET_DECL(rex,rexi);
3464     I32 oldsave;
3465     /* the current state. This is a cached copy of PL_regmatch_state */
3466     regmatch_state *st;
3467     /* cache heavily used fields of st in registers */
3468     regnode *scan;
3469     regnode *next;
3470     U32 n = 0;  /* general value; init to avoid compiler warning */
3471     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3472     char *locinput = startpos;
3473     char *pushinput; /* where to continue after a PUSH */
3474     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3475
3476     bool result = 0;        /* return value of S_regmatch */
3477     int depth = 0;          /* depth of backtrack stack */
3478     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3479     const U32 max_nochange_depth =
3480         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3481         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3482     regmatch_state *yes_state = NULL; /* state to pop to on success of
3483                                                             subpattern */
3484     /* mark_state piggy backs on the yes_state logic so that when we unwind
3485        the stack on success we can update the mark_state as we go */
3486     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3487     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3488     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3489     U32 state_num;
3490     bool no_final = 0;      /* prevent failure from backtracking? */
3491     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3492     char *startpoint = locinput;
3493     SV *popmark = NULL;     /* are we looking for a mark? */
3494     SV *sv_commit = NULL;   /* last mark name seen in failure */
3495     SV *sv_yes_mark = NULL; /* last mark name we have seen
3496                                during a successful match */
3497     U32 lastopen = 0;       /* last open we saw */
3498     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3499     SV* const oreplsv = GvSV(PL_replgv);
3500     /* these three flags are set by various ops to signal information to
3501      * the very next op. They have a useful lifetime of exactly one loop
3502      * iteration, and are not preserved or restored by state pushes/pops
3503      */
3504     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3505     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3506     int logical = 0;        /* the following EVAL is:
3507                                 0: (?{...})
3508                                 1: (?(?{...})X|Y)
3509                                 2: (??{...})
3510                                or the following IFMATCH/UNLESSM is:
3511                                 false: plain (?=foo)
3512                                 true:  used as a condition: (?(?=foo))
3513                             */
3514     PAD* last_pad = NULL;
3515     dMULTICALL;
3516     I32 gimme = G_SCALAR;
3517     CV *caller_cv = NULL;       /* who called us */
3518     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3519     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3520
3521 #ifdef DEBUGGING
3522     GET_RE_DEBUG_FLAGS_DECL;
3523 #endif
3524
3525     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3526     multicall_oldcatch = 0;
3527     multicall_cv = NULL;
3528     cx = NULL;
3529     PERL_UNUSED_VAR(multicall_cop);
3530     PERL_UNUSED_VAR(newsp);
3531
3532
3533     PERL_ARGS_ASSERT_REGMATCH;
3534
3535     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3536             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3537     }));
3538     /* on first ever call to regmatch, allocate first slab */
3539     if (!PL_regmatch_slab) {
3540         Newx(PL_regmatch_slab, 1, regmatch_slab);
3541         PL_regmatch_slab->prev = NULL;
3542         PL_regmatch_slab->next = NULL;
3543         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3544     }
3545
3546     oldsave = PL_savestack_ix;
3547     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3548     SAVEVPTR(PL_regmatch_slab);
3549     SAVEVPTR(PL_regmatch_state);
3550
3551     /* grab next free state slot */
3552     st = ++PL_regmatch_state;
3553     if (st >  SLAB_LAST(PL_regmatch_slab))
3554         st = PL_regmatch_state = S_push_slab(aTHX);
3555
3556     /* Note that nextchr is a byte even in UTF */
3557     SET_nextchr;
3558     scan = prog;
3559     while (scan != NULL) {
3560
3561         DEBUG_EXECUTE_r( {
3562             SV * const prop = sv_newmortal();
3563             regnode *rnext=regnext(scan);
3564             DUMP_EXEC_POS( locinput, scan, utf8_target );
3565             regprop(rex, prop, scan);
3566
3567             PerlIO_printf(Perl_debug_log,
3568                     "%3"IVdf":%*s%s(%"IVdf")\n",
3569                     (IV)(scan - rexi->program), depth*2, "",
3570                     SvPVX_const(prop),
3571                     (PL_regkind[OP(scan)] == END || !rnext) ?
3572                         0 : (IV)(rnext - rexi->program));
3573         });
3574
3575         next = scan + NEXT_OFF(scan);
3576         if (next == scan)
3577             next = NULL;
3578         state_num = OP(scan);
3579
3580       reenter_switch:
3581
3582         SET_nextchr;
3583
3584         switch (state_num) {
3585         case BOL: /*  /^../  */
3586             if (locinput == PL_bostr)
3587             {
3588                 /* reginfo->till = reginfo->bol; */
3589                 break;
3590             }
3591             sayNO;
3592
3593         case MBOL: /*  /^../m  */
3594             if (locinput == PL_bostr ||
3595                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3596             {
3597                 break;
3598             }
3599             sayNO;
3600
3601         case SBOL: /*  /^../s  */
3602             if (locinput == PL_bostr)
3603                 break;
3604             sayNO;
3605
3606         case GPOS: /*  \G  */
3607             if (locinput == reginfo->ganch)
3608                 break;
3609             sayNO;
3610
3611         case KEEPS: /*   \K  */
3612             /* update the startpoint */
3613             st->u.keeper.val = rex->offs[0].start;
3614             rex->offs[0].start = locinput - PL_bostr;
3615             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3616             /*NOT-REACHED*/
3617         case KEEPS_next_fail:
3618             /* rollback the start point change */
3619             rex->offs[0].start = st->u.keeper.val;
3620             sayNO_SILENT;
3621             /*NOT-REACHED*/
3622
3623         case EOL: /* /..$/  */
3624                 goto seol;
3625
3626         case MEOL: /* /..$/m  */
3627             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3628                 sayNO;
3629             break;
3630
3631         case SEOL: /* /..$/s  */
3632           seol:
3633             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3634                 sayNO;
3635             if (PL_regeol - locinput > 1)
3636                 sayNO;
3637             break;
3638
3639         case EOS: /*  \z  */
3640             if (!NEXTCHR_IS_EOS)
3641                 sayNO;
3642             break;
3643
3644         case SANY: /*  /./s  */
3645             if (NEXTCHR_IS_EOS)
3646                 sayNO;
3647             goto increment_locinput;
3648
3649         case CANY: /*  \C  */
3650             if (NEXTCHR_IS_EOS)
3651                 sayNO;
3652             locinput++;
3653             break;
3654
3655         case REG_ANY: /*  /./  */
3656             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3657                 sayNO;
3658             goto increment_locinput;
3659
3660
3661 #undef  ST
3662 #define ST st->u.trie
3663         case TRIEC: /* (ab|cd) with known charclass */
3664             /* In this case the charclass data is available inline so
3665                we can fail fast without a lot of extra overhead.
3666              */
3667             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3668                 DEBUG_EXECUTE_r(
3669                     PerlIO_printf(Perl_debug_log,
3670                               "%*s  %sfailed to match trie start class...%s\n",
3671                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3672                 );
3673                 sayNO_SILENT;
3674                 assert(0); /* NOTREACHED */
3675             }
3676             /* FALL THROUGH */
3677         case TRIE:  /* (ab|cd)  */
3678             /* the basic plan of execution of the trie is:
3679              * At the beginning, run through all the states, and
3680              * find the longest-matching word. Also remember the position
3681              * of the shortest matching word. For example, this pattern:
3682              *    1  2 3 4    5
3683              *    ab|a|x|abcd|abc
3684              * when matched against the string "abcde", will generate
3685              * accept states for all words except 3, with the longest
3686              * matching word being 4, and the shortest being 2 (with
3687              * the position being after char 1 of the string).
3688              *
3689              * Then for each matching word, in word order (i.e. 1,2,4,5),
3690              * we run the remainder of the pattern; on each try setting
3691              * the current position to the character following the word,
3692              * returning to try the next word on failure.
3693              *
3694              * We avoid having to build a list of words at runtime by
3695              * using a compile-time structure, wordinfo[].prev, which
3696              * gives, for each word, the previous accepting word (if any).
3697              * In the case above it would contain the mappings 1->2, 2->0,
3698              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3699              * the longest word (4 above), a list of all words, by
3700              * following the list of prev pointers; this gives us the
3701              * unordered list 4,5,1,2. Then given the current word we have
3702              * just tried, we can go through the list and find the
3703              * next-biggest word to try (so if we just failed on word 2,
3704              * the next in the list is 4).
3705              *
3706              * Since at runtime we don't record the matching position in
3707              * the string for each word, we have to work that out for
3708              * each word we're about to process. The wordinfo table holds
3709              * the character length of each word; given that we recorded
3710              * at the start: the position of the shortest word and its
3711              * length in chars, we just need to move the pointer the
3712              * difference between the two char lengths. Depending on
3713              * Unicode status and folding, that's cheap or expensive.
3714              *
3715              * This algorithm is optimised for the case where there are only a
3716              * small number of accept states, i.e. 0,1, or maybe 2.
3717              * With lots of accept states, and having to try all of them,
3718              * it becomes quadratic on number of accept states to find all
3719              * the next words.
3720              */
3721
3722             {
3723                 /* what type of TRIE am I? (utf8 makes this contextual) */
3724                 DECL_TRIE_TYPE(scan);
3725
3726                 /* what trie are we using right now */
3727                 reg_trie_data * const trie
3728                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3729                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3730                 U32 state = trie->startstate;
3731
3732                 if (   trie->bitmap
3733                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3734                 {
3735                     if (trie->states[ state ].wordnum) {
3736                          DEBUG_EXECUTE_r(
3737                             PerlIO_printf(Perl_debug_log,
3738                                           "%*s  %smatched empty string...%s\n",
3739                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3740                         );
3741                         if (!trie->jump)
3742                             break;
3743                     } else {
3744                         DEBUG_EXECUTE_r(
3745                             PerlIO_printf(Perl_debug_log,
3746                                           "%*s  %sfailed to match trie start class...%s\n",
3747                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3748                         );
3749                         sayNO_SILENT;
3750                    }
3751                 }
3752
3753             {
3754                 U8 *uc = ( U8* )locinput;
3755
3756                 STRLEN len = 0;
3757                 STRLEN foldlen = 0;
3758                 U8 *uscan = (U8*)NULL;
3759                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3760                 U32 charcount = 0; /* how many input chars we have matched */
3761                 U32 accepted = 0; /* have we seen any accepting states? */
3762
3763                 ST.jump = trie->jump;
3764                 ST.me = scan;
3765                 ST.firstpos = NULL;
3766                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3767                 ST.nextword = 0;
3768
3769                 /* fully traverse the TRIE; note the position of the
3770                    shortest accept state and the wordnum of the longest
3771                    accept state */
3772
3773                 while ( state && uc <= (U8*)PL_regeol ) {
3774                     U32 base = trie->states[ state ].trans.base;
3775                     UV uvc = 0;
3776                     U16 charid = 0;
3777                     U16 wordnum;
3778                     wordnum = trie->states[ state ].wordnum;
3779
3780                     if (wordnum) { /* it's an accept state */
3781                         if (!accepted) {
3782                             accepted = 1;
3783                             /* record first match position */
3784                             if (ST.longfold) {
3785                                 ST.firstpos = (U8*)locinput;
3786                                 ST.firstchars = 0;
3787                             }
3788                             else {
3789                                 ST.firstpos = uc;
3790                                 ST.firstchars = charcount;
3791                             }
3792                         }
3793                         if (!ST.nextword || wordnum < ST.nextword)
3794                             ST.nextword = wordnum;
3795                         ST.topword = wordnum;
3796                     }
3797
3798                     DEBUG_TRIE_EXECUTE_r({
3799                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3800                                 PerlIO_printf( Perl_debug_log,
3801                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3802                                     2+depth * 2, "", PL_colors[4],
3803                                     (UV)state, (accepted ? 'Y' : 'N'));
3804                     });
3805
3806                     /* read a char and goto next state */
3807                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3808                         I32 offset;
3809                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3810                                              uscan, len, uvc, charid, foldlen,
3811                                              foldbuf, uniflags);
3812                         charcount++;
3813                         if (foldlen>0)
3814                             ST.longfold = TRUE;
3815                         if (charid &&
3816                              ( ((offset =
3817                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3818
3819                              && ((U32)offset < trie->lasttrans)
3820                              && trie->trans[offset].check == state)
3821                         {
3822                             state = trie->trans[offset].next;
3823                         }
3824                         else {
3825                             state = 0;
3826                         }
3827                         uc += len;
3828
3829                     }
3830                     else {
3831                         state = 0;
3832                     }
3833                     DEBUG_TRIE_EXECUTE_r(
3834                         PerlIO_printf( Perl_debug_log,
3835                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3836                             charid, uvc, (UV)state, PL_colors[5] );
3837                     );
3838                 }
3839                 if (!accepted)
3840                    sayNO;
3841
3842                 /* calculate total number of accept states */
3843                 {
3844                     U16 w = ST.topword;
3845                     accepted = 0;
3846                     while (w) {
3847                         w = trie->wordinfo[w].prev;
3848                         accepted++;
3849                     }
3850                     ST.accepted = accepted;
3851                 }
3852
3853                 DEBUG_EXECUTE_r(
3854                     PerlIO_printf( Perl_debug_log,
3855                         "%*s  %sgot %"IVdf" possible matches%s\n",
3856                         REPORT_CODE_OFF + depth * 2, "",
3857                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3858                 );
3859                 goto trie_first_try; /* jump into the fail handler */
3860             }}
3861             assert(0); /* NOTREACHED */
3862
3863         case TRIE_next_fail: /* we failed - try next alternative */
3864         {
3865             U8 *uc;
3866             if ( ST.jump) {
3867                 REGCP_UNWIND(ST.cp);
3868                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3869             }
3870             if (!--ST.accepted) {
3871                 DEBUG_EXECUTE_r({
3872                     PerlIO_printf( Perl_debug_log,
3873                         "%*s  %sTRIE failed...%s\n",
3874                         REPORT_CODE_OFF+depth*2, "",
3875                         PL_colors[4],
3876                         PL_colors[5] );
3877                 });
3878                 sayNO_SILENT;
3879             }
3880             {
3881                 /* Find next-highest word to process.  Note that this code
3882                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3883                 U16 min = 0;
3884                 U16 word;
3885                 U16 const nextword = ST.nextword;
3886                 reg_trie_wordinfo * const wordinfo
3887                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3888                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3889                     if (word > nextword && (!min || word < min))
3890                         min = word;
3891                 }
3892                 ST.nextword = min;
3893             }
3894
3895           trie_first_try:
3896             if (do_cutgroup) {
3897                 do_cutgroup = 0;
3898                 no_final = 0;
3899             }
3900
3901             if ( ST.jump) {
3902                 ST.lastparen = rex->lastparen;
3903                 ST.lastcloseparen = rex->lastcloseparen;
3904                 REGCP_SET(ST.cp);
3905             }
3906
3907             /* find start char of end of current word */
3908             {
3909                 U32 chars; /* how many chars to skip */
3910                 reg_trie_data * const trie
3911                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3912
3913                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3914                             >=  ST.firstchars);
3915                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3916                             - ST.firstchars;
3917                 uc = ST.firstpos;
3918
3919                 if (ST.longfold) {
3920                     /* the hard option - fold each char in turn and find
3921                      * its folded length (which may be different) */
3922                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3923                     STRLEN foldlen;
3924                     STRLEN len;
3925                     UV uvc;
3926                     U8 *uscan;
3927
3928                     while (chars) {
3929                         if (utf8_target) {
3930                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3931                                                     uniflags);
3932                             uc += len;
3933                         }
3934                         else {
3935                             uvc = *uc;
3936                             uc++;
3937                         }
3938                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3939                         uscan = foldbuf;
3940                         while (foldlen) {
3941                             if (!--chars)
3942                                 break;
3943                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3944                                             uniflags);
3945                             uscan += len;
3946                             foldlen -= len;
3947                         }
3948                     }
3949                 }
3950                 else {
3951                     if (utf8_target)
3952                         while (chars--)
3953                             uc += UTF8SKIP(uc);
3954                     else
3955                         uc += chars;
3956                 }
3957             }
3958
3959             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3960                             ? ST.jump[ST.nextword]
3961                             : NEXT_OFF(ST.me));
3962
3963             DEBUG_EXECUTE_r({
3964                 PerlIO_printf( Perl_debug_log,
3965                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3966                     REPORT_CODE_OFF+depth*2, "",
3967                     PL_colors[4],
3968                     ST.nextword,
3969                     PL_colors[5]
3970                     );
3971             });
3972
3973             if (ST.accepted > 1 || has_cutgroup) {
3974                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3975                 assert(0); /* NOTREACHED */
3976             }
3977             /* only one choice left - just continue */
3978             DEBUG_EXECUTE_r({
3979                 AV *const trie_words
3980                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3981                 SV ** const tmp = av_fetch( trie_words,
3982                     ST.nextword-1, 0 );
3983                 SV *sv= tmp ? sv_newmortal() : NULL;
3984
3985                 PerlIO_printf( Perl_debug_log,
3986                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3987                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3988                     ST.nextword,
3989                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3990                             PL_colors[0], PL_colors[1],
3991                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3992                         )
3993                     : "not compiled under -Dr",
3994                     PL_colors[5] );
3995             });
3996
3997             locinput = (char*)uc;
3998             continue; /* execute rest of RE */
3999             assert(0); /* NOTREACHED */
4000         }
4001 #undef  ST
4002
4003         case EXACT: {            /*  /abc/        */
4004             char *s = STRING(scan);
4005             ln = STR_LEN(scan);
4006             if (utf8_target != UTF_PATTERN) {
4007                 /* The target and the pattern have differing utf8ness. */
4008                 char *l = locinput;
4009                 const char * const e = s + ln;
4010
4011                 if (utf8_target) {
4012                     /* The target is utf8, the pattern is not utf8.
4013                      * Above-Latin1 code points can't match the pattern;
4014                      * invariants match exactly, and the other Latin1 ones need
4015                      * to be downgraded to a single byte in order to do the
4016                      * comparison.  (If we could be confident that the target
4017                      * is not malformed, this could be refactored to have fewer
4018                      * tests by just assuming that if the first bytes match, it
4019                      * is an invariant, but there are tests in the test suite
4020                      * dealing with (??{...}) which violate this) */
4021                     while (s < e) {
4022                         if (l >= PL_regeol)
4023                              sayNO;
4024                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4025                             sayNO;
4026                         }
4027                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4028                             if (*l != *s) {
4029                                 sayNO;
4030                             }
4031                             l++;
4032                         }
4033                         else {
4034                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4035                                 sayNO;
4036                             }
4037                             l += 2;
4038                         }
4039                         s++;
4040                     }
4041                 }
4042                 else {
4043                     /* The target is not utf8, the pattern is utf8. */
4044                     while (s < e) {
4045                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4046                         {
4047                             sayNO;
4048                         }
4049                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4050                             if (*s != *l) {
4051                                 sayNO;
4052                             }
4053                             s++;
4054                         }
4055                         else {
4056                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4057                                 sayNO;
4058                             }
4059                             s += 2;
4060                         }
4061                         l++;
4062                     }
4063                 }
4064                 locinput = l;
4065                 break;
4066             }
4067             /* The target and the pattern have the same utf8ness. */
4068             /* Inline the first character, for speed. */
4069             if (UCHARAT(s) != nextchr)
4070                 sayNO;
4071             if (PL_regeol - locinput < ln)
4072                 sayNO;
4073             if (ln > 1 && memNE(s, locinput, ln))
4074                 sayNO;
4075             locinput += ln;
4076             break;
4077             }
4078
4079         case EXACTFL: {          /*  /abc/il      */
                 /* Case-insensitive literal match.  Each EXACTF-family op below
                  * only selects how folding is to be done (the byte comparison
                  * function, the single-byte fold table, and the flags for the
                  * utf8-aware comparison) and then joins the shared code at
                  * do_exactf.  On success locinput is advanced past the matched
                  * text; any mismatch leaves via sayNO. */
4080             re_fold_t folder;
4081             const U8 * fold_array;
4082             const char * s;
4083             U32 fold_utf8_flags;
4084
4085             PL_reg_flags |= RF_tainted;
4086             folder = foldEQ_locale;
4087             fold_array = PL_fold_locale;
4088             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4089             goto do_exactf;
4090
4091         case EXACTFU_SS:         /*  /\x{df}/iu   */
4092         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4093         case EXACTFU:            /*  /abc/iu      */
4094             folder = foldEQ_latin1;
4095             fold_array = PL_fold_latin1;
4096             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4097             goto do_exactf;
4098
4099         case EXACTFA:            /*  /abc/iaa     */
4100             folder = foldEQ_latin1;
4101             fold_array = PL_fold_latin1;
4102             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4103             goto do_exactf;
4104
4105         case EXACTF:             /*  /abc/i       */
4106             folder = foldEQ;
4107             fold_array = PL_fold;
4108             fold_utf8_flags = 0;
4109
4110           do_exactf:
4111             s = STRING(scan);
4112             ln = STR_LEN(scan);
4113
4114             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4115               /* Either the target or the pattern is utf8, or this is the
4116                * node type for which the fold lengths may differ.  e goes in
                    * as the end of the target; on success locinput is moved to
                    * the value foldEQ_utf8_flags() left in it. */
4117                 const char * const l = locinput;
4118                 char *e = PL_regeol;
4119
4120                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4121                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4122                 {
4123                     sayNO;
4124                 }
4125                 locinput = e;
4126                 break;
4127             }
4128
4129             /* Neither the target nor the pattern is utf8.  Inline the first
                  * character, for speed.  nextchr must not be used to index
                  * fold_array when we are at the end of the string (the REF
                  * case guards its identical test the same way), so the quick
                  * test is skipped there; the length check that follows still
                  * rejects a literal that does not fit in what remains. */
4130             if (!NEXTCHR_IS_EOS && UCHARAT(s) != nextchr &&
4131                 UCHARAT(s) != fold_array[nextchr])
4132             {
4133                 sayNO;
4134             }
4135             if (PL_regeol - locinput < ln)
4136                 sayNO;
4137             if (ln > 1 && ! folder(s, locinput, ln))
4138                 sayNO;
4139             locinput += ln;
4140             break;
4141         }
4142
4143         /* XXX Could improve efficiency by separating these all out using a
4144          * macro or in-line function.  At that point regcomp.c would no longer
4145          * have to set the FLAGS fields of these */
4146         case BOUNDL:  /*  /\b/l  */
4147         case NBOUNDL: /*  /\B/l  */
4148             PL_reg_flags |= RF_tainted;
4149             /* FALL THROUGH */
4150         case BOUND:   /*  /\b/   */
4151         case BOUNDU:  /*  /\b/u  */
4152         case BOUNDA:  /*  /\b/a  */
4153         case NBOUND:  /*  /\B/   */
4154         case NBOUNDU: /*  /\B/u  */
4155         case NBOUNDA: /*  /\B/a  */
4156             /* Word-boundary assertions.  ln is set to whether the character
                  * just before locinput is a word character, and n to whether
                  * the character at locinput is one; the two are compared at the
                  * bottom.  locinput itself is not advanced by this case.
                  *
                  * was last char in word? */
4157             if (utf8_target
4158                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4159                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4160             {
                     /* At the very start of the string there is no previous
                      * character, so '\n' stands in for it */
4161                 if (locinput == PL_bostr)
4162                     ln = '\n';
4163                 else {
4164                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4165
4166                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4167                 }
4168                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4169                     ln = isALNUM_uni(ln);
4170                     if (NEXTCHR_IS_EOS)
4171                         n = 0;
4172                     else {
4173                         LOAD_UTF8_CHARCLASS_ALNUM();
4174                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4175                                                                 utf8_target);
4176                     }
4177                 }
4178                 else {
4179                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4180                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4181                 }
4182             }
4183             else {
4184
4185                 /* Here the string isn't utf8, or is utf8 and only ascii
4186                  * characters are to match \w.  In the latter case looking at
4187                  * the byte just prior to the current one may be just the final
4188                  * byte of a multi-byte character.  This is ok.  There are two
4189                  * cases:
4190                  * 1) it is a single byte character, and then the test is doing
4191                  *      just what it's supposed to.
4192                  * 2) it is a multi-byte character, in which case the final
4193                  *      byte is never mistakable for ASCII, and so the test
4194                  *      will say it is not a word character, which is the
4195                  *      correct answer. */
4196                 ln = (locinput != PL_bostr) ?
4197                     UCHARAT(locinput - 1) : '\n';
                     /* The character set stored in the node's FLAGS picks which
                      * word-character test is applied to both sides; the end of
                      * the string always counts as a non-word character */
4198                 switch (FLAGS(scan)) {
4199                     case REGEX_UNICODE_CHARSET:
4200                         ln = isWORDCHAR_L1(ln);
4201                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4202                         break;
4203                     case REGEX_LOCALE_CHARSET:
4204                         ln = isALNUM_LC(ln);
4205                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4206                         break;
4207                     case REGEX_DEPENDS_CHARSET:
4208                         ln = isALNUM(ln);
4209                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4210                         break;
4211                     case REGEX_ASCII_RESTRICTED_CHARSET:
4212                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4213                         ln = isWORDCHAR_A(ln);
4214                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4215                         break;
4216                     default:
4217                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4218                         break;
4219                 }
4220             }
4221             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4222              * regcomp.sym.  (!ln) == (!n) is true when both sides agree, i.e.
                  * when there is no boundary here: that fails the BOUND ops (the
                  * ones numbered below NBOUND), whereas finding a boundary fails
                  * the NBOUND ops. */
4223             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4224                     sayNO;
4225             break;
4226
4227         case ANYOFV: /*  /[ab\x{df}]/i */
4228         case ANYOF:  /*  /[abc]/       */
                 /* Bracketed character class.  There must be a character left
                  * to test, so the end of the string never matches. */
4229             if (NEXTCHR_IS_EOS)
4230                 sayNO;
4231             if (utf8_target || state_num == ANYOFV) {
                     /* inclasslen goes in as the number of bytes left in the
                      * target and locinput is advanced by its value after the
                      * call, so reginclass() is expected to set it to the length
                      * of what it matched */
4232                 STRLEN inclasslen = PL_regeol - locinput;
4233                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
4234                     sayNO;
4235                 locinput += inclasslen;
4236                 break;
4237             }
4238             else {
                     /* Non-utf8 target and plain ANYOF: exactly one byte is
                      * tested and consumed */
4239                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4240                     sayNO;
4241                 locinput++;
4242                 break;
4243             }
4244             break; /* not reached: both branches above end in break */
4245
4246         /* Special char classes: \d, \w etc.
4247          * The defines start on line 166 or so */
4248         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4249                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4250                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4251                   ALNUMA, NALNUMA, isWORDCHAR_A,
4252                   alnum, "a");
4253
4254         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4255                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4256                   SPACEU, NSPACEU, isSPACE_L1,
4257                   SPACEA, NSPACEA, isSPACE_A,
4258                   space, " ");
4259
4260         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4261                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4262                 DIGITA, NDIGITA, isDIGIT_A,
4263                 digit, "0");
4264
4265         case POSIXA: /* /[[:ascii:]]/ etc */
                 /* ASCII-restricted POSIX class: fails at the end of the string
                  * or when nextchr is not in the class; FLAGS(scan) is handed to
                  * _generic_isCC_A() as the class to test */
4266             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4267                 sayNO;
4268             }
4269             /* Matched a utf8-invariant, so don't have to worry about utf8:
                  * the character is a single byte whether or not the target is
                  * utf8 */
4270             locinput++;
4271             break;
4272
4273         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
                 /* Complement of POSIXA: fails at the end of the string or when
                  * nextchr IS in the class given by FLAGS(scan) */
4274             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4275                 sayNO;
4276             }
                 /* Unlike POSIXA, the character accepted here need not be a
                  * single byte, so stepping over it is left to the shared
                  * increment_locinput code (outside this excerpt; presumably it
                  * advances by one whole character -- confirm there) */
4277             goto increment_locinput;
4278
4279         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4280                        a Unicode extended Grapheme Cluster */
4281             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4282               extended Grapheme Cluster is:
4283
4284                CR LF
4285                | Prepend* Begin Extend*
4286                | .
4287
4288                Begin is:           ( Special_Begin | ! Control )
4289                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4290                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4291                Control is:         [ GCB_Control  CR  LF ]
4292                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4293
4294                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4295                we can rewrite
4296
4297                    Begin is ( Regular_Begin + Special_Begin )
4298
4299                It turns out that 98.4% of all Unicode code points match
4300                Regular_Begin.  Doing it this way eliminates a table match in
4301                the previous implementation for almost all Unicode code points.
4302
4303                There is a subtlety with Prepend* which showed up in testing.
4304                Note that the Begin, and only the Begin is required in:
4305                 | Prepend* Begin Extend*
4306                Also, Begin contains '! Control'.  A Prepend must be a
4307                '!  Control', which means it must also be a Begin.  What it
4308                comes down to is that if we match Prepend* and then find no
4309                suitable Begin afterwards, that if we backtrack the last
4310                Prepend, that one will be a suitable Begin.
4311             */
4312
4313             if (NEXTCHR_IS_EOS)
4314                 sayNO;
4315             if  (! utf8_target) {
4316
4317                 /* Match either CR LF  or '.', as all the other possibilities
4318                  * require utf8 */
4319                 locinput++;         /* Match the . or CR */
4320                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4321                                        match the LF */
4322                     && locinput < PL_regeol
4323                     && UCHARAT(locinput) == '\n') locinput++;
4324             }
4325             else {
4326
4327                 /* Utf8: See if is ( CR LF ); already know that locinput <
4328                  * PL_regeol, so locinput+1 is in bounds */
4329                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4330                         && UCHARAT(locinput + 1) == '\n')
4331                 {
4332                     locinput += 2;
4333                 }
4334                 else {
4335                     STRLEN len;
4336
4337                     /* In case have to backtrack to beginning, then match '.' */
4338                     char *starting = locinput;
4339
4340                     /* In case have to backtrack the last prepend */
4341                     char *previous_prepend = 0;
4342
4343                     LOAD_UTF8_CHARCLASS_GCB();
4344
4345                     /* Match (prepend)*   */
4346                     while (locinput < PL_regeol
4347                            && (len = is_GCB_Prepend_utf8(locinput)))
4348                     {
4349                         previous_prepend = locinput;
4350                         locinput += len;
4351                     }
4352
4353                     /* As noted above, if we matched a prepend character, but
4354                      * the next thing won't match, back off the last prepend we
4355                      * matched, as it is guaranteed to match the begin */
4356                     if (previous_prepend
4357                         && (locinput >=  PL_regeol
4358                             || (! swash_fetch(PL_utf8_X_regular_begin,
4359                                              (U8*)locinput, utf8_target)
4360                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4361                         )
4362                     {
4363                         locinput = previous_prepend;
4364                     }
4365
4366                     /* Note that here we know PL_regeol > locinput, as we
4367                      * tested that upon input to this switch case, and if we
4368                      * moved locinput forward, we tested the result just above
4369                      * and it either passed, or we backed off so that it will
4370                      * now pass */
4371                     if (swash_fetch(PL_utf8_X_regular_begin,
4372                                     (U8*)locinput, utf8_target)) {
4373                         locinput += UTF8SKIP(locinput);
4374                     }
4375                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4376
4377                         /* Here did not match the required 'Begin' in the
4378                          * second term.  So just match the very first
4379                          * character, the '.' of the final term of the regex */
4380                         locinput = starting + UTF8SKIP(starting);
4381                         goto exit_utf8;
4382                     } else {
4383
4384                         /* Here is a special begin.  It can be composed of
4385                          * several individual characters.  One possibility is
4386                          * RI+ */
4387                         if ((len = is_GCB_RI_utf8(locinput))) {
4388                             locinput += len;
4389                             while (locinput < PL_regeol
4390                                    && (len = is_GCB_RI_utf8(locinput)))
4391                             {
4392                                 locinput += len;
4393                             }
4394                         } else if ((len = is_GCB_T_utf8(locinput))) {
4395                             /* Another possibility is T+ */
4396                             locinput += len;
4397                             while (locinput < PL_regeol
4398                                 && (len = is_GCB_T_utf8(locinput)))
4399                             {
4400                                 locinput += len;
4401                             }
4402                         } else {
4403
4404                             /* Here, neither RI+ nor T+; must be some other
4405                              * Hangul.  That means it is one of the others: L,
4406                              * LV, LVT or V, and matches:
4407                              * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4408
4409                             /* Match L*           */
4410                             while (locinput < PL_regeol
4411                                    && (len = is_GCB_L_utf8(locinput)))
4412                             {
4413                                 locinput += len;
4414                             }
4415
4416                             /* Here, have exhausted L*.  If the next character
4417                              * is not an LV, LVT nor V, it means we had to have
4418                              * at least one L, so matches L+ in the original
4419                              * equation, we have a complete hangul syllable.
4420                              * Are done. */
4421
4422                             if (locinput < PL_regeol
4423                                 && is_GCB_LV_LVT_V_utf8(locinput))
4424                             {
4425
4426                                 /* Otherwise keep going.  Must be LV, LVT or V.
4427                                  * See if LVT */
4428                                 if (is_utf8_X_LVT((U8*)locinput)) {
4429                                     locinput += UTF8SKIP(locinput);
4430                                 } else {
4431
4432                                     /* Must be  V or LV.  Take it, then match
4433                                      * V*     */
4434                                     locinput += UTF8SKIP(locinput);
4435                                     while (locinput < PL_regeol
4436                                            && (len = is_GCB_V_utf8(locinput)))
4437                                     {
4438                                         locinput += len;
4439                                     }
4440                                 }
4441
4442                                 /* And any of LV, LVT, or V can be followed
4443                                  * by T*            */
4444                                 while (locinput < PL_regeol
4445                                        && (len = is_GCB_T_utf8(locinput)))
4446                                 {
4447                                     locinput += len;
4448                                 }
4449                             }
4450                         }
4451                     }
4452
4453                     /* Match any extender */
4454                     while (locinput < PL_regeol
4455                             && swash_fetch(PL_utf8_X_extend,
4456                                             (U8*)locinput, utf8_target))
4457                     {
4458                         locinput += UTF8SKIP(locinput);
4459                     }
4460                 }
4461             exit_utf8:
4462                 if (locinput > PL_regeol) sayNO;
4463             }
4464             break;
4465
4466         case NREFFL:  /*  /\g{name}/il  */
4467         {   /* The capture buffer cases.  The ones beginning with N for the
4468                named buffers just convert to the equivalent numbered and
4469                pretend they were called as the corresponding numbered buffer
4470                op.  */
4471             /* don't initialize these in the declaration, it makes C++
4472                unhappy */
4473             char *s;
4474             char type;
4475             re_fold_t folder;
4476             const U8 *fold_array;
4477             UV utf8_fold_flags;
4478
4479             PL_reg_flags |= RF_tainted;
4480             folder = foldEQ_locale;
4481             fold_array = PL_fold_locale;
4482             type = REFFL;
4483             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4484             goto do_nref;
4485
4486         case NREFFA:  /*  /\g{name}/iaa  */
4487             folder = foldEQ_latin1;
4488             fold_array = PL_fold_latin1;
4489             type = REFFA;
4490             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4491             goto do_nref;
4492
4493         case NREFFU:  /*  /\g{name}/iu  */
4494             folder = foldEQ_latin1;
4495             fold_array = PL_fold_latin1;
4496             type = REFFU;
4497             utf8_fold_flags = 0;
4498             goto do_nref;
4499
4500         case NREFF:  /*  /\g{name}/i  */
4501             folder = foldEQ;
4502             fold_array = PL_fold;
4503             type = REFF;
4504             utf8_fold_flags = 0;
4505             goto do_nref;
4506
4507         case NREF:  /*  /\g{name}/   */
4508             type = REF;
4509             folder = NULL;
4510             fold_array = NULL;
4511             utf8_fold_flags = 0;
4512           do_nref:
4513
4514             /* For the named back references, find the corresponding buffer
4515              * number */
4516             n = reg_check_named_buff_matched(rex,scan);
4517
4518             if ( ! n ) {
4519                 sayNO;
4520             }
4521             goto do_nref_ref_common;
4522
4523         case REFFL:  /*  /\1/il  */
4524             PL_reg_flags |= RF_tainted;
4525             folder = foldEQ_locale;
4526             fold_array = PL_fold_locale;
4527             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4528             goto do_ref;
4529
4530         case REFFA:  /*  /\1/iaa  */
4531             folder = foldEQ_latin1;
4532             fold_array = PL_fold_latin1;
4533             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4534             goto do_ref;
4535
4536         case REFFU:  /*  /\1/iu  */
4537             folder = foldEQ_latin1;
4538             fold_array = PL_fold_latin1;
4539             utf8_fold_flags = 0;
4540             goto do_ref;
4541
4542         case REFF:  /*  /\1/i  */
4543             folder = foldEQ;
4544             fold_array = PL_fold;
4545             utf8_fold_flags = 0;
4546             goto do_ref;
4547
4548         case REF:  /*  /\1/    */
4549             folder = NULL;
4550             fold_array = NULL;
4551             utf8_fold_flags = 0;
4552
4553           do_ref:
4554             type = OP(scan);
4555             n = ARG(scan);  /* which paren pair */
4556
4557           do_nref_ref_common:
4558             ln = rex->offs[n].start;
4559             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4560             if (rex->lastparen < n || ln == -1)
4561                 sayNO;                  /* Do not match unless seen CLOSEn. */
4562             if (ln == rex->offs[n].end)
4563                 break;
4564
4565             s = PL_bostr + ln;
4566             if (type != REF     /* REF can do byte comparison */
4567                 && (utf8_target || type == REFFU))
4568             { /* XXX handle REFFL better */
4569                 char * limit = PL_regeol;
4570
4571                 /* This call case insensitively compares the entire buffer
4572                     * at s, with the current input starting at locinput, but
4573                     * not going off the end given by PL_regeol, and returns in
4574                     * limit upon success, how much of the current input was
4575                     * matched */
4576                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4577                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4578                 {
4579                     sayNO;
4580                 }
4581                 locinput = limit;
4582                 break;
4583             }
4584
4585             /* Not utf8:  Inline the first character, for speed. */
4586             if (!NEXTCHR_IS_EOS &&
4587                 UCHARAT(s) != nextchr &&
4588                 (type == REF ||
4589                  UCHARAT(s) != fold_array[nextchr]))
4590                 sayNO;
4591             ln = rex->offs[n].end - ln;
4592             if (locinput + ln > PL_regeol)
4593                 sayNO;
4594             if (ln > 1 && (type == REF
4595                            ? memNE(s, locinput, ln)
4596                            : ! folder(s, locinput, ln)))
4597                 sayNO;
4598             locinput += ln;
4599             break;
4600         }
4601
4602         case NOTHING: /* null op; e.g. the 'nothing' following
4603                        * the '*' in m{(a+|b)*}' */
4604             break;
4605         case TAIL: /* placeholder while compiling (A|B|C) */
4606             break;
4607
4608         case BACK: /* ??? doesn't appear to be used ??? */
4609             break;
4610
4611 #undef  ST
4612 #define ST st->u.eval
4613         {
4614             SV *ret;
4615             REGEXP *re_sv;
4616             regexp *re;
4617             regexp_internal *rei;
4618             regnode *startpoint;
4619
4620         case GOSTART: /*  (?R)  */
4621         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4622             if (cur_eval && cur_eval->locinput==locinput) {
4623                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4624                     Perl_croak(aTHX_ "Infinite recursion in regex");
4625                 if ( ++nochange_depth > max_nochange_depth )
4626                     Perl_croak(aTHX_
4627                         "Pattern subroutine nesting without pos change"
4628                         " exceeded limit in regex");
4629             } else {
4630                 nochange_depth = 0;
4631             }
4632             re_sv = rex_sv;
4633             re = rex;
4634             rei = rexi;
4635             if (OP(scan)==GOSUB) {
4636                 startpoint = scan + ARG2L(scan);
4637                 ST.close_paren = ARG(scan);
4638             } else {
4639                 startpoint = rei->program+1;
4640                 ST.close_paren = 0;
4641             }
4642             goto eval_recurse_doit;
4643             assert(0); /* NOTREACHED */
4644
4645         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4646             if (cur_eval && cur_eval->locinput==locinput) {
4647                 if ( ++nochange_depth > max_nochange_depth )
4648                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4649             } else {
4650                 nochange_depth = 0;
4651             }
4652             {
4653                 /* execute the code in the {...} */
4654
4655                 dSP;
4656                 SV ** before;
4657                 OP * const oop = PL_op;
4658                 COP * const ocurcop = PL_curcop;
4659                 OP *nop;
4660                 char *saved_regeol = PL_regeol;
4661                 struct re_save_state saved_state;
4662                 CV *newcv;
4663
4664                 /* save *all* paren positions */
4665                 regcppush(rex, 0);
4666                 REGCP_SET(runops_cp);
4667
4668                 /* To not corrupt the existing regex state while executing the
4669                  * eval we would normally put it on the save stack, like with
4670                  * save_re_context. However, re-evals have a weird scoping so we
4671                  * can't just add ENTER/LEAVE here. With that, things like
4672                  *
4673                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4674                  *
4675                  * would break, as they expect the localisation to be unwound
4676                  * only when the re-engine backtracks through the bit that
4677                  * localised it.
4678                  *
4679                  * What we do instead is just save the state in a local C
4680                  * variable.
4681                  */
4682                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4683
4684                 PL_reg_state.re_reparsing = FALSE;
4685
4686                 if (!caller_cv)
4687                     caller_cv = find_runcv(NULL);
4688
4689                 n = ARG(scan);
4690
4691                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4692                     newcv = ((struct regexp *)SvANY(
4693                                                 (REGEXP*)(rexi->data->data[n])
4694                                             ))->qr_anoncv
4695                                         ;
4696                     nop = (OP*)rexi->data->data[n+1];
4697                 }
4698                 else if (rexi->data->what[n] == 'l') { /* literal code */
4699                     newcv = caller_cv;
4700                     nop = (OP*)rexi->data->data[n];
4701                     assert(CvDEPTH(newcv));
4702                 }
4703                 else {
4704                     /* literal with own CV */
4705                     assert(rexi->data->what[n] == 'L');
4706                     newcv = rex->qr_anoncv;
4707                     nop = (OP*)rexi->data->data[n];
4708                 }
4709
4710                 /* normally if we're about to execute code from the same
4711                  * CV that we used previously, we just use the existing
4712                  * CX stack entry. However, it's possible that in the
4713                  * meantime we may have backtracked, popped from the save
4714                  * stack, and undone the SAVECOMPPAD(s) associated with
4715                  * PUSH_MULTICALL; in which case PL_comppad no longer
4716                  * points to newcv's pad. */
4717                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4718                 {
4719                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4720                     if (last_pushed_cv) {
4721                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4722                     }
4723                     else {
4724                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4725                     }
4726                     last_pushed_cv = newcv;
4727                 }
4728                 last_pad = PL_comppad;
4729
4730                 /* the initial nextstate you would normally execute
4731                  * at the start of an eval (which would cause error
4732                  * messages to come from the eval), may be optimised
4733                  * away from the execution path in the regex code blocks;
4734                  * so manually set PL_curcop to it initially */
4735                 {
4736                     OP *o = cUNOPx(nop)->op_first;
4737                     assert(o->op_type == OP_NULL);
4738                     if (o->op_targ == OP_SCOPE) {
4739                         o = cUNOPo->op_first;
4740                     }
4741                     else {
4742                         assert(o->op_targ == OP_LEAVE);
4743                         o = cUNOPo->op_first;
4744                         assert(o->op_type == OP_ENTER);
4745                         o = o->op_sibling;
4746                     }
4747
4748                     if (o->op_type != OP_STUB) {
4749                         assert(    o->op_type == OP_NEXTSTATE
4750                                 || o->op_type == OP_DBSTATE
4751                                 || (o->op_type == OP_NULL
4752                                     &&  (  o->op_targ == OP_NEXTSTATE
4753                                         || o->op_targ == OP_DBSTATE
4754                                         )
4755                                     )
4756                         );
4757                         PL_curcop = (COP*)o;
4758                     }
4759                 }
4760                 nop = nop->op_next;
4761
4762                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4763                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4764
4765                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4766
4767                 if (sv_yes_mark) {
4768                     SV *sv_mrk = get_sv("REGMARK", 1);
4769                     sv_setsv(sv_mrk, sv_yes_mark);
4770                 }
4771
4772                 /* we don't use MULTICALL here as we want to call the
4773                  * first op of the block of interest, rather than the
4774                  * first op of the sub */
4775                 before = SP;
4776                 PL_op = nop;
4777                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4778                 SPAGAIN;
4779                 if (SP == before)
4780                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4781                 else {
4782                     ret = POPs;
4783                     PUTBACK;
4784                 }
4785
4786                 /* before restoring everything, evaluate the returned
4787                  * value, so that 'uninit' warnings don't use the wrong
4788                  * PL_op or pad. Also need to process any magic vars
4789                  * (e.g. $1) *before* parentheses are restored */
4790
4791                 PL_op = NULL;
4792
4793                 re_sv = NULL;
4794                 if (logical == 0)        /*   (?{})/   */
4795                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4796                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4797                     sw = cBOOL(SvTRUE(ret));
4798                     logical = 0;
4799                 }
4800                 else {                   /*  /(??{})  */
4801                     /*  if it's overloaded, let the regex compiler handle
4802                      *  it; otherwise extract regex, or stringify  */
4803                     if (!SvAMAGIC(ret)) {
4804                         SV *sv = ret;
4805                         if (SvROK(sv))
4806                             sv = SvRV(sv);
4807                         if (SvTYPE(sv) == SVt_REGEXP)
4808                             re_sv = (REGEXP*) sv;
4809                         else if (SvSMAGICAL(sv)) {
4810                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4811                             if (mg)
4812                                 re_sv = (REGEXP *) mg->mg_obj;
4813                         }
4814
4815                         /* force any magic, undef warnings here */
4816                         if (!re_sv) {
4817                             ret = sv_mortalcopy(ret);
4818                             (void) SvPV_force_nolen(ret);
4819                         }
4820                     }
4821
4822                 }
4823
4824                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4825
4826                 /* *** Note that at this point we don't restore
4827                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4828                  * be used again soon. This is safe as long as nothing
4829                  * in the regexp code uses the pad ! */
4830                 PL_op = oop;
4831                 PL_curcop = ocurcop;
4832                 PL_regeol = saved_regeol;
4833                 S_regcp_restore(aTHX_ rex, runops_cp);
4834
4835                 if (logical != 2)
4836                     break;
4837             }
4838
4839                 /* only /(??{})/  from now on */
4840                 logical = 0;
4841                 {
4842                     /* extract RE object from returned value; compiling if
4843                      * necessary */
4844
4845                     if (re_sv) {
4846                         re_sv = reg_temp_copy(NULL, re_sv);
4847                     }
4848                     else {
4849                         U32 pm_flags = 0;
4850                         const I32 osize = PL_regsize;
4851
4852                         if (SvUTF8(ret) && IN_BYTES) {
4853                             /* In use 'bytes': make a copy of the octet
4854                              * sequence, but without the flag on */
4855                             STRLEN len;
4856                             const char *const p = SvPV(ret, len);
4857                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4858                         }
4859                         if (rex->intflags & PREGf_USE_RE_EVAL)
4860                             pm_flags |= PMf_USE_RE_EVAL;
4861
4862                         /* if we got here, it should be an engine which
4863                          * supports compiling code blocks and stuff */
4864                         assert(rex->engine && rex->engine->op_comp);
4865                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4866                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4867                                     rex->engine, NULL, NULL,
4868                                     /* copy /msix etc to inner pattern */
4869                                     scan->flags,
4870                                     pm_flags);
4871
4872                         if (!(SvFLAGS(ret)
4873                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4874                                  | SVs_GMG))) {
4875                             /* This isn't a first class regexp. Instead, it's
4876                                caching a regexp onto an existing, Perl visible
4877                                scalar.  */
4878                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4879                         }
4880                         PL_regsize = osize;
4881                         /* safe to do now that any $1 etc has been
4882                          * interpolated into the new pattern string and
4883                          * compiled */
4884                         S_regcp_restore(aTHX_ rex, runops_cp);
4885                     }
4886                     re = (struct regexp *)SvANY(re_sv);
4887                 }
4888                 RXp_MATCH_COPIED_off(re);
4889                 re->subbeg = rex->subbeg;
4890                 re->sublen = rex->sublen;
4891                 re->suboffset = rex->suboffset;
4892                 re->subcoffset = rex->subcoffset;
4893                 rei = RXi_GET(re);
4894                 DEBUG_EXECUTE_r(
4895                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4896                         "Matching embedded");
4897                 );
4898                 startpoint = rei->program + 1;
4899                 ST.close_paren = 0; /* only used for GOSUB */
4900
4901         eval_recurse_doit: /* Share code with GOSUB below this line */
4902                 /* run the pattern returned from (??{...}) */
4903                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4904                 REGCP_SET(ST.lastcp);
4905
4906                 re->lastparen = 0;
4907                 re->lastcloseparen = 0;
4908
4909                 PL_regsize = 0;
4910
4911                 /* XXXX This is too dramatic a measure... */
4912                 PL_reg_maxiter = 0;
4913
4914                 ST.toggle_reg_flags = PL_reg_flags;
4915                 if (RX_UTF8(re_sv))
4916                     PL_reg_flags |= RF_utf8;
4917                 else
4918                     PL_reg_flags &= ~RF_utf8;
4919                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4920
4921                 ST.prev_rex = rex_sv;
4922                 ST.prev_curlyx = cur_curlyx;
4923                 rex_sv = re_sv;
4924                 SET_reg_curpm(rex_sv);
4925                 rex = re;
4926                 rexi = rei;
4927                 cur_curlyx = NULL;
4928                 ST.B = next;
4929                 ST.prev_eval = cur_eval;
4930                 cur_eval = st;
4931                 /* now continue from first node in postponed RE */
4932                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
4933                 assert(0); /* NOTREACHED */
4934         }
4935
4936         case EVAL_AB: /* cleanup after a successful (??{A})B */
4937             /* note: this is called twice; first after popping B, then A */
4938             PL_reg_flags ^= ST.toggle_reg_flags;
4939             rex_sv = ST.prev_rex;
4940             SET_reg_curpm(rex_sv);
4941             rex = (struct regexp *)SvANY(rex_sv);
4942             rexi = RXi_GET(rex);
4943             regcpblow(ST.cp);
4944             cur_eval = ST.prev_eval;
4945             cur_curlyx = ST.prev_curlyx;
4946
4947             /* XXXX This is too dramatic a measure... */
4948             PL_reg_maxiter = 0;
4949             if ( nochange_depth )
4950                 nochange_depth--;
4951             sayYES;
4952
4953
4954         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4955             /* note: this is called twice; first after popping B, then A */
4956             PL_reg_flags ^= ST.toggle_reg_flags;
4957             rex_sv = ST.prev_rex;
4958             SET_reg_curpm(rex_sv);
4959             rex = (struct regexp *)SvANY(rex_sv);
4960             rexi = RXi_GET(rex);
4961
4962             REGCP_UNWIND(ST.lastcp);
4963             regcppop(rex);
4964             cur_eval = ST.prev_eval;
4965             cur_curlyx = ST.prev_curlyx;
4966             /* XXXX This is too dramatic a measure... */
4967             PL_reg_maxiter = 0;
4968             if ( nochange_depth )
4969                 nochange_depth--;
4970             sayNO_SILENT;
4971 #undef ST
4972
4973         case OPEN: /*  (  */
4974             n = ARG(scan);  /* which paren pair */
4975             rex->offs[n].start_tmp = locinput - PL_bostr;
4976             if (n > PL_regsize)
4977                 PL_regsize = n;
4978             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4979                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4980                 PTR2UV(rex),
4981                 PTR2UV(rex->offs),
4982                 (UV)n,
4983                 (IV)rex->offs[n].start_tmp,
4984                 (UV)PL_regsize
4985             ));
4986             lastopen = n;
4987             break;
4988
4989 /* XXX really need to log other places start/end are set too */
4990 #define CLOSE_CAPTURE \
4991     rex->offs[n].start = rex->offs[n].start_tmp; \
4992     rex->offs[n].end = locinput - PL_bostr; \
4993     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4994         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4995         PTR2UV(rex), \
4996         PTR2UV(rex->offs), \
4997         (UV)n, \
4998         (IV)rex->offs[n].start, \
4999         (IV)rex->offs[n].end \
5000     ))
5001
5002         case CLOSE:  /*  )  */
5003             n = ARG(scan);  /* which paren pair */
5004             CLOSE_CAPTURE;
5005             /*if (n > PL_regsize)
5006                 PL_regsize = n;*/
5007             if (n > rex->lastparen)
5008                 rex->lastparen = n;
5009             rex->lastcloseparen = n;
5010             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5011                 goto fake_end;
5012             }
5013             break;
5014
5015         case ACCEPT:  /*  (*ACCEPT)  */
5016             if (ARG(scan)){
5017                 regnode *cursor;
5018                 for (cursor=scan;
5019                      cursor && OP(cursor)!=END;
5020                      cursor=regnext(cursor))
5021                 {
5022                     if ( OP(cursor)==CLOSE ){
5023                         n = ARG(cursor);
5024                         if ( n <= lastopen ) {
5025                             CLOSE_CAPTURE;
5026                             /*if (n > PL_regsize)
5027                             PL_regsize = n;*/
5028                             if (n > rex->lastparen)
5029                                 rex->lastparen = n;
5030                             rex->lastcloseparen = n;
5031                             if ( n == ARG(scan) || (cur_eval &&
5032                                 cur_eval->u.eval.close_paren == n))
5033                                 break;
5034                         }
5035                     }
5036                 }
5037             }
5038             goto fake_end;
5039             /*NOTREACHED*/
5040
5041         case GROUPP:  /*  (?(1))  */
5042             n = ARG(scan);  /* which paren pair */
5043             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5044             break;
5045
5046         case NGROUPP:  /*  (?(<name>))  */
5047             /* reg_check_named_buff_matched returns 0 for no match */
5048             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5049             break;
5050
5051         case INSUBP:   /*  (?(R))  */
5052             n = ARG(scan);
5053             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5054             break;
5055
5056         case DEFINEP:  /*  (?(DEFINE))  */
5057             sw = 0;
5058             break;
5059
5060         case IFTHEN:   /*  (?(cond)A|B)  */
5061             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5062             if (sw)
5063                 next = NEXTOPER(NEXTOPER(scan));
5064             else {
5065                 next = scan + ARG(scan);
5066                 if (OP(next) == IFTHEN) /* Fake one. */
5067                     next = NEXTOPER(NEXTOPER(next));
5068             }
5069             break;
5070
5071         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5072             logical = scan->flags;
5073             break;
5074
5075 /*******************************************************************
5076
5077 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5078 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5079 STAR/PLUS/CURLY/CURLYN are used instead.)
5080
5081 A*B is compiled as <CURLYX><A><WHILEM><B>
5082
5083 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5084 state, which contains the current count, initialised to -1. It also sets
5085 cur_curlyx to point to this state, with any previous value saved in the
5086 state block.
5087
5088 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5089 since the pattern may possibly match zero times (i.e. it's a while {} loop
5090 rather than a do {} while loop).
5091
5092 Each entry to WHILEM represents a successful match of A. The count in the
5093 CURLYX block is incremented, another WHILEM state is pushed, and execution
5094 passes to A or B depending on greediness and the current count.
5095
5096 For example, if matching against the string a1a2a3b (where the aN are
5097 substrings that match /A/), then the match progresses as follows: (the
5098 pushed states are interspersed with the bits of strings matched so far):
5099
5100     <CURLYX cnt=-1>
5101     <CURLYX cnt=0><WHILEM>
5102     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5103     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5104     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5105     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5106
5107 (Contrast this with something like CURLYM, which maintains only a single
5108 backtrack state:
5109
5110     <CURLYM cnt=0> a1
5111     a1 <CURLYM cnt=1> a2
5112     a1 a2 <CURLYM cnt=2> a3
5113     a1 a2 a3 <CURLYM cnt=3> b
5114 )
5115
5116 Each WHILEM state block marks a point to backtrack to upon partial failure
5117 of A or B, and also contains some minor state data related to that
5118 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5119 overall state, such as the count, and pointers to the A and B ops.
5120
5121 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5122 must always point to the *current* CURLYX block, the rules are:
5123
5124 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5125 and set cur_curlyx to point to the new block.
5126
5127 When popping the CURLYX block after a successful or unsuccessful match,
5128 restore the previous cur_curlyx.
5129
5130 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5131 to the outer one saved in the CURLYX block.
5132
5133 When popping the WHILEM block after a successful or unsuccessful B match,
5134 restore the previous cur_curlyx.
5135
5136 Here's an example for the pattern (AI* BI)*BO
5137 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5138
5139 cur_
5140 curlyx backtrack stack
5141 ------ ---------------
5142 NULL
5143 CO     <CO prev=NULL> <WO>
5144 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5145 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5146 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5147
5148 At this point the pattern succeeds, and we work back down the stack to
5149 clean up, restoring as we go:
5150
5151 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5152 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5153 CO     <CO prev=NULL> <WO>
5154 NULL
5155
5156 *******************************************************************/
5157
5158 #define ST st->u.curlyx
5159
5160         case CURLYX:    /* start of /A*B/  (for complex A) */
5161         {
5162             /* No need to save/restore up to this paren */
5163             I32 parenfloor = scan->flags;
5164
5165             assert(next); /* keep Coverity happy */
5166             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5167                 next += ARG(next);
5168
5169             /* XXXX Probably it is better to teach regpush to support
5170                parenfloor > PL_regsize... */
5171             if (parenfloor > (I32)rex->lastparen)
5172                 parenfloor = rex->lastparen; /* Pessimization... */
5173
5174             ST.prev_curlyx= cur_curlyx;
5175             cur_curlyx = st;
5176             ST.cp = PL_savestack_ix;
5177
5178             /* these fields contain the state of the current curly.
5179              * they are accessed by subsequent WHILEMs */
5180             ST.parenfloor = parenfloor;
5181             ST.me = scan;
5182             ST.B = next;
5183             ST.minmod = minmod;
5184             minmod = 0;
5185             ST.count = -1;      /* this will be updated by WHILEM */
5186             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5187
5188             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5189             assert(0); /* NOTREACHED */
5190         }
5191
5192         case CURLYX_end: /* just finished matching all of A*B */
5193             cur_curlyx = ST.prev_curlyx;
5194             sayYES;
5195             assert(0); /* NOTREACHED */
5196
5197         case CURLYX_end_fail: /* just failed to match all of A*B */
5198             regcpblow(ST.cp);
5199             cur_curlyx = ST.prev_curlyx;
5200             sayNO;
5201             assert(0); /* NOTREACHED */
5202
5203
5204 #undef ST
5205 #define ST st->u.whilem
5206
5207         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5208         {
5209             /* see the discussion above about CURLYX/WHILEM */
5210             I32 n;
5211             int min = ARG1(cur_curlyx->u.curlyx.me);
5212             int max = ARG2(cur_curlyx->u.curlyx.me);
5213             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5214
5215             assert(cur_curlyx); /* keep Coverity happy */
5216             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5217             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5218             ST.cache_offset = 0;
5219             ST.cache_mask = 0;
5220
5221
5222             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5223                   "%*s  whilem: matched %ld out of %d..%d\n",
5224                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5225             );
5226
5227             /* First just match a string of min A's. */
5228
5229             if (n < min) {
5230                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5231                 cur_curlyx->u.curlyx.lastloc = locinput;
5232                 REGCP_SET(ST.lastcp);
5233
5234                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5235                 assert(0); /* NOTREACHED */
5236             }
5237
5238             /* If degenerate A matches "", assume A done. */
5239
5240             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5241                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5242                    "%*s  whilem: empty match detected, trying continuation...\n",
5243                    REPORT_CODE_OFF+depth*2, "")
5244                 );
5245                 goto do_whilem_B_max;
5246             }
5247
5248             /* super-linear cache processing */
5249
5250             if (scan->flags) {
5251
5252                 if (!PL_reg_maxiter) {
5253                     /* start the countdown: Postpone detection until we
5254                      * know the match is not *that* much linear. */
5255                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5256                     /* possible overflow for long strings and many CURLYX's */
5257                     if (PL_reg_maxiter < 0)
5258                         PL_reg_maxiter = I32_MAX;
5259                     PL_reg_leftiter = PL_reg_maxiter;
5260                 }
5261
5262                 if (PL_reg_leftiter-- == 0) {
5263                     /* initialise cache */
5264                     const I32 size = (PL_reg_maxiter + 7)/8;
5265                     if (PL_reg_poscache) {
5266                         if ((I32)PL_reg_poscache_size < size) {
5267                             Renew(PL_reg_poscache, size, char);
5268                             PL_reg_poscache_size = size;
5269                         }
5270                         Zero(PL_reg_poscache, size, char);
5271                     }
5272                     else {
5273                         PL_reg_poscache_size = size;
5274                         Newxz(PL_reg_poscache, size, char);
5275                     }
5276                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5277       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5278                               PL_colors[4], PL_colors[5])
5279                     );
5280                 }
5281
5282                 if (PL_reg_leftiter < 0) {
5283                     /* have we already failed at this position? */
5284                     I32 offset, mask;
5285                     offset  = (scan->flags & 0xf) - 1
5286                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5287                     mask    = 1 << (offset % 8);
5288                     offset /= 8;
5289                     if (PL_reg_poscache[offset] & mask) {
5290                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5291                             "%*s  whilem: (cache) already tried at this position...\n",
5292                             REPORT_CODE_OFF+depth*2, "")
5293                         );
5294                         sayNO; /* cache records failure */
5295                     }
5296                     ST.cache_offset = offset;
5297                     ST.cache_mask   = mask;
5298                 }
5299             }
5300
5301             /* Prefer B over A for minimal matching. */
5302
5303             if (cur_curlyx->u.curlyx.minmod) {
5304                 ST.save_curlyx = cur_curlyx;
5305                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5306                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5307                 REGCP_SET(ST.lastcp);
5308                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5309                                     locinput);
5310                 assert(0); /* NOTREACHED */
5311             }
5312
5313             /* Prefer A over B for maximal matching. */
5314
5315             if (n < max) { /* More greed allowed? */
5316                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5317                 cur_curlyx->u.curlyx.lastloc = locinput;
5318                 REGCP_SET(ST.lastcp);
5319                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5320                 assert(0); /* NOTREACHED */
5321             }
5322             goto do_whilem_B_max;
5323         }
5324         assert(0); /* NOTREACHED */
5325
5326         case WHILEM_B_min: /* just matched B in a minimal match */
5327         case WHILEM_B_max: /* just matched B in a maximal match */
5328             cur_curlyx = ST.save_curlyx;
5329             sayYES;
5330             assert(0); /* NOTREACHED */
5331
5332         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5333             cur_curlyx = ST.save_curlyx;
5334             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5335             cur_curlyx->u.curlyx.count--;
5336             CACHEsayNO;
5337             assert(0); /* NOTREACHED */
5338
5339         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5340             /* FALL THROUGH */
5341         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5342             REGCP_UNWIND(ST.lastcp);
5343             regcppop(rex);
5344             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5345             cur_curlyx->u.curlyx.count--;
5346             CACHEsayNO;
5347             assert(0); /* NOTREACHED */
5348
5349         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5350             REGCP_UNWIND(ST.lastcp);
5351             regcppop(rex);      /* Restore some previous $<digit>s? */
5352             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5353                 "%*s  whilem: failed, trying continuation...\n",
5354                 REPORT_CODE_OFF+depth*2, "")
5355             );
5356           do_whilem_B_max:
5357             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5358                 && ckWARN(WARN_REGEXP)
5359                 && !(PL_reg_flags & RF_warned))
5360             {
5361                 PL_reg_flags |= RF_warned;
5362                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5363                      "Complex regular subexpression recursion limit (%d) "
5364                      "exceeded",
5365                      REG_INFTY - 1);
5366             }
5367
5368             /* now try B */
5369             ST.save_curlyx = cur_curlyx;
5370             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5371             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5372                                 locinput);
5373             assert(0); /* NOTREACHED */
5374
5375         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5376             cur_curlyx = ST.save_curlyx;
5377             REGCP_UNWIND(ST.lastcp);
5378             regcppop(rex);
5379
5380             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5381                 /* Maximum greed exceeded */
5382                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5383                     && ckWARN(WARN_REGEXP)
5384                     && !(PL_reg_flags & RF_warned))
5385                 {
5386                     PL_reg_flags |= RF_warned;
5387                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5388                         "Complex regular subexpression recursion "
5389                         "limit (%d) exceeded",
5390                         REG_INFTY - 1);
5391                 }
5392                 cur_curlyx->u.curlyx.count--;
5393                 CACHEsayNO;
5394             }
5395
5396             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5397                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5398             );
5399             /* Try grabbing another A and see if it helps. */
5400             cur_curlyx->u.curlyx.lastloc = locinput;
5401             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5402             REGCP_SET(ST.lastcp);
5403             PUSH_STATE_GOTO(WHILEM_A_min,
5404                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5405                 locinput);
5406             assert(0); /* NOTREACHED */
5407
5408 #undef  ST
5409 #define ST st->u.branch
5410
5411         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5412             next = scan + ARG(scan);
5413             if (next == scan)
5414                 next = NULL;
5415             scan = NEXTOPER(scan);
5416             /* FALL THROUGH */
5417
5418         case BRANCH:        /*  /(...|A|...)/ */
5419             scan = NEXTOPER(scan); /* scan now points to inner node */
5420             ST.lastparen = rex->lastparen;
5421             ST.lastcloseparen = rex->lastcloseparen;
5422             ST.next_branch = next;
5423             REGCP_SET(ST.cp);
5424
5425             /* Now go into the branch */
5426             if (has_cutgroup) {
5427                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5428             } else {
5429                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5430             }
5431             assert(0); /* NOTREACHED */
5432
5433         case CUTGROUP:  /*  /(*THEN)/  */
5434             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5435                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5436             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5437             assert(0); /* NOTREACHED */
5438
5439         case CUTGROUP_next_fail:
5440             do_cutgroup = 1;
5441             no_final = 1;
5442             if (st->u.mark.mark_name)
5443                 sv_commit = st->u.mark.mark_name;
5444             sayNO;
5445             assert(0); /* NOTREACHED */
5446
5447         case BRANCH_next:
5448             sayYES;
5449             assert(0); /* NOTREACHED */
5450
5451         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5452             if (do_cutgroup) {
5453                 do_cutgroup = 0;
5454                 no_final = 0;
5455             }
5456             REGCP_UNWIND(ST.cp);
5457             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5458             scan = ST.next_branch;
5459             /* no more branches? */
5460             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5461                 DEBUG_EXECUTE_r({
5462                     PerlIO_printf( Perl_debug_log,
5463                         "%*s  %sBRANCH failed...%s\n",
5464                         REPORT_CODE_OFF+depth*2, "",
5465                         PL_colors[4],
5466                         PL_colors[5] );
5467                 });
5468                 sayNO_SILENT;
5469             }
5470             continue; /* execute next BRANCH[J] op */
5471             assert(0); /* NOTREACHED */
5472
5473         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5474             minmod = 1;
5475             break;
5476
5477 #undef  ST
5478 #define ST st->u.curlym
5479
5480         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5481
5482             /* This is an optimisation of CURLYX that enables us to push
5483              * only a single backtracking state, no matter how many matches
5484              * there are in {m,n}. It relies on the pattern being constant
5485              * length, with no parens to influence future backrefs
5486              */
5487
5488             ST.me = scan;
5489             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5490
5491             ST.lastparen      = rex->lastparen;
5492             ST.lastcloseparen = rex->lastcloseparen;
5493
5494             /* if paren positive, emulate an OPEN/CLOSE around A */
5495             if (ST.me->flags) {
5496                 U32 paren = ST.me->flags;
5497                 if (paren > PL_regsize)
5498                     PL_regsize = paren;
5499                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5500             }
5501             ST.A = scan;
5502             ST.B = next;
5503             ST.alen = 0;
5504             ST.count = 0;
5505             ST.minmod = minmod;
5506             minmod = 0;
5507             ST.c1 = CHRTEST_UNINIT;
5508             REGCP_SET(ST.cp);
5509
5510             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5511                 goto curlym_do_B;
5512
5513           curlym_do_A: /* execute the A in /A{m,n}B/  */
5514             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5515             assert(0); /* NOTREACHED */
5516
5517         case CURLYM_A: /* we've just matched an A */
5518             ST.count++;
5519             /* after first match, determine A's length: u.curlym.alen */
5520             if (ST.count == 1) {
5521                 if (PL_reg_match_utf8) {
5522                     char *s = st->locinput;
5523                     while (s < locinput) {
5524                         ST.alen++;
5525                         s += UTF8SKIP(s);
5526                     }
5527                 }
5528                 else {
5529                     ST.alen = locinput - st->locinput;
5530                 }
5531                 if (ST.alen == 0)
5532                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5533             }
5534             DEBUG_EXECUTE_r(
5535                 PerlIO_printf(Perl_debug_log,
5536                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5537                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5538                           (IV) ST.count, (IV)ST.alen)
5539             );
5540
5541             if (cur_eval && cur_eval->u.eval.close_paren &&
5542                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5543                 goto fake_end;
5544
5545             {
5546                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5547                 if ( max == REG_INFTY || ST.count < max )
5548                     goto curlym_do_A; /* try to match another A */
5549             }
5550             goto curlym_do_B; /* try to match B */
5551
5552         case CURLYM_A_fail: /* just failed to match an A */
5553             REGCP_UNWIND(ST.cp);
5554
5555             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5556                 || (cur_eval && cur_eval->u.eval.close_paren &&
5557                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5558                 sayNO;
5559
5560           curlym_do_B: /* execute the B in /A{m,n}B/  */
5561             if (ST.c1 == CHRTEST_UNINIT) {
5562                 /* calculate c1 and c2 for possible match of 1st char
5563                  * following curly */
5564                 ST.c1 = ST.c2 = CHRTEST_VOID;
5565                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5566                     regnode *text_node = ST.B;
5567                     if (! HAS_TEXT(text_node))
5568                         FIND_NEXT_IMPT(text_node);
5569                     /* this used to be
5570
5571                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5572
5573                         But the former is redundant in light of the latter.
5574
5575                         if this changes back then the macro for
5576                         IS_TEXT and friends need to change.
5577                      */
5578                     if (PL_regkind[OP(text_node)] == EXACT) {
5579                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node,
5580                                                               &ST.c1, &ST.c2))
5581                         {
5582                             sayNO;
5583                         }
5584                     }
5585                 }
5586             }
5587
5588             DEBUG_EXECUTE_r(
5589                 PerlIO_printf(Perl_debug_log,
5590                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5591                     (int)(REPORT_CODE_OFF+(depth*2)),
5592                     "", (IV)ST.count)
5593                 );
5594             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5595                 const UV c = (utf8_target)
5596                               ? utf8n_to_uvchr((U8*)locinput,
5597                                                UTF8_MAXBYTES, NULL,
5598                                                uniflags)
5599                               : nextchr;
5600                 if (c != (UV) ST.c1 && c != (UV) ST.c2) {
5601                     /* simulate B failing */
5602                     DEBUG_OPTIMISE_r(
5603                         PerlIO_printf(Perl_debug_log,
5604                             "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5605                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5606                             (IV)ST.c1,(IV)ST.c2
5607                     ));
5608                     state_num = CURLYM_B_fail;
5609                     goto reenter_switch;
5610                 }
5611             }
5612
5613             if (ST.me->flags) {
5614                 /* emulate CLOSE: mark current A as captured */
5615                 I32 paren = ST.me->flags;
5616                 if (ST.count) {
5617                     rex->offs[paren].start
5618                         = HOPc(locinput, -ST.alen) - PL_bostr;
5619                     rex->offs[paren].end = locinput - PL_bostr;
5620                     if ((U32)paren > rex->lastparen)
5621                         rex->lastparen = paren;
5622                     rex->lastcloseparen = paren;
5623                 }
5624                 else
5625                     rex->offs[paren].end = -1;
5626                 if (cur_eval && cur_eval->u.eval.close_paren &&
5627                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5628                 {
5629                     if (ST.count)
5630                         goto fake_end;
5631                     else
5632                         sayNO;
5633                 }
5634             }
5635
5636             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5637             assert(0); /* NOTREACHED */
5638
5639         case CURLYM_B_fail: /* just failed to match a B */
5640             REGCP_UNWIND(ST.cp);
5641             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5642             if (ST.minmod) {
5643                 I32 max = ARG2(ST.me);
5644                 if (max != REG_INFTY && ST.count == max)
5645                     sayNO;
5646                 goto curlym_do_A; /* try to match a further A */
5647             }
5648             /* backtrack one A */
5649             if (ST.count == ARG1(ST.me) /* min */)
5650                 sayNO;
5651             ST.count--;
5652             SET_locinput(HOPc(locinput, -ST.alen));
5653             goto curlym_do_B; /* try to match B */
5654
5655 #undef ST
5656 #define ST st->u.curly
5657
5658 #define CURLY_SETPAREN(paren, success) \
5659     if (paren) { \
5660         if (success) { \
5661             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5662             rex->offs[paren].end = locinput - PL_bostr; \
5663             if (paren > rex->lastparen) \
5664                 rex->lastparen = paren; \
5665             rex->lastcloseparen = paren; \
5666         } \
5667         else { \
5668             rex->offs[paren].end = -1; \
5669             rex->lastparen      = ST.lastparen; \
5670             rex->lastcloseparen = ST.lastcloseparen; \
5671         } \
5672     }
5673
5674         case STAR:              /*  /A*B/ where A is width 1 char */
5675             ST.paren = 0;
5676             ST.min = 0;
5677             ST.max = REG_INFTY;
5678             scan = NEXTOPER(scan);
5679             goto repeat;
5680
5681         case PLUS:              /*  /A+B/ where A is width 1 char */
5682             ST.paren = 0;
5683             ST.min = 1;
5684             ST.max = REG_INFTY;
5685             scan = NEXTOPER(scan);
5686             goto repeat;
5687
5688         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5689             ST.paren = scan->flags;     /* Which paren to set */
5690             ST.lastparen      = rex->lastparen;
5691             ST.lastcloseparen = rex->lastcloseparen;
5692             if (ST.paren > PL_regsize)
5693                 PL_regsize = ST.paren;
5694             ST.min = ARG1(scan);  /* min to match */
5695             ST.max = ARG2(scan);  /* max to match */
5696             if (cur_eval && cur_eval->u.eval.close_paren &&
5697                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5698                 ST.min=1;
5699                 ST.max=1;
5700             }
5701             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5702             goto repeat;
5703
5704         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5705             ST.paren = 0;
5706             ST.min = ARG1(scan);  /* min to match */
5707             ST.max = ARG2(scan);  /* max to match */
5708             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5709           repeat:
5710             /*
5711             * Lookahead to avoid useless match attempts
5712             * when we know what character comes next.
5713             *
5714             * Used to only do .*x and .*?x, but now it allows
5715             * for )'s, ('s and (?{ ... })'s to be in the way
5716             * of the quantifier and the EXACT-like node.  -- japhy
5717             */
5718
5719             assert(ST.min <= ST.max);
5720             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5721                 ST.c1 = ST.c2 = CHRTEST_VOID;
5722             }
5723             else {
5724                 regnode *text_node = next;
5725
5726                 if (! HAS_TEXT(text_node))
5727                     FIND_NEXT_IMPT(text_node);
5728
5729                 if (! HAS_TEXT(text_node))
5730                     ST.c1 = ST.c2 = CHRTEST_VOID;
5731                 else {
5732                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5733                         ST.c1 = ST.c2 = CHRTEST_VOID;
5734                     }
5735                     else {
5736
5737                     /*  Currently we only get here when
5738
5739                         PL_regkind[OP(text_node)] == EXACT
5740
5741                         If this changes back then the macros for IS_TEXT and
5742                         friends need to change. */
5743                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node,
5744                                                               &ST.c1, &ST.c2))
5745                         {
5746                             sayNO;
5747                         }
5748                     }
5749                 }
5750             }
5751
5752             ST.A = scan;
5753             ST.B = next;
5754             if (minmod) {
5755                 char *li = locinput;
5756                 minmod = 0;
5757                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5758                     sayNO;
5759                 SET_locinput(li);
5760                 ST.count = ST.min;
5761                 REGCP_SET(ST.cp);
5762                 if (ST.c1 == CHRTEST_VOID)
5763                     goto curly_try_B_min;
5764
5765                 ST.oldloc = locinput;
5766
5767                 /* set ST.maxpos to the furthest point along the
5768                  * string that could possibly match */
5769                 if  (ST.max == REG_INFTY) {
5770                     ST.maxpos = PL_regeol - 1;
5771                     if (utf8_target)
5772                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5773                             ST.maxpos--;
5774                 }
5775                 else if (utf8_target) {
5776                     int m = ST.max - ST.min;
5777                     for (ST.maxpos = locinput;
5778                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5779                         ST.maxpos += UTF8SKIP(ST.maxpos);
5780                 }
5781                 else {
5782                     ST.maxpos = locinput + ST.max - ST.min;
5783                     if (ST.maxpos >= PL_regeol)
5784                         ST.maxpos = PL_regeol - 1;
5785                 }
5786                 goto curly_try_B_min_known;
5787
5788             }
5789             else {
5790                 /* avoid taking address of locinput, so it can remain
5791                  * a register var */
5792                 char *li = locinput;
5793                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5794                 if (ST.count < ST.min)
5795                     sayNO;
5796                 SET_locinput(li);
5797                 if ((ST.count > ST.min)
5798                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5799                 {
5800                     /* A{m,n} must come at the end of the string, there's
5801                      * no point in backing off ... */
5802                     ST.min = ST.count;
5803                     /* ...except that $ and \Z can match before *and* after
5804                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5805                        We may back off by one in this case. */
5806                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5807                         ST.min--;
5808                 }
5809                 REGCP_SET(ST.cp);
5810                 goto curly_try_B_max;
5811             }
5812             assert(0); /* NOTREACHED */
5813
5814
5815         case CURLY_B_min_known_fail:
5816             /* failed to find B in a non-greedy match where c1,c2 valid */
5817
5818             REGCP_UNWIND(ST.cp);
5819             if (ST.paren) {
5820                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5821             }
5822             /* Couldn't or didn't -- move forward. */
5823             ST.oldloc = locinput;
5824             if (utf8_target)
5825                 locinput += UTF8SKIP(locinput);
5826             else
5827                 locinput++;
5828             ST.count++;
5829           curly_try_B_min_known:
5830              /* find the next place where 'B' could work, then call B */
5831             {
5832                 int n;
5833                 if (utf8_target) {
5834                     n = (ST.oldloc == locinput) ? 0 : 1;
5835                     if (ST.c1 == ST.c2) {
5836                         STRLEN len;
5837                         /* set n to utf8_distance(oldloc, locinput) */
5838                         while (locinput <= ST.maxpos &&
5839                                utf8n_to_uvchr((U8*)locinput,
5840                                               UTF8_MAXBYTES, &len,
5841                                               uniflags) != (UV)ST.c1) {
5842                             locinput += len;
5843                             n++;
5844                         }
5845                     }
5846                     else {
5847                         /* set n to utf8_distance(oldloc, locinput) */
5848                         while (locinput <= ST.maxpos) {
5849                             STRLEN len;
5850                             const UV c = utf8n_to_uvchr((U8*)locinput,
5851                                                   UTF8_MAXBYTES, &len,
5852                                                   uniflags);
5853                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5854                                 break;
5855                             locinput += len;
5856                             n++;
5857                         }
5858                     }
5859                 }
5860                 else {  /* Not utf8_target */
5861                     if (ST.c1 == ST.c2) {
5862                         while (locinput <= ST.maxpos &&
5863                                UCHARAT(locinput) != ST.c1)
5864                             locinput++;
5865                     }
5866                     else {
5867                         while (locinput <= ST.maxpos
5868                                && UCHARAT(locinput) != ST.c1
5869                                && UCHARAT(locinput) != ST.c2)
5870                             locinput++;
5871                     }
5872                     n = locinput - ST.oldloc;
5873                 }
5874                 if (locinput > ST.maxpos)
5875                     sayNO;
5876                 if (n) {
5877                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5878                      * at b; check that everything between oldloc and
5879                      * locinput matches */
5880                     char *li = ST.oldloc;
5881                     ST.count += n;
5882                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5883                         sayNO;
5884                     assert(n == REG_INFTY || locinput == li);
5885                 }
5886                 CURLY_SETPAREN(ST.paren, ST.count);
5887                 if (cur_eval && cur_eval->u.eval.close_paren &&
5888                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5889                     goto fake_end;
5890                 }
5891                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5892             }
5893             assert(0); /* NOTREACHED */
5894
5895
5896         case CURLY_B_min_fail:
5897             /* failed to find B in a non-greedy match where c1,c2 invalid */
5898
5899             REGCP_UNWIND(ST.cp);
5900             if (ST.paren) {
5901                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5902             }
5903             /* failed -- move forward one */
5904             {
5905                 char *li = locinput;
5906                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5907                     sayNO;
5908                 }
5909                 locinput = li;
5910             }
5911             {
5912                 ST.count++;
5913                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5914                         ST.count > 0)) /* count overflow ? */
5915                 {
5916                   curly_try_B_min:
5917                     CURLY_SETPAREN(ST.paren, ST.count);
5918                     if (cur_eval && cur_eval->u.eval.close_paren &&
5919                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5920                         goto fake_end;
5921                     }
5922                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
5923                 }
5924             }
5925             sayNO;
5926             assert(0); /* NOTREACHED */
5927
5928
5929         curly_try_B_max:
5930             /* a successful greedy match: now try to match B */
5931             if (cur_eval && cur_eval->u.eval.close_paren &&
5932                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5933                 goto fake_end;
5934             }
5935             {
5936                 UV c = 0;
5937                 if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
5938                     c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
5939                                            UTF8_MAXBYTES, 0, uniflags)
5940                                 : (UV) UCHARAT(locinput);
5941                 /* If it could work, try it. */
5942                 if (ST.c1 == CHRTEST_VOID
5943                     || (locinput < PL_regeol &&
5944                         (c == (UV)ST.c1 || c == (UV)ST.c2)))
5945                 {
5946                     CURLY_SETPAREN(ST.paren, ST.count);
5947                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
5948                     assert(0); /* NOTREACHED */
5949                 }
5950             }
5951             /* FALL THROUGH */
5952
5953         case CURLY_B_max_fail:
5954             /* failed to find B in a greedy match */
5955
5956             REGCP_UNWIND(ST.cp);
5957             if (ST.paren) {
5958                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5959             }
5960             /*  back up. */
5961             if (--ST.count < ST.min)
5962                 sayNO;
5963             locinput = HOPc(locinput, -1);
5964             goto curly_try_B_max;
5965
5966 #undef ST
5967
5968         case END: /*  last op of main pattern  */
5969             fake_end:
5970             if (cur_eval) {
5971                 /* we've just finished A in /(??{A})B/; now continue with B */
5972                 st->u.eval.toggle_reg_flags
5973                             = cur_eval->u.eval.toggle_reg_flags;
5974                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5975
5976                 st->u.eval.prev_rex = rex_sv;           /* inner */
5977                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5978                 rex_sv = cur_eval->u.eval.prev_rex;
5979                 SET_reg_curpm(rex_sv);
5980                 rex = (struct regexp *)SvANY(rex_sv);
5981                 rexi = RXi_GET(rex);
5982                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5983
5984                 REGCP_SET(st->u.eval.lastcp);
5985
5986                 /* Restore parens of the outer rex without popping the
5987                  * savestack */
5988                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5989
5990                 st->u.eval.prev_eval = cur_eval;
5991                 cur_eval = cur_eval->u.eval.prev_eval;
5992                 DEBUG_EXECUTE_r(
5993                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5994                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5995                 if ( nochange_depth )
5996                     nochange_depth--;
5997
5998                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
5999                                     locinput); /* match B */
6000             }
6001
6002             if (locinput < reginfo->till) {
6003                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6004                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6005                                       PL_colors[4],
6006                                       (long)(locinput - PL_reg_starttry),
6007                                       (long)(reginfo->till - PL_reg_starttry),
6008                                       PL_colors[5]));
6009
6010                 sayNO_SILENT;           /* Cannot match: too short. */
6011             }
6012             sayYES;                     /* Success! */
6013
6014         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6015             DEBUG_EXECUTE_r(
6016             PerlIO_printf(Perl_debug_log,
6017                 "%*s  %ssubpattern success...%s\n",
6018                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6019             sayYES;                     /* Success! */
6020
6021 #undef  ST
6022 #define ST st->u.ifmatch
6023
6024         {
6025             char *newstart;
6026
6027         case SUSPEND:   /* (?>A) */
6028             ST.wanted = 1;
6029             newstart = locinput;
6030             goto do_ifmatch;
6031
6032         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6033             ST.wanted = 0;
6034             goto ifmatch_trivial_fail_test;
6035
6036         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6037             ST.wanted = 1;
6038           ifmatch_trivial_fail_test:
6039             if (scan->flags) {
6040                 char * const s = HOPBACKc(locinput, scan->flags);
6041                 if (!s) {
6042                     /* trivial fail */
6043                     if (logical) {
6044                         logical = 0;
6045                         sw = 1 - cBOOL(ST.wanted);
6046                     }
6047                     else if (ST.wanted)
6048                         sayNO;
6049                     next = scan + ARG(scan);
6050                     if (next == scan)
6051                         next = NULL;
6052                     break;
6053                 }
6054                 newstart = s;
6055             }
6056             else
6057                 newstart = locinput;
6058
6059           do_ifmatch:
6060             ST.me = scan;
6061             ST.logical = logical;
6062             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6063
6064             /* execute body of (?...A) */
6065             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6066             assert(0); /* NOTREACHED */
6067         }
6068
6069         case IFMATCH_A_fail: /* body of (?...A) failed */
6070             ST.wanted = !ST.wanted;
6071             /* FALL THROUGH */
6072
6073         case IFMATCH_A: /* body of (?...A) succeeded */
6074             if (ST.logical) {
6075                 sw = cBOOL(ST.wanted);
6076             }
6077             else if (!ST.wanted)
6078                 sayNO;
6079
6080             if (OP(ST.me) != SUSPEND) {
6081                 /* restore old position except for (?>...) */
6082                 locinput = st->locinput;
6083             }
6084             scan = ST.me + ARG(ST.me);
6085             if (scan == ST.me)
6086                 scan = NULL;
6087             continue; /* execute B */
6088
6089 #undef ST
6090
6091         case LONGJMP: /*  alternative with many branches compiles to
6092                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6093             next = scan + ARG(scan);
6094             if (next == scan)
6095                 next = NULL;
6096             break;
6097
6098         case COMMIT:  /*  (*COMMIT)  */
6099             reginfo->cutpoint = PL_regeol;
6100             /* FALLTHROUGH */
6101
6102         case PRUNE:   /*  (*PRUNE)   */
6103             if (!scan->flags)
6104                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6105             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6106             assert(0); /* NOTREACHED */
6107
6108         case COMMIT_next_fail:
6109             no_final = 1;
6110             /* FALLTHROUGH */
6111
6112         case OPFAIL:   /* (*FAIL)  */
6113             sayNO;
6114             assert(0); /* NOTREACHED */
6115
6116 #define ST st->u.mark
6117         case MARKPOINT: /*  (*MARK:foo)  */
6118             ST.prev_mark = mark_state;
6119             ST.mark_name = sv_commit = sv_yes_mark
6120                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6121             mark_state = st;
6122             ST.mark_loc = locinput;
6123             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6124             assert(0); /* NOTREACHED */
6125
6126         case MARKPOINT_next:
6127             mark_state = ST.prev_mark;
6128             sayYES;
6129             assert(0); /* NOTREACHED */
6130
6131         case MARKPOINT_next_fail:
6132             if (popmark && sv_eq(ST.mark_name,popmark))
6133             {
6134                 if (ST.mark_loc > startpoint)
6135                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6136                 popmark = NULL; /* we found our mark */
6137                 sv_commit = ST.mark_name;
6138
6139                 DEBUG_EXECUTE_r({
6140                         PerlIO_printf(Perl_debug_log,
6141                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6142                             REPORT_CODE_OFF+depth*2, "",
6143                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6144                 });
6145             }
6146             mark_state = ST.prev_mark;
6147             sv_yes_mark = mark_state ?
6148                 mark_state->u.mark.mark_name : NULL;
6149             sayNO;
6150             assert(0); /* NOTREACHED */
6151
6152         case SKIP:  /*  (*SKIP)  */
6153             if (scan->flags) {
6154                 /* (*SKIP) : if we fail we cut here*/
6155                 ST.mark_name = NULL;
6156                 ST.mark_loc = locinput;
6157                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6158             } else {
6159                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6160                    otherwise do nothing.  Meaning we need to scan the chain of
6161                    active marks (mark_state) for one with a matching name. */
6162                 regmatch_state *cur = mark_state;
6163                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6164
6165                 while (cur) {
6166                     if ( sv_eq( cur->u.mark.mark_name,
6167                                 find ) )
6168                     {
6169                         ST.mark_name = find;
6170                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6171                     }
6172                     cur = cur->u.mark.prev_mark;
6173                 }
6174             }
6175             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6176             break;
6177
6178         case SKIP_next_fail:
6179             if (ST.mark_name) {
6180                 /* (*SKIP:NAME) - Set up to search for the name as we
6181                    collapse the stack */
6182                 popmark = ST.mark_name;
6183             } else {
6184                 /* (*SKIP) - No name, we cut here. */
6185                 if (ST.mark_loc > startpoint)
6186                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6187                 /* but we set sv_commit to latest mark_name if there
6188                    is one so they can test to see how things lead to this
6189                    cut */
6190                 if (mark_state)
6191                     sv_commit=mark_state->u.mark.mark_name;
6192             }
6193             no_final = 1;
6194             sayNO;
6195             assert(0); /* NOTREACHED */
6196 #undef ST
6197
6198         case LNBREAK: /* \R */
6199             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6200                 locinput += n;
6201             } else
6202                 sayNO;
6203             break;
6204
6205 #define CASE_CLASS(nAmE)                              \
6206         case nAmE:                                    \
6207             if (NEXTCHR_IS_EOS)                       \
6208                 sayNO;                                \
6209             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6210                 locinput += n;                        \
6211             } else                                    \
6212                 sayNO;                                \
6213             break;                                    \
6214         case N##nAmE:                                 \
6215             if (NEXTCHR_IS_EOS)                       \
6216                 sayNO;                                \
6217             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6218                 sayNO;                                \
6219             } else {                                  \
6220                 locinput += UTF8SKIP(locinput);       \
6221             }                                         \
6222             break
6223
6224         CASE_CLASS(VERTWS);  /*  \v \V  */
6225         CASE_CLASS(HORIZWS); /*  \h \H  */
6226 #undef CASE_CLASS
6227
6228         default:
6229             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6230                           PTR2UV(scan), OP(scan));
6231             Perl_croak(aTHX_ "regexp memory corruption");
6232
6233         /* this is a point to jump to in order to increment
6234          * locinput by one character */
6235         increment_locinput:
6236             if (utf8_target) {
6237                 locinput += PL_utf8skip[nextchr];
6238                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6239                 if (locinput > PL_regeol)
6240                     sayNO;
6241             }
6242             else
6243                 locinput++;
6244             break;
6245
6246         } /* end switch */
6247
6248         /* switch break jumps here */
6249         scan = next; /* prepare to execute the next op and ... */
6250         continue;    /* ... jump back to the top, reusing st */
6251         assert(0); /* NOTREACHED */
6252
6253       push_yes_state:
6254         /* push a state that backtracks on success */
6255         st->u.yes.prev_yes_state = yes_state;
6256         yes_state = st;
6257         /* FALL THROUGH */
6258       push_state:
6259         /* push a new regex state, then continue at scan  */
6260         {
6261             regmatch_state *newst;
6262
6263             DEBUG_STACK_r({
6264                 regmatch_state *cur = st;
6265                 regmatch_state *curyes = yes_state;
6266                 int curd = depth;
6267                 regmatch_slab *slab = PL_regmatch_slab;
6268                 for (;curd > -1;cur--,curd--) {
6269                     if (cur < SLAB_FIRST(slab)) {
6270                         slab = slab->prev;
6271                         cur = SLAB_LAST(slab);
6272                     }
6273                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6274                         REPORT_CODE_OFF + 2 + depth * 2,"",
6275                         curd, PL_reg_name[cur->resume_state],
6276                         (curyes == cur) ? "yes" : ""
6277                     );
6278                     if (curyes == cur)
6279                         curyes = cur->u.yes.prev_yes_state;
6280                 }
6281             } else
6282                 DEBUG_STATE_pp("push")
6283             );
6284             depth++;
6285             st->locinput = locinput;
6286             newst = st+1;
6287             if (newst >  SLAB_LAST(PL_regmatch_slab))
6288                 newst = S_push_slab(aTHX);
6289             PL_regmatch_state = newst;
6290
6291             locinput = pushinput;
6292             st = newst;
6293             continue;
6294             assert(0); /* NOTREACHED */
6295         }
6296     }
6297
6298     /*
6299     * We get here only if there's trouble -- normally "case END" is
6300     * the terminating point.
6301     */
6302     Perl_croak(aTHX_ "corrupted regexp pointers");
6303     /*NOTREACHED*/
6304     sayNO;
6305
6306 yes:
6307     if (yes_state) {
6308         /* we have successfully completed a subexpression, but we must now
6309          * pop to the state marked by yes_state and continue from there */
6310         assert(st != yes_state);
6311 #ifdef DEBUGGING
6312         while (st != yes_state) {
6313             st--;
6314             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6315                 PL_regmatch_slab = PL_regmatch_slab->prev;
6316                 st = SLAB_LAST(PL_regmatch_slab);
6317             }
6318             DEBUG_STATE_r({
6319                 if (no_final) {
6320                     DEBUG_STATE_pp("pop (no final)");
6321                 } else {
6322                     DEBUG_STATE_pp("pop (yes)");
6323                 }
6324             });
6325             depth--;
6326         }
6327 #else
6328         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6329             || yes_state > SLAB_LAST(PL_regmatch_slab))
6330         {
6331             /* not in this slab, pop slab */
6332             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6333             PL_regmatch_slab = PL_regmatch_slab->prev;
6334             st = SLAB_LAST(PL_regmatch_slab);
6335         }
6336         depth -= (st - yes_state);
6337 #endif
6338         st = yes_state;
6339         yes_state = st->u.yes.prev_yes_state;
6340         PL_regmatch_state = st;
6341
6342         if (no_final)
6343             locinput= st->locinput;
6344         state_num = st->resume_state + no_final;
6345         goto reenter_switch;
6346     }
6347
6348     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6349                           PL_colors[4], PL_colors[5]));
6350
6351     if (PL_reg_state.re_state_eval_setup_done) {
6352         /* each successfully executed (?{...}) block does the equivalent of
6353          *   local $^R = do {...}
6354          * When popping the save stack, all these locals would be undone;
6355          * bypass this by setting the outermost saved $^R to the latest
6356          * value */
6357         if (oreplsv != GvSV(PL_replgv))
6358             sv_setsv(oreplsv, GvSV(PL_replgv));
6359     }
6360     result = 1;
6361     goto final_exit;
6362
6363 no:
6364     DEBUG_EXECUTE_r(
6365         PerlIO_printf(Perl_debug_log,
6366             "%*s  %sfailed...%s\n",
6367             REPORT_CODE_OFF+depth*2, "",
6368             PL_colors[4], PL_colors[5])
6369         );
6370
6371 no_silent:
6372     if (no_final) {
6373         if (yes_state) {
6374             goto yes;
6375         } else {
6376             goto final_exit;
6377         }
6378     }
6379     if (depth) {
6380         /* there's a previous state to backtrack to */
6381         st--;
6382         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6383             PL_regmatch_slab = PL_regmatch_slab->prev;
6384             st = SLAB_LAST(PL_regmatch_slab);
6385         }
6386         PL_regmatch_state = st;
6387         locinput= st->locinput;
6388
6389         DEBUG_STATE_pp("pop");
6390         depth--;
6391         if (yes_state == st)
6392             yes_state = st->u.yes.prev_yes_state;
6393
6394         state_num = st->resume_state + 1; /* failure = success + 1 */
6395         goto reenter_switch;
6396     }
6397     result = 0;
6398
6399   final_exit:
6400     if (rex->intflags & PREGf_VERBARG_SEEN) {
6401         SV *sv_err = get_sv("REGERROR", 1);
6402         SV *sv_mrk = get_sv("REGMARK", 1);
6403         if (result) {
6404             sv_commit = &PL_sv_no;
6405             if (!sv_yes_mark)
6406                 sv_yes_mark = &PL_sv_yes;
6407         } else {
6408             if (!sv_commit)
6409                 sv_commit = &PL_sv_yes;
6410             sv_yes_mark = &PL_sv_no;
6411         }
6412         sv_setsv(sv_err, sv_commit);
6413         sv_setsv(sv_mrk, sv_yes_mark);
6414     }
6415
6416
6417     if (last_pushed_cv) {
6418         dSP;
6419         POP_MULTICALL;
6420         PERL_UNUSED_VAR(SP);
6421     }
6422
6423     /* clean up; in particular, free all slabs above current one */
6424     LEAVE_SCOPE(oldsave);
6425
6426     assert(!result ||  locinput - PL_bostr >= 0);
6427     return result ?  locinput - PL_bostr : -1;
6428 }
6429
6430 /*
6431  - regrepeat - repeatedly match something simple, report how many
6432  *
6433  * startposp - pointer a pointer to the start position.  This is updated
6434  *             to point to the byte following the highest successful
6435  *             match.
6436  * p         - the regnode to be repeatedly matched against.
6437  * max       - maximum number of characters to match.
6438  * depth     - (for debugging) backtracking depth.
6439  */
6440 STATIC I32
6441 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6442 {
6443     dVAR;
6444     char *scan;
6445     I32 c;
6446     char *loceol = PL_regeol;
6447     I32 hardcount = 0;
6448     bool utf8_target = PL_reg_match_utf8;
6449     UV utf8_flags;
6450 #ifndef DEBUGGING
6451     PERL_UNUSED_ARG(depth);
6452 #endif
6453
6454     PERL_ARGS_ASSERT_REGREPEAT;
6455
6456     scan = *startposp;
6457     if (max == REG_INFTY)
6458         max = I32_MAX;
6459     else if (max < loceol - scan)
6460         loceol = scan + max;
6461     switch (OP(p)) {
6462     case REG_ANY:
6463         if (utf8_target) {
6464             loceol = PL_regeol;
6465             while (scan < loceol && hardcount < max && *scan != '\n') {
6466                 scan += UTF8SKIP(scan);
6467                 hardcount++;
6468             }
6469         } else {
6470             while (scan < loceol && *scan != '\n')
6471                 scan++;
6472         }
6473         break;
6474     case SANY:
6475         if (utf8_target) {
6476             loceol = PL_regeol;
6477             while (scan < loceol && hardcount < max) {
6478                 scan += UTF8SKIP(scan);
6479                 hardcount++;
6480             }
6481         }
6482         else
6483             scan = loceol;
6484         break;
6485     case CANY:
6486         scan = loceol;
6487         break;
6488     case EXACT:
6489         c = (U8)*STRING(p);
6490
6491         /* Can use a simple loop if the pattern char to match on is invariant
6492          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6493          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6494          * true iff it doesn't matter if the argument is in UTF-8 or not */
6495         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6496             while (scan < loceol && UCHARAT(scan) == c) {
6497                 scan++;
6498             }
6499         }
6500         else if (UTF_PATTERN) {
6501             if (utf8_target) {
6502                 STRLEN scan_char_len;
6503                 loceol = PL_regeol;
6504
6505                 /* When both target and pattern are UTF-8, we have to do s
6506                  * string EQ */
6507                 while (hardcount < max
6508                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6509                        && scan_char_len <= STR_LEN(p)
6510                        && memEQ(scan, STRING(p), scan_char_len))
6511                 {
6512                     scan += scan_char_len;
6513                     hardcount++;
6514                 }
6515             }
6516             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6517
6518                 /* Target isn't utf8; convert the character in the UTF-8
6519                  * pattern to non-UTF8, and do a simple loop */
6520                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6521                 while (scan < loceol && UCHARAT(scan) == c) {
6522                     scan++;
6523                 }
6524             } /* else pattern char is above Latin1, can't possibly match the
6525                  non-UTF-8 target */
6526         }
6527         else {
6528
6529             /* Here, the string must be utf8; pattern isn't, and <c> is
6530              * different in utf8 than not, so can't compare them directly.
6531              * Outside the loop, find the two utf8 bytes that represent c, and
6532              * then look for those in sequence in the utf8 string */
6533             U8 high = UTF8_TWO_BYTE_HI(c);
6534             U8 low = UTF8_TWO_BYTE_LO(c);
6535             loceol = PL_regeol;
6536
6537             while (hardcount < max
6538                     && scan + 1 < loceol
6539                     && UCHARAT(scan) == high
6540                     && UCHARAT(scan + 1) == low)
6541             {
6542                 scan += 2;
6543                 hardcount++;
6544             }
6545         }
6546         break;
6547
6548     case EXACTFA:
6549         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6550         goto do_exactf;
6551
6552     case EXACTFL:
6553         PL_reg_flags |= RF_tainted;
6554         utf8_flags = FOLDEQ_UTF8_LOCALE;
6555         goto do_exactf;
6556
6557     case EXACTF:
6558             utf8_flags = 0;
6559             goto do_exactf;
6560
6561     case EXACTFU_SS:
6562     case EXACTFU_TRICKYFOLD:
6563     case EXACTFU:
6564         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6565
6566     do_exactf:
6567         c = (U8)*STRING(p);
6568
6569         if (utf8_target
6570             || OP(p) == EXACTFU_SS
6571             || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c)))
6572         {
6573             /* Use full Unicode fold matching */
6574             char *tmpeol = loceol;
6575             STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6576             while (hardcount < max
6577                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6578                        STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
6579             {
6580                 scan = tmpeol;
6581                 tmpeol = loceol;
6582                 hardcount++;
6583             }
6584
6585             /* XXX Note that the above handles properly the German sharp s in
6586              * the pattern matching ss in the string.  But it doesn't handle
6587              * properly cases where the string contains say 'LIGATURE ff' and
6588              * the pattern is 'f+'.  This would require, say, a new function or
6589              * revised interface to foldEQ_utf8(), in which the maximum number
6590              * of characters to match could be passed and it would return how
6591              * many actually did.  This is just one of many cases where
6592              * multi-char folds don't work properly, and so the fix is being
6593              * deferred */
6594         }
6595         else {
6596             U8 folded;
6597
6598             /* Here, the string isn't utf8; and either the pattern isn't utf8
6599              * or c is an invariant, so its utf8ness doesn't affect c.  Can
6600              * just do simple comparisons for exact or fold matching. */
6601             switch (OP(p)) {
6602                 case EXACTF: folded = PL_fold[c]; break;
6603                 case EXACTFA:
6604                 case EXACTFU_TRICKYFOLD:
6605                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6606                 case EXACTFL: folded = PL_fold_locale[c]; break;
6607                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6608             }
6609             while (scan < loceol &&
6610                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6611             {
6612                 scan++;
6613             }
6614         }
6615         break;
6616     case ANYOFV:
6617     case ANYOF:
6618         if (utf8_target || OP(p) == ANYOFV) {
6619             STRLEN inclasslen;
6620             loceol = PL_regeol;
6621             inclasslen = loceol - scan;
6622             while (hardcount < max
6623                    && ((inclasslen = loceol - scan) > 0)
6624                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6625             {
6626                 scan += inclasslen;
6627                 hardcount++;
6628             }
6629         } else {
6630             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6631                 scan++;
6632         }
6633         break;
6634     case ALNUMU:
6635         if (utf8_target) {
6636     utf8_wordchar:
6637             loceol = PL_regeol;
6638             LOAD_UTF8_CHARCLASS_ALNUM();
6639             while (hardcount < max && scan < loceol &&
6640                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6641             {
6642                 scan += UTF8SKIP(scan);
6643                 hardcount++;
6644             }
6645         } else {
6646             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6647                 scan++;
6648             }
6649         }
6650         break;
6651     case ALNUM:
6652         if (utf8_target)
6653             goto utf8_wordchar;
6654         while (scan < loceol && isALNUM((U8) *scan)) {
6655             scan++;
6656         }
6657         break;
6658     case ALNUMA:
6659         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6660             scan++;
6661         }
6662         break;
6663     case ALNUML:
6664         PL_reg_flags |= RF_tainted;
6665         if (utf8_target) {
6666             loceol = PL_regeol;
6667             while (hardcount < max && scan < loceol &&
6668                    isALNUM_LC_utf8((U8*)scan)) {
6669                 scan += UTF8SKIP(scan);
6670                 hardcount++;
6671             }
6672         } else {
6673             while (scan < loceol && isALNUM_LC(*scan))
6674                 scan++;
6675         }
6676         break;
6677     case NALNUMU:
6678         if (utf8_target) {
6679
6680     utf8_Nwordchar:
6681
6682             loceol = PL_regeol;
6683             LOAD_UTF8_CHARCLASS_ALNUM();
6684             while (hardcount < max && scan < loceol &&
6685                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6686             {
6687                 scan += UTF8SKIP(scan);
6688                 hardcount++;
6689             }
6690         } else {
6691             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6692                 scan++;
6693             }
6694         }
6695         break;
6696     case NALNUM:
6697         if (utf8_target)
6698             goto utf8_Nwordchar;
6699         while (scan < loceol && ! isALNUM((U8) *scan)) {
6700             scan++;
6701         }
6702         break;
6703
6704     case POSIXA:
6705        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6706             scan++;
6707         }
6708         break;
6709     case NPOSIXA:
6710         if (utf8_target) {
6711             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6712                 scan += UTF8SKIP(scan);
6713             }
6714         }
6715         else {
6716             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6717                 scan++;
6718             }
6719         }
6720         break;
6721     case NALNUMA:
6722         if (utf8_target) {
6723             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6724                 scan += UTF8SKIP(scan);
6725             }
6726         }
6727         else {
6728             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6729                 scan++;
6730             }
6731         }
6732         break;
6733     case NALNUML:
6734         PL_reg_flags |= RF_tainted;
6735         if (utf8_target) {
6736             loceol = PL_regeol;
6737             while (hardcount < max && scan < loceol &&
6738                    !isALNUM_LC_utf8((U8*)scan)) {
6739                 scan += UTF8SKIP(scan);
6740                 hardcount++;
6741             }
6742         } else {
6743             while (scan < loceol && !isALNUM_LC(*scan))
6744                 scan++;
6745         }
6746         break;
6747     case SPACEU:
6748         if (utf8_target) {
6749
6750     utf8_space:
6751
6752             loceol = PL_regeol;
6753             LOAD_UTF8_CHARCLASS_SPACE();
6754             while (hardcount < max && scan < loceol &&
6755                    (*scan == ' ' ||
6756                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6757             {
6758                 scan += UTF8SKIP(scan);
6759                 hardcount++;
6760             }
6761             break;
6762         }
6763         else {
6764             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6765                 scan++;
6766             }
6767             break;
6768         }
6769     case SPACE:
6770         if (utf8_target)
6771             goto utf8_space;
6772
6773         while (scan < loceol && isSPACE((U8) *scan)) {
6774             scan++;
6775         }
6776         break;
6777     case SPACEA:
6778         while (scan < loceol && isSPACE_A((U8) *scan)) {
6779             scan++;
6780         }
6781         break;
6782     case SPACEL:
6783         PL_reg_flags |= RF_tainted;
6784         if (utf8_target) {
6785             loceol = PL_regeol;
6786             while (hardcount < max && scan < loceol &&
6787                    isSPACE_LC_utf8((U8*)scan)) {
6788                 scan += UTF8SKIP(scan);
6789                 hardcount++;
6790             }
6791         } else {
6792             while (scan < loceol && isSPACE_LC(*scan))
6793                 scan++;
6794         }
6795         break;
6796     case NSPACEU:
6797         if (utf8_target) {
6798
6799     utf8_Nspace:
6800
6801             loceol = PL_regeol;
6802             LOAD_UTF8_CHARCLASS_SPACE();
6803             while (hardcount < max && scan < loceol &&
6804                    ! (*scan == ' ' ||
6805                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6806             {
6807                 scan += UTF8SKIP(scan);
6808                 hardcount++;
6809             }
6810             break;
6811         }
6812         else {
6813             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6814                 scan++;
6815             }
6816         }
6817         break;
6818     case NSPACE:
6819         if (utf8_target)
6820             goto utf8_Nspace;
6821
6822         while (scan < loceol && ! isSPACE((U8) *scan)) {
6823             scan++;
6824         }
6825         break;
6826     case NSPACEA:
6827         if (utf8_target) {
6828             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6829                 scan += UTF8SKIP(scan);
6830             }
6831         }
6832         else {
6833             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6834                 scan++;
6835             }
6836         }
6837         break;
6838     case NSPACEL:
6839         PL_reg_flags |= RF_tainted;
6840         if (utf8_target) {
6841             loceol = PL_regeol;
6842             while (hardcount < max && scan < loceol &&
6843                    !isSPACE_LC_utf8((U8*)scan)) {
6844                 scan += UTF8SKIP(scan);
6845                 hardcount++;
6846             }
6847         } else {
6848             while (scan < loceol && !isSPACE_LC(*scan))
6849                 scan++;
6850         }
6851         break;
6852     case DIGIT:
6853         if (utf8_target) {
6854             loceol = PL_regeol;
6855             LOAD_UTF8_CHARCLASS_DIGIT();
6856             while (hardcount < max && scan < loceol &&
6857                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6858                 scan += UTF8SKIP(scan);
6859                 hardcount++;
6860             }
6861         } else {
6862             while (scan < loceol && isDIGIT(*scan))
6863                 scan++;
6864         }
6865         break;
6866     case DIGITA:
6867         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6868             scan++;
6869         }
6870         break;
6871     case DIGITL:
6872         PL_reg_flags |= RF_tainted;
6873         if (utf8_target) {
6874             loceol = PL_regeol;
6875             while (hardcount < max && scan < loceol &&
6876                    isDIGIT_LC_utf8((U8*)scan)) {
6877                 scan += UTF8SKIP(scan);
6878                 hardcount++;
6879             }
6880         } else {
6881             while (scan < loceol && isDIGIT_LC(*scan))
6882                 scan++;
6883         }
6884         break;
6885     case NDIGIT:
6886         if (utf8_target) {
6887             loceol = PL_regeol;
6888             LOAD_UTF8_CHARCLASS_DIGIT();
6889             while (hardcount < max && scan < loceol &&
6890                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6891                 scan += UTF8SKIP(scan);
6892                 hardcount++;
6893             }
6894         } else {
6895             while (scan < loceol && !isDIGIT(*scan))
6896                 scan++;
6897         }
6898         break;
6899     case NDIGITA:
6900         if (utf8_target) {
6901             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6902                 scan += UTF8SKIP(scan);
6903             }
6904         }
6905         else {
6906             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6907                 scan++;
6908             }
6909         }
6910         break;
6911     case NDIGITL:
6912         PL_reg_flags |= RF_tainted;
6913         if (utf8_target) {
6914             loceol = PL_regeol;
6915             while (hardcount < max && scan < loceol &&
6916                    !isDIGIT_LC_utf8((U8*)scan)) {
6917                 scan += UTF8SKIP(scan);
6918                 hardcount++;
6919             }
6920         } else {
6921             while (scan < loceol && !isDIGIT_LC(*scan))
6922                 scan++;
6923         }
6924         break;
6925     case LNBREAK:
6926         if (utf8_target) {
6927             loceol = PL_regeol;
6928             while (hardcount < max && scan < loceol &&
6929                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
6930                 scan += c;
6931                 hardcount++;
6932             }
6933         } else {
6934             /*
6935               LNBREAK can match two latin chars, which is ok,
6936               because we have a null terminated string, but we
6937               have to use hardcount in this situation
6938             */
6939             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
6940                 scan+=c;
6941                 hardcount++;
6942             }
6943         }
6944         break;
6945     case HORIZWS:
6946         if (utf8_target) {
6947             loceol = PL_regeol;
6948             while (hardcount < max && scan < loceol &&
6949                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
6950             {
6951                 scan += c;
6952                 hardcount++;
6953             }
6954         } else {
6955             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
6956                 scan++;
6957         }
6958         break;
6959     case NHORIZWS:
6960         if (utf8_target) {
6961             loceol = PL_regeol;
6962             while (hardcount < max && scan < loceol &&
6963                         !is_HORIZWS_utf8_safe(scan, loceol))
6964             {
6965                 scan += UTF8SKIP(scan);
6966                 hardcount++;
6967             }
6968         } else {
6969             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
6970                 scan++;
6971
6972         }
6973         break;
6974     case VERTWS:
6975         if (utf8_target) {
6976             loceol = PL_regeol;
6977             while (hardcount < max && scan < loceol &&
6978                             (c=is_VERTWS_utf8_safe(scan, loceol)))
6979             {
6980                 scan += c;
6981                 hardcount++;
6982             }
6983         } else {
6984             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
6985                 scan++;
6986
6987         }
6988         break;
6989     case NVERTWS:
6990         if (utf8_target) {
6991             loceol = PL_regeol;
6992             while (hardcount < max && scan < loceol &&
6993                                 !is_VERTWS_utf8_safe(scan, loceol))
6994             {
6995                 scan += UTF8SKIP(scan);
6996                 hardcount++;
6997             }
6998         } else {
6999             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7000                 scan++;
7001
7002         }
7003         break;
7004
7005     default:            /* Called on something of 0 width. */
7006         break;          /* So match right here or not at all. */
7007     }
7008
7009     if (hardcount)
7010         c = hardcount;
7011     else
7012         c = scan - *startposp;
7013     *startposp = scan;
7014
7015     DEBUG_r({
7016         GET_RE_DEBUG_FLAGS_DECL;
7017         DEBUG_EXECUTE_r({
7018             SV * const prop = sv_newmortal();
7019             regprop(prog, prop, p);
7020             PerlIO_printf(Perl_debug_log,
7021                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7022                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7023         });
7024     });
7025
7026     return(c);
7027 }
7028
7029
7030 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7031 /*
7032 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7033 create a copy so that changes the caller makes won't change the shared one
7034  */
7035 SV *
7036 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7037 {
7038     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7039     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
7040 }
7041 #endif
7042
7043 STATIC SV *
7044 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7045 {
7046     /* Returns the swash for the input 'node' in the regex 'prog'.
7047      * If <doinit> is true, will attempt to create the swash if not already
7048      *    done.
7049      * If <listsvp> is non-null, will return the swash initialization string in
7050      *    it.
7051      * If <altsvp> is non-null, will return the alternates to the regular swash
7052      *    in it
7053      * Tied intimately to how regcomp.c sets up the data structure */
7054
7055     dVAR;
7056     SV *sw  = NULL;
7057     SV *si  = NULL;
7058     SV *alt = NULL;
7059     SV*  invlist = NULL;
7060
7061     RXi_GET_DECL(prog,progi);
7062     const struct reg_data * const data = prog ? progi->data : NULL;
7063
7064     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7065
7066     assert(ANYOF_NONBITMAP(node));
7067
7068     if (data && data->count) {
7069         const U32 n = ARG(node);
7070
7071         if (data->what[n] == 's') {
7072             SV * const rv = MUTABLE_SV(data->data[n]);
7073             AV * const av = MUTABLE_AV(SvRV(rv));
7074             SV **const ary = AvARRAY(av);
7075             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7076
7077             si = *ary;  /* ary[0] = the string to initialize the swash with */
7078
7079             /* Elements 3 and 4 are either both present or both absent. [3] is
7080              * any inversion list generated at compile time; [4] indicates if
7081              * that inversion list has any user-defined properties in it. */
7082             if (av_len(av) >= 3) {
7083                 invlist = ary[3];
7084                 if (SvUV(ary[4])) {
7085                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7086                 }
7087             }
7088             else {
7089                 invlist = NULL;
7090             }
7091
7092             /* Element [1] is reserved for the set-up swash.  If already there,
7093              * return it; if not, create it and store it there */
7094             if (SvROK(ary[1])) {
7095                 sw = ary[1];
7096             }
7097             else if (si && doinit) {
7098
7099                 sw = _core_swash_init("utf8", /* the utf8 package */
7100                                       "", /* nameless */
7101                                       si,
7102                                       1, /* binary */
7103                                       0, /* not from tr/// */
7104                                       invlist,
7105                                       &swash_init_flags);
7106                 (void)av_store(av, 1, sw);
7107             }
7108
7109             /* Element [2] is for any multi-char folds.  Note that is a
7110              * fundamentally flawed design, because can't backtrack and try
7111              * again.  See [perl #89774] */
7112             if (SvTYPE(ary[2]) == SVt_PVAV) {
7113                 alt = ary[2];
7114             }
7115         }
7116     }
7117
7118     if (listsvp) {
7119         SV* matches_string = newSVpvn("", 0);
7120
7121         /* Use the swash, if any, which has to have incorporated into it all
7122          * possibilities */
7123         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7124             && (si && si != &PL_sv_undef))
7125         {
7126
7127             /* If no swash, use the input initialization string, if available */
7128             sv_catsv(matches_string, si);
7129         }
7130
7131         /* Add the inversion list to whatever we have.  This may have come from
7132          * the swash, or from an input parameter */
7133         if (invlist) {
7134             sv_catsv(matches_string, _invlist_contents(invlist));
7135         }
7136         *listsvp = matches_string;
7137     }
7138
7139     if (altsvp)
7140         *altsvp  = alt;
7141
7142     return sw;
7143 }
7144
7145 /*
7146  - reginclass - determine if a character falls into a character class
7147
7148   n is the ANYOF regnode
7149   p is the target string
7150   lenp is pointer to the maximum number of bytes of how far to go in p
7151     (This is assumed without checking to always be at least the current
7152     character's size)
7153   utf8_target tells whether p is in UTF-8.
7154
7155   Returns true if matched; false otherwise.  If lenp is not NULL, on return
7156   from a successful match, the value it points to will be updated to how many
7157   bytes in p were matched.  If there was no match, the value is undefined,
7158   possibly changed from the input.
7159
7160   Note that this can be a synthetic start class, a combination of various
7161   nodes, so things you think might be mutually exclusive, such as locale,
7162   aren't.  It can match both locale and non-locale
7163
7164  */
7165
7166 STATIC bool
7167 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
7168 {
7169     dVAR;
7170     const char flags = ANYOF_FLAGS(n);
7171     bool match = FALSE;
7172     UV c = *p;
7173     STRLEN c_len = 0;
7174     STRLEN maxlen;
7175
7176     PERL_ARGS_ASSERT_REGINCLASS;
7177
7178     /* If c is not already the code point, get it */
7179     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
7180         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7181                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7182                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7183                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7184                  * UTF8_ALLOW_FFFF */
7185         if (c_len == (STRLEN)-1)
7186             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7187     }
7188     else {
7189         c_len = 1;
7190     }
7191
7192     /* Use passed in max length, or one character if none passed in or less
7193      * than one character.  And assume will match just one character.  This is
7194      * overwritten later if matched more. */
7195     if (lenp) {
7196         maxlen = (*lenp > c_len) ? *lenp : c_len;
7197         *lenp = c_len;
7198
7199     }
7200     else {
7201         maxlen = c_len;
7202     }
7203
7204     /* If this character is potentially in the bitmap, check it */
7205     if (c < 256) {
7206         if (ANYOF_BITMAP_TEST(n, c))
7207             match = TRUE;
7208         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7209                 && ! utf8_target
7210                 && ! isASCII(c))
7211         {
7212             match = TRUE;
7213         }
7214
7215         else if (flags & ANYOF_LOCALE) {
7216             PL_reg_flags |= RF_tainted;
7217
7218             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
7219                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7220             {
7221                 match = TRUE;
7222             }
7223             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7224                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7225                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7226                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7227                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7228                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7229                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7230                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7231                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7232                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7233                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7234                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7235                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7236                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7237                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7238                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7239                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7240                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7241                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7242                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7243                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7244                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7245                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7246                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7247                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7248                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7249                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7250                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7251                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7252                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7253                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7254                      ) /* How's that for a conditional? */
7255             ) {
7256                 match = TRUE;
7257             }
7258         }
7259     }
7260
7261     /* If the bitmap didn't (or couldn't) match, and something outside the
7262      * bitmap could match, try that.  Locale nodes specifiy completely the
7263      * behavior of code points in the bit map (otherwise, a utf8 target would
7264      * cause them to be treated as Unicode and not locale), except in
7265      * the very unlikely event when this node is a synthetic start class, which
7266      * could be a combination of locale and non-locale nodes.  So allow locale
7267      * to match for the synthetic start class, which will give a false
7268      * positive that will be resolved when the match is done again as not part
7269      * of the synthetic start class */
7270     if (!match) {
7271         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7272             match = TRUE;       /* Everything above 255 matches */
7273         }
7274         else if (ANYOF_NONBITMAP(n)
7275                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7276                      || (utf8_target
7277                          && (c >=256
7278                              || (! (flags & ANYOF_LOCALE))
7279                              || (flags & ANYOF_IS_SYNTHETIC)))))
7280         {
7281             AV *av;
7282             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7283
7284             if (sw) {
7285                 U8 * utf8_p;
7286                 if (utf8_target) {
7287                     utf8_p = (U8 *) p;
7288                 } else {
7289
7290                     /* Not utf8.  Convert as much of the string as available up
7291                      * to the limit of how far the (single) character in the
7292                      * pattern can possibly match (no need to go further).  If
7293                      * the node is a straight ANYOF or not folding, it can't
7294                      * match more than one.  Otherwise, It can match up to how
7295                      * far a single char can fold to.  Since not utf8, each
7296                      * character is a single byte, so the max it can be in
7297                      * bytes is the same as the max it can be in characters */
7298                     STRLEN len = (OP(n) == ANYOF
7299                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
7300                                   ? 1
7301                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
7302                                     ? maxlen
7303                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
7304                     utf8_p = bytes_to_utf8(p, &len);
7305                 }
7306
7307                 if (swash_fetch(sw, utf8_p, TRUE))
7308                     match = TRUE;
7309                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
7310
7311                     /* Here, we need to test if the fold of the target string
7312                      * matches.  The non-multi char folds have all been moved to
7313                      * the compilation phase, and the multi-char folds have
7314                      * been stored by regcomp into 'av'; we linearly check to
7315                      * see if any match the target string (folded).   We know
7316                      * that the originals were each one character, but we don't
7317                      * currently know how many characters/bytes each folded to,
7318                      * except we do know that there are small limits imposed by
7319                      * Unicode.  XXX A performance enhancement would be to have
7320                      * regcomp.c store the max number of chars/bytes that are
7321                      * in an av entry, as, say the 0th element.  Even better
7322                      * would be to have a hash of the few characters that can
7323                      * start a multi-char fold to the max number of chars of
7324                      * those folds.
7325                      *
7326                      * If there is a match, we will need to advance (if lenp is
7327                      * specified) the match pointer in the target string.  But
7328                      * what we are comparing here isn't that string directly,
7329                      * but its fold, whose length may differ from the original.
7330                      * As we go along in constructing the fold, therefore, we
7331                      * create a map so that we know how many bytes in the
7332                      * source to advance given that we have matched a certain
7333                      * number of bytes in the fold.  This map is stored in
7334                      * 'map_fold_len_back'.  Let n mean the number of bytes in
7335                      * the fold of the first character that we are folding.
7336                      * Then map_fold_len_back[n] is set to the number of bytes
7337                      * in that first character.  Similarly let m be the
7338                      * corresponding number for the second character to be
7339                      * folded.  Then map_fold_len_back[n+m] is set to the
7340                      * number of bytes occupied by the first two source
7341                      * characters. ... */
7342                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
7343                     U8 folded[UTF8_MAXBYTES_CASE+1];
7344                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
7345                     STRLEN total_foldlen = 0; /* num bytes in fold of all
7346                                                   chars */
7347
7348                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7349
7350                         /* Here, only need to fold the first char of the target
7351                          * string.  It the source wasn't utf8, is 1 byte long */
7352                         to_utf8_fold(utf8_p, folded, &foldlen);
7353                         total_foldlen = foldlen;
7354                         map_fold_len_back[foldlen] = (utf8_target)
7355                                                      ? UTF8SKIP(utf8_p)
7356                                                      : 1;
7357                     }
7358                     else {
7359
7360                         /* Here, need to fold more than the first char.  Do so
7361                          * up to the limits */
7362                         U8* source_ptr = utf8_p;    /* The source for the fold
7363                                                        is the regex target
7364                                                        string */
7365                         U8* folded_ptr = folded;
7366                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
7367                                                        available byte in the
7368                                                        target string */
7369                         U8 i;
7370                         for (i = 0;
7371                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7372                              i++)
7373                         {
7374
7375                             /* Fold the next character */
7376                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7377                             STRLEN this_char_foldlen;
7378                             to_utf8_fold(source_ptr,
7379                                          this_char_folded,
7380                                          &this_char_foldlen);
7381
7382                             /* Bail if it would exceed the byte limit for
7383                              * folding a single char. */
7384                             if (this_char_foldlen + folded_ptr - folded >
7385                                                             UTF8_MAXBYTES_CASE)
7386                             {
7387                                 break;
7388                             }
7389
7390                             /* Add the fold of this character */
7391                             Copy(this_char_folded,
7392                                  folded_ptr,
7393                                  this_char_foldlen,
7394                                  U8);
7395                             source_ptr += UTF8SKIP(source_ptr);
7396                             folded_ptr += this_char_foldlen;
7397                             total_foldlen = folded_ptr - folded;
7398
7399                             /* Create map from the number of bytes in the fold
7400                              * back to the number of bytes in the source.  If
7401                              * the source isn't utf8, the byte count is just
7402                              * the number of characters so far */
7403                             map_fold_len_back[total_foldlen]
7404                                                       = (utf8_target)
7405                                                         ? source_ptr - utf8_p
7406                                                         : i + 1;
7407                         }
7408                         *folded_ptr = '\0';
7409                     }
7410
7411
7412                     /* Do the linear search to see if the fold is in the list
7413                      * of multi-char folds. */
7414                     if (av) {
7415                         I32 i;
7416                         for (i = 0; i <= av_len(av); i++) {
7417                             SV* const sv = *av_fetch(av, i, FALSE);
7418                             STRLEN len;
7419                             const char * const s = SvPV_const(sv, len);
7420
7421                             if (len <= total_foldlen
7422                                 && memEQ(s, (char*)folded, len)
7423
7424                                    /* If 0, means matched a partial char. See
7425                                     * [perl #90536] */
7426                                 && map_fold_len_back[len])
7427                             {
7428
7429                                 /* Advance the target string ptr to account for
7430                                  * this fold, but have to translate from the
7431                                  * folded length to the corresponding source
7432                                  * length. */
7433                                 if (lenp) {
7434                                     *lenp = map_fold_len_back[len];
7435                                 }
7436                                 match = TRUE;
7437                                 break;
7438                             }
7439                         }
7440                     }
7441                 }
7442
7443                 /* If we allocated a string above, free it */
7444                 if (! utf8_target) Safefree(utf8_p);
7445             }
7446         }
7447
7448         if (UNICODE_IS_SUPER(c)
7449             && (flags & ANYOF_WARN_SUPER)
7450             && ckWARN_d(WARN_NON_UNICODE))
7451         {
7452             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7453                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7454         }
7455     }
7456
7457     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7458     return cBOOL(flags & ANYOF_INVERT) ^ match;
7459 }
7460
7461 STATIC U8 *
7462 S_reghop3(U8 *s, I32 off, const U8* lim)
7463 {
7464     /* return the position 'off' UTF-8 characters away from 's', forward if
7465      * 'off' >= 0, backwards if negative.  But don't go outside of position
7466      * 'lim', which better be < s  if off < 0 */
7467
7468     dVAR;
7469
7470     PERL_ARGS_ASSERT_REGHOP3;
7471
7472     if (off >= 0) {
7473         while (off-- && s < lim) {
7474             /* XXX could check well-formedness here */
7475             s += UTF8SKIP(s);
7476         }
7477     }
7478     else {
7479         while (off++ && s > lim) {
7480             s--;
7481             if (UTF8_IS_CONTINUED(*s)) {
7482                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7483                     s--;
7484             }
7485             /* XXX could check well-formedness here */
7486         }
7487     }
7488     return s;
7489 }
7490
#ifdef XXX_dmq
/* Several call sites currently chain two reghop3() calls where this routine
   would do the job in one.  Until those call sites are converted, it stays
   compiled out - dmq
*/
STATIC U8 *
S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
{
    /* Like reghop3(), but with a separate limit per direction: 'rlim' bounds
     * a forward hop and 'llim' bounds a backward hop. */

    dVAR;

    PERL_ARGS_ASSERT_REGHOP4;

    if (off < 0) {

        /* Backwards, never below 'llim' */
        for (; off != 0 && s > llim; off++) {
            s--;
            if (UTF8_IS_CONTINUED(*s)) {
                while (s > llim && UTF8_IS_CONTINUATION(*s)) {
                    s--;
                }
            }
            /* XXX could check well-formedness here */
        }
        return s;
    }

    /* Forwards, stopping once 'rlim' is reached */
    for (; off != 0 && s < rlim; off--) {
        /* XXX could check well-formedness here */
        s += UTF8SKIP(s);
    }
    return s;
}
#endif
7522
7523 STATIC U8 *
7524 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7525 {
7526     dVAR;
7527
7528     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7529
7530     if (off >= 0) {
7531         while (off-- && s < lim) {
7532             /* XXX could check well-formedness here */
7533             s += UTF8SKIP(s);
7534         }
7535         if (off >= 0)
7536             return NULL;
7537     }
7538     else {
7539         while (off++ && s > lim) {
7540             s--;
7541             if (UTF8_IS_CONTINUED(*s)) {
7542                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7543                     s--;
7544             }
7545             /* XXX could check well-formedness here */
7546         }
7547         if (off <= 0)
7548             return NULL;
7549     }
7550     return s;
7551 }
7552
7553 static void
7554 restore_pos(pTHX_ void *arg)
7555 {
7556     dVAR;
7557     regexp * const rex = (regexp *)arg;
7558     if (PL_reg_state.re_state_eval_setup_done) {
7559         if (PL_reg_oldsaved) {
7560             rex->subbeg = PL_reg_oldsaved;
7561             rex->sublen = PL_reg_oldsavedlen;
7562             rex->suboffset = PL_reg_oldsavedoffset;
7563             rex->subcoffset = PL_reg_oldsavedcoffset;
7564 #ifdef PERL_OLD_COPY_ON_WRITE
7565             rex->saved_copy = PL_nrs;
7566 #endif
7567             RXp_MATCH_COPIED_on(rex);
7568         }
7569         PL_reg_magic->mg_len = PL_reg_oldpos;
7570         PL_reg_state.re_state_eval_setup_done = FALSE;
7571         PL_curpm = PL_reg_oldcurpm;
7572     }
7573 }
7574
7575 STATIC void
7576 S_to_utf8_substr(pTHX_ register regexp *prog)
7577 {
7578     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7579      * on the converted value */
7580
7581     int i = 1;
7582
7583     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7584
7585     do {
7586         if (prog->substrs->data[i].substr
7587             && !prog->substrs->data[i].utf8_substr) {
7588             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7589             prog->substrs->data[i].utf8_substr = sv;
7590             sv_utf8_upgrade(sv);
7591             if (SvVALID(prog->substrs->data[i].substr)) {
7592                 if (SvTAIL(prog->substrs->data[i].substr)) {
7593                     /* Trim the trailing \n that fbm_compile added last
7594                        time.  */
7595                     SvCUR_set(sv, SvCUR(sv) - 1);
7596                     /* Whilst this makes the SV technically "invalid" (as its
7597                        buffer is no longer followed by "\0") when fbm_compile()
7598                        adds the "\n" back, a "\0" is restored.  */
7599                     fbm_compile(sv, FBMcf_TAIL);
7600                 } else
7601                     fbm_compile(sv, 0);
7602             }
7603             if (prog->substrs->data[i].substr == prog->check_substr)
7604                 prog->check_utf8 = sv;
7605         }
7606     } while (i--);
7607 }
7608
7609 STATIC bool
7610 S_to_byte_substr(pTHX_ register regexp *prog)
7611 {
7612     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7613      * on the converted value; returns FALSE if can't be converted. */
7614
7615     dVAR;
7616     int i = 1;
7617
7618     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7619
7620     do {
7621         if (prog->substrs->data[i].utf8_substr
7622             && !prog->substrs->data[i].substr) {
7623             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7624             if (! sv_utf8_downgrade(sv, TRUE)) {
7625                 return FALSE;
7626             }
7627             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7628                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7629                     /* Trim the trailing \n that fbm_compile added last
7630                         time.  */
7631                     SvCUR_set(sv, SvCUR(sv) - 1);
7632                     fbm_compile(sv, FBMcf_TAIL);
7633                 } else
7634                     fbm_compile(sv, 0);
7635             }
7636             prog->substrs->data[i].substr = sv;
7637             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7638                 prog->check_substr = sv;
7639         }
7640     } while (i--);
7641
7642     return TRUE;
7643 }
7644
7645 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7646  * pre-composed Hangul syllables, which are all in a contiguous block and
7647  * arranged there in such a way so as to facilitate algorithmic determination of
7648  * their characteristics.  As such, they don't need a swash, but can be
7649  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7650  * is a GCB=LV */
7651 #define SBASE 0xAC00    /* Start of block */
7652 #define SCount 11172    /* Length of block */
7653 #define TCount 28       /* Counting from SBASE, every TCount-th one is GCB=LV */
7654
#if 0   /* This routine is not currently used */
PERL_STATIC_INLINE bool
S_is_utf8_X_LV(pTHX_ const U8 *p)
{
    /* Returns whether the character at p is GCB=LV.  In contrast to most of
     * the similarly named routines here, no usable swash is built for this,
     * so swash_fetch() cannot be used on PL_utf8_X_LV; the answer comes from
     * arithmetic on the code point. */

    dVAR;

    const UV cp = valid_utf8_to_uvchr(p, NULL);

    PERL_ARGS_ASSERT_IS_UTF8_X_LV;

    /* First call: find out whether this Unicode version has the precomposed
     * Hangul syllables at all (the earliest releases did not).  If the
     * inversion list is empty, remember that by pointing at undef, which
     * makes every call return false. */
    if (! PL_utf8_X_LV) {
        PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
            SvREFCNT_dec(PL_utf8_X_LV);
            PL_utf8_X_LV = &PL_sv_undef;
        }
    }

    if (PL_utf8_X_LV == &PL_sv_undef) {
        return FALSE;
    }

    /* Must lie inside the syllable block ... */
    if (cp < SBASE || cp >= SBASE + SCount) {
        return FALSE;
    }

    /* ... and only every TCount-th one in it is LV */
    return (cp - SBASE) % TCount == 0;
}
#endif
7684
7685 PERL_STATIC_INLINE bool
7686 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7687 {
7688     /* Unlike most other similarly named routines here, this does not create a
7689      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7690
7691     dVAR;
7692
7693     UV cp = valid_utf8_to_uvchr(p, NULL);
7694
7695     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7696
7697     /* The earliest Unicode releases did not have these precomposed Hangul
7698      * syllables.  Set to point to undef in that case, so will return false on
7699      * every call */
7700     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7701         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7702         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7703             SvREFCNT_dec(PL_utf8_X_LVT);
7704             PL_utf8_X_LVT = &PL_sv_undef;
7705         }
7706     }
7707
7708     return (PL_utf8_X_LVT != &PL_sv_undef
7709             && cp >= SBASE && cp < SBASE + SCount
7710             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7711 }
7712
7713 /*
7714  * Local variables:
7715  * c-indentation-style: bsd
7716  * c-basic-offset: 4
7717  * indent-tabs-mode: nil
7718  * End:
7719  *
7720  * ex: set ts=8 sts=4 sw=4 et:
7721  */