regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only if 'c', the character being looke-up, is an invariant under
 111  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 112  * everything matchable is straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 114                                 ? reginclass(prog,p,c,c+1,u)                \
 115                                 : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122
 123 #define HOPc(pos,off) \
 124         (char *)(reginfo->is_utf8_target \
 125             ? reghop3((U8*)pos, off, \
 126                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 127             : (U8*)(pos + off))
 128
 129 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 130 #define HOPBACK3(pos, off, lim) \
 131         (reginfo->is_utf8_target                          \
 132             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 133             : (pos - off >= lim)                                 \
 134                 ? (U8*)pos - off                                 \
 135                 : NULL)
 136
 137 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 138
 139 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 140 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 141
 142 /* lim must be +ve. Returns NULL on overshoot */
 143 #define HOPMAYBE3(pos,off,lim) \
 144         (reginfo->is_utf8_target                        \
 145             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 146             : ((U8*)pos + off <= lim)                   \
 147                 ? (U8*)pos + off                        \
 148                 : NULL)
 149
 150 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 151  * off must be >=0; args should be vars rather than expressions */
 152 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 153     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 154     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 155 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 156
 157 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 158     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 159     : (U8*)(pos + off))
 160 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 161
 162 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 163 #define NEXTCHR_IS_EOS (nextchr < 0)
 164
 165 #define SET_nextchr \
 166     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 167
 168 #define SET_locinput(p) \
 169     locinput = (p);  \
 170     SET_nextchr
 171
 172 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 173 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 174
 175 /* for use after a quantifier and before an EXACT-like node -- japhy */
 176 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 177  *
 178  * NOTE that *nothing* that affects backtracking should be in here, specifically
 179  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 180  * node that is in between two EXACT like nodes when ascertaining what the required
 181  * "follow" character is. This should probably be moved to regex compile time
 182  * although it may be done at run time beause of the REF possibility - more
 183  * investigation required. -- demerphq
 184 */
 185 #define JUMPABLE(rn) (                                                             \
 186     OP(rn) == OPEN ||                                                              \
 187     (OP(rn) == CLOSE &&                                                            \
 188      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 189     OP(rn) == EVAL ||                                                              \
 190     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 191     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 192     OP(rn) == KEEPS ||                                                             \
 193     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 194 )
 195 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 196
 197 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 198
 199 /*
 200   Search for mandatory following text node; for lookahead, the text must
 201   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 202 */
 203 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 204     while (JUMPABLE(rn)) { \
 205         const OPCODE type = OP(rn); \
 206         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 207             rn = NEXTOPER(NEXTOPER(rn)); \
 208         else if (type == PLUS) \
 209             rn = NEXTOPER(rn); \
 210         else if (type == IFMATCH) \
 211             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 212         else rn += NEXT_OFF(rn); \
 213     } \
 214 } STMT_END
 215
 216 #define SLAB_FIRST(s) (&(s)->states[0])
 217 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 218
 219 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 220 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 221 static regmatch_state * S_push_slab(pTHX);
 222
 223 #define REGCP_PAREN_ELEMS 3
 224 #define REGCP_OTHER_ELEMS 3
 225 #define REGCP_FRAME_ELEMS 1
 226 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 227  * are needed for the regexp context stack bookkeeping. */
 228
 229 STATIC CHECKPOINT
 230 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 231 {
 232     const int retval = PL_savestack_ix;
 233     const int paren_elems_to_push =
 234                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 235     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 236     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 237     I32 p;
 238     GET_RE_DEBUG_FLAGS_DECL;
 239
 240     PERL_ARGS_ASSERT_REGCPPUSH;
 241
 242     if (paren_elems_to_push < 0)
 243         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 244                    (int)paren_elems_to_push, (int)maxopenparen,
 245                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 246
 247     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 248         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 249                    " out of range (%lu-%ld)",
 250                    total_elems,
 251                    (unsigned long)maxopenparen,
 252                    (long)parenfloor);
 253
 254     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 255
 256     DEBUG_BUFFERS_r(
 257         if ((int)maxopenparen > (int)parenfloor)
 258             Perl_re_exec_indentf( aTHX_
 259                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 260                 depth,
 261                 PTR2UV(rex),
 262                 PTR2UV(rex->offs)
 263             );
 264     );
 265     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 266 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 267         SSPUSHIV(rex->offs[p].end);
 268         SSPUSHIV(rex->offs[p].start);
 269         SSPUSHINT(rex->offs[p].start_tmp);
 270         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 271             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 272             depth,
 273             (UV)p,
 274             (IV)rex->offs[p].start,
 275             (IV)rex->offs[p].start_tmp,
 276             (IV)rex->offs[p].end
 277         ));
 278     }
 279 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 280     SSPUSHINT(maxopenparen);
 281     SSPUSHINT(rex->lastparen);
 282     SSPUSHINT(rex->lastcloseparen);
 283     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 284
 285     return retval;
 286 }
 287
 288 /* These are needed since we do not localize EVAL nodes: */
 289 #define REGCP_SET(cp)                                           \
 290     DEBUG_STATE_r(                                              \
 291         Perl_re_exec_indentf( aTHX_                             \
 292             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 293             depth, (IV)PL_savestack_ix                          \
 294         )                                                       \
 295     );                                                          \
 296     cp = PL_savestack_ix
 297
 298 #define REGCP_UNWIND(cp)                                        \
 299     DEBUG_STATE_r(                                              \
 300         if (cp != PL_savestack_ix)                              \
 301             Perl_re_exec_indentf( aTHX_                         \
 302                 "Clearing an EVAL scope, savestack=%"           \
 303                 IVdf "..%" IVdf "\n",                           \
 304                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 305             )                                                   \
 306     );                                                          \
 307     regcpblow(cp)
 308
 309 /* set the start and end positions of capture ix */
 310 #define CLOSE_CAPTURE(ix, s, e)                                            \
 311     rex->offs[ix].start = s;                                               \
 312     rex->offs[ix].end = e;                                                 \
 313     if (ix > rex->lastparen)                                               \
 314         rex->lastparen = ix;                                               \
 315     rex->lastcloseparen = ix;                                              \
 316     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 317         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 318         depth,                                                             \
 319         PTR2UV(rex),                                                       \
 320         PTR2UV(rex->offs),                                                 \
 321         (UV)ix,                                                            \
 322         (IV)rex->offs[ix].start,                                           \
 323         (IV)rex->offs[ix].end,                                             \
 324         (UV)rex->lastparen                                                 \
 325     ))
 326
 327 #define UNWIND_PAREN(lp, lcp)               \
 328     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 329         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 330         depth,                              \
 331         PTR2UV(rex),                        \
 332         PTR2UV(rex->offs),                  \
 333         (UV)(lp),                           \
 334         (UV)(rex->lastparen),               \
 335         (UV)(lcp)                           \
 336     ));                                     \
 337     for (n = rex->lastparen; n > lp; n--)   \
 338         rex->offs[n].end = -1;              \
 339     rex->lastparen = n;                     \
 340     rex->lastcloseparen = lcp;
 341
 342
 343 STATIC void
 344 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 345 {
 346     UV i;
 347     U32 paren;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     PERL_ARGS_ASSERT_REGCPPOP;
 351
 352     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 353     i = SSPOPUV;
 354     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 355     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 356     rex->lastcloseparen = SSPOPINT;
 357     rex->lastparen = SSPOPINT;
 358     *maxopenparen_p = SSPOPINT;
 359
 360     i -= REGCP_OTHER_ELEMS;
 361     /* Now restore the parentheses context. */
 362     DEBUG_BUFFERS_r(
 363         if (i || rex->lastparen + 1 <= rex->nparens)
 364             Perl_re_exec_indentf( aTHX_
 365                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 366                 depth,
 367                 PTR2UV(rex),
 368                 PTR2UV(rex->offs)
 369             );
 370     );
 371     paren = *maxopenparen_p;
 372     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 373         SSize_t tmps;
 374         rex->offs[paren].start_tmp = SSPOPINT;
 375         rex->offs[paren].start = SSPOPIV;
 376         tmps = SSPOPIV;
 377         if (paren <= rex->lastparen)
 378             rex->offs[paren].end = tmps;
 379         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 380             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 381             depth,
 382             (UV)paren,
 383             (IV)rex->offs[paren].start,
 384             (IV)rex->offs[paren].start_tmp,
 385             (IV)rex->offs[paren].end,
 386             (paren > rex->lastparen ? "(skipped)" : ""));
 387         );
 388         paren--;
 389     }
 390 #if 1
 391     /* It would seem that the similar code in regtry()
 392      * already takes care of this, and in fact it is in
 393      * a better location to since this code can #if 0-ed out
 394      * but the code in regtry() is needed or otherwise tests
 395      * requiring null fields (pat.t#187 and split.t#{13,14}
 396      * (as of patchlevel 7877)  will fail.  Then again,
 397      * this code seems to be necessary or otherwise
 398      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 399      * --jhi updated by dapm */
 400     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 401         if (i > *maxopenparen_p)
 402             rex->offs[i].start = -1;
 403         rex->offs[i].end = -1;
 404         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 405             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 406             depth,
 407             (UV)i,
 408             (i > *maxopenparen_p) ? "-1" : "  "
 409         ));
 410     }
 411 #endif
 412 }
 413
 414 /* restore the parens and associated vars at savestack position ix,
 415  * but without popping the stack */
 416
 417 STATIC void
 418 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 419 {
 420     I32 tmpix = PL_savestack_ix;
 421     PERL_ARGS_ASSERT_REGCP_RESTORE;
 422
 423     PL_savestack_ix = ix;
 424     regcppop(rex, maxopenparen_p);
 425     PL_savestack_ix = tmpix;
 426 }
 427
 428 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 429
 430 #ifndef PERL_IN_XSUB_RE
 431
 432 bool
 433 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 434 {
 435     /* Returns a boolean as to whether or not 'character' is a member of the
 436      * Posix character class given by 'classnum' that should be equivalent to a
 437      * value in the typedef '_char_class_number'.
 438      *
 439      * Ideally this could be replaced by a just an array of function pointers
 440      * to the C library functions that implement the macros this calls.
 441      * However, to compile, the precise function signatures are required, and
 442      * these may vary from platform to to platform.  To avoid having to figure
 443      * out what those all are on each platform, I (khw) am using this method,
 444      * which adds an extra layer of function call overhead (unless the C
 445      * optimizer strips it away).  But we don't particularly care about
 446      * performance with locales anyway. */
 447
 448     switch ((_char_class_number) classnum) {
 449         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 450         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 451         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 452         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 453         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 454                                         || isUPPER_LC(character);
 455         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 456         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 457         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 458         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 459         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 460         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 461         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 462         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 463         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 464         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 465         default:    /* VERTSPACE should never occur in locales */
 466             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 467     }
 468
 469     NOT_REACHED; /* NOTREACHED */
 470     return FALSE;
 471 }
 472
 473 #endif
 474
 475 STATIC bool
 476 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 477 {
 478     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 479      * 'character' is a member of the Posix character class given by 'classnum'
 480      * that should be equivalent to a value in the typedef
 481      * '_char_class_number'.
 482      *
 483      * This just calls isFOO_lc on the code point for the character if it is in
 484      * the range 0-255.  Outside that range, all characters use Unicode
 485      * rules, ignoring any locale.  So use the Unicode function if this class
 486      * requires a swash, and use the Unicode macro otherwise. */
 487
 488     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 489
 490     if (UTF8_IS_INVARIANT(*character)) {
 491         return isFOO_lc(classnum, *character);
 492     }
 493     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 494         return isFOO_lc(classnum,
 495                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 496     }
 497
 498     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 499
 500     switch ((_char_class_number) classnum) {
 501         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 502         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 503         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 504         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 505         default:
 506             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 507                                         utf8_to_uvchr_buf(character, e, NULL));
 508     }
 509
 510     return FALSE; /* Things like CNTRL are always below 256 */
 511 }
 512
 513 STATIC char *
 514 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 515 {
 516     /* Returns the position of the first ASCII byte in the sequence between 's'
 517      * and 'send-1' inclusive; returns 'send' if none found */
 518
 519     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 520
 521 #ifndef EBCDIC
 522
 523     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 524
 525                             /* This term is wordsize if subword; 0 if not */
 526                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 527
 528                             /* 'offset' */
 529                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 530     {
 531
 532         /* Process per-byte until reach word boundary.  XXX This loop could be
 533          * eliminated if we knew that this platform had fast unaligned reads */
 534         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 535             if (isASCII(*s)) {
 536                 return s;
 537             }
 538             s++;    /* khw didn't bother creating a separate loop for
 539                        utf8_target */
 540         }
 541
 542         /* Here, we know we have at least one full word to process.  Process
 543          * per-word as long as we have at least a full word left */
 544         do {
 545             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 546             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 547
 548 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 549      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 550
 551                 s += _variant_byte_number(complemented);
 552                 return s;
 553
 554 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 555            checks. */
 556
 557                 break;
 558 #  endif
 559             }
 560
 561             s += PERL_WORDSIZE;
 562
 563         } while (s + PERL_WORDSIZE <= send);
 564     }
 565
 566 #endif
 567
 568     /* Process per-character */
 569     if (utf8_target) {
 570         while (s < send) {
 571             if (isASCII(*s)) {
 572                 return s;
 573             }
 574             s += UTF8SKIP(s);
 575         }
 576     }
 577     else {
 578         while (s < send) {
 579             if (isASCII(*s)) {
 580                 return s;
 581             }
 582             s++;
 583         }
 584     }
 585
 586     return s;
 587 }
 588
 589 STATIC char *
 590 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 591 {
 592     /* Returns the position of the first non-ASCII byte in the sequence between
 593      * 's' and 'send-1' inclusive; returns 'send' if none found */
 594
 595 #ifdef EBCDIC
 596
 597     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 598
 599     if (utf8_target) {
 600         while (s < send) {
 601             if ( ! isASCII(*s)) {
 602                 return s;
 603             }
 604             s += UTF8SKIP(s);
 605         }
 606     }
 607     else {
 608         while (s < send) {
 609             if ( ! isASCII(*s)) {
 610                 return s;
 611             }
 612             s++;
 613         }
 614     }
 615
 616     return s;
 617
 618 #else
 619
 620     const U8 * next_non_ascii = NULL;
 621
 622     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 623     PERL_UNUSED_ARG(utf8_target);
 624
 625     /* On ASCII platforms invariants and ASCII are identical, so if the string
 626      * is entirely invariants, there is no non-ASCII character */
 627     return (is_utf8_invariant_string_loc((U8 *) s,
 628                                          (STRLEN) (send - s),
 629                                          &next_non_ascii))
 630             ? (char *) send
 631             : (char *) next_non_ascii;
 632
 633 #endif
 634
 635 }
 636
 637 STATIC U8 *
 638 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 639 {
 640     /* Returns the position of the first byte in the sequence between 's' and
 641      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 642      * */
 643
 644     PERL_ARGS_ASSERT_FIND_SPAN_END;
 645
 646     assert(send >= s);
 647
 648     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 649                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 650                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 651     {
 652         PERL_UINTMAX_T span_word;
 653
 654         /* Process per-byte until reach word boundary.  XXX This loop could be
 655          * eliminated if we knew that this platform had fast unaligned reads */
 656         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 657             if (*s != span_byte) {
 658                 return s;
 659             }
 660             s++;
 661         }
 662
 663         /* Create a word filled with the bytes we are spanning */
 664         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 665
 666         /* Process per-word as long as we have at least a full word left */
 667         do {
 668
 669             /* Keep going if the whole word is composed of 'span_byte's */
 670             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 671                 s += PERL_WORDSIZE;
 672                 continue;
 673             }
 674
 675             /* Here, at least one byte in the word isn't 'span_byte'. */
 676
 677 #ifdef EBCDIC
 678
 679             break;
 680
 681 #else
 682
 683             /* This xor leaves 1 bits only in those non-matching bytes */
 684             span_word ^= * (PERL_UINTMAX_T *) s;
 685
 686             /* Make sure the upper bit of each non-matching byte is set.  This
 687              * makes each such byte look like an ASCII platform variant byte */
 688             span_word |= span_word << 1;
 689             span_word |= span_word << 2;
 690             span_word |= span_word << 4;
 691
 692             /* That reduces the problem to what this function solves */
 693             return s + _variant_byte_number(span_word);
 694
 695 #endif
 696
 697         } while (s + PERL_WORDSIZE <= send);
 698     }
 699
 700     /* Process the straggler bytes beyond the final word boundary */
 701     while (s < send) {
 702         if (*s != span_byte) {
 703             return s;
 704         }
 705         s++;
 706     }
 707
 708     return s;
 709 }
 710
 711 STATIC U8 *
 712 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 713 {
 714     /* Returns the position of the first byte in the sequence between 's'
 715      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 716      * returns 'send' if none found.  It uses word-level operations instead of
 717      * byte to speed up the process */
 718
 719     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 720
 721     assert(send >= s);
 722     assert((byte & mask) == byte);
 723
 724 #ifndef EBCDIC
 725
 726     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 727                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 728                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 729     {
 730         PERL_UINTMAX_T word, mask_word;
 731
 732         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 733             if (((*s) & mask) == byte) {
 734                 return s;
 735             }
 736             s++;
 737         }
 738
 739         word      = PERL_COUNT_MULTIPLIER * byte;
 740         mask_word = PERL_COUNT_MULTIPLIER * mask;
 741
 742         do {
 743             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 744
 745             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 746              * it, xoring with 'word' will leave each of the 8 bits in such
 747              * bytes be 0, and no byte containing any other bit pattern will be
 748              * 0. */
 749             masked ^= word;
 750
 751             /* This causes the most significant bit to be set to 1 for any
 752              * bytes in the word that aren't completely 0 */
 753             masked |= masked << 1;
 754             masked |= masked << 2;
 755             masked |= masked << 4;
 756
 757             /* The msbits are the same as what marks a byte as variant, so we
 758              * can use this mask.  If all msbits are 1, the word doesn't
 759              * contain 'byte' */
 760             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 761                 s += PERL_WORDSIZE;
 762                 continue;
 763             }
 764
 765             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 766              * and any that are, are 0.  Complement and re-AND to swap that */
 767             masked = ~ masked;
 768             masked &= PERL_VARIANTS_WORD_MASK;
 769
 770             /* This reduces the problem to that solved by this function */
 771             s += _variant_byte_number(masked);
 772             return s;
 773
 774         } while (s + PERL_WORDSIZE <= send);
 775     }
 776
 777 #endif
 778
 779     while (s < send) {
 780         if (((*s) & mask) == byte) {
 781             return s;
 782         }
 783         s++;
 784     }
 785
 786     return s;
 787 }
 788
 789 STATIC U8 *
 790 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 791 {
 792     /* Returns the position of the first byte in the sequence between 's' and
 793      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 794      * 'span_byte' should have been ANDed with 'mask' in the call of this
 795      * function.  Returns 'send' if none found.  Works like find_span_end(),
 796      * except for the AND */
 797
 798     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 799
 800     assert(send >= s);
 801     assert((span_byte & mask) == span_byte);
 802
 803     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 804                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 805                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 806     {
 807         PERL_UINTMAX_T span_word, mask_word;
 808
 809         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 810             if (((*s) & mask) != span_byte) {
 811                 return s;
 812             }
 813             s++;
 814         }
 815
 816         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 817         mask_word = PERL_COUNT_MULTIPLIER * mask;
 818
 819         do {
 820             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 821
 822             if (masked == span_word) {
 823                 s += PERL_WORDSIZE;
 824                 continue;
 825             }
 826
 827 #ifdef EBCDIC
 828
 829             break;
 830
 831 #else
 832
 833             masked ^= span_word;
 834             masked |= masked << 1;
 835             masked |= masked << 2;
 836             masked |= masked << 4;
 837             return s + _variant_byte_number(masked);
 838
 839 #endif
 840
 841         } while (s + PERL_WORDSIZE <= send);
 842     }
 843
 844     while (s < send) {
 845         if (((*s) & mask) != span_byte) {
 846             return s;
 847         }
 848         s++;
 849     }
 850
 851     return s;
 852 }
 853
 854 /*
 855  * pregexec and friends
 856  */
 857
 858 #ifndef PERL_IN_XSUB_RE
 859 /*
 860  - pregexec - match a regexp against a string
 861  */
 862 I32
 863 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 864          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 865 /* stringarg: the point in the string at which to begin matching */
 866 /* strend:    pointer to null at end of string */
 867 /* strbeg:    real beginning of string */
 868 /* minend:    end of match must be >= minend bytes after stringarg. */
 869 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 870  *            itself is accessed via the pointers above */
 871 /* nosave:    For optimizations. */
 872 {
 873     PERL_ARGS_ASSERT_PREGEXEC;
 874
 875     return
 876         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 877                       nosave ? 0 : REXEC_COPY_STR);
 878 }
 879 #endif
 880
 881
 882
 883 /* re_intuit_start():
 884  *
 885  * Based on some optimiser hints, try to find the earliest position in the
 886  * string where the regex could match.
 887  *
 888  *   rx:     the regex to match against
 889  *   sv:     the SV being matched: only used for utf8 flag; the string
 890  *           itself is accessed via the pointers below. Note that on
 891  *           something like an overloaded SV, SvPOK(sv) may be false
 892  *           and the string pointers may point to something unrelated to
 893  *           the SV itself.
 894  *   strbeg: real beginning of string
 895  *   strpos: the point in the string at which to begin matching
 896  *   strend: pointer to the byte following the last char of the string
 897  *   flags   currently unused; set to 0
 898  *   data:   currently unused; set to NULL
 899  *
 900  * The basic idea of re_intuit_start() is to use some known information
 901  * about the pattern, namely:
 902  *
 903  *   a) the longest known anchored substring (i.e. one that's at a
 904  *      constant offset from the beginning of the pattern; but not
 905  *      necessarily at a fixed offset from the beginning of the
 906  *      string);
 907  *   b) the longest floating substring (i.e. one that's not at a constant
 908  *      offset from the beginning of the pattern);
 909  *   c) Whether the pattern is anchored to the string; either
 910  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 911  *      or anchored to pos(): /\G/;
 912  *   d) A start class: a real or synthetic character class which
 913  *      represents which characters are legal at the start of the pattern;
 914  *
 915  * to either quickly reject the match, or to find the earliest position
 916  * within the string at which the pattern might match, thus avoiding
 917  * running the full NFA engine at those earlier locations, only to
 918  * eventually fail and retry further along.
 919  *
 920  * Returns NULL if the pattern can't match, or returns the address within
 921  * the string which is the earliest place the match could occur.
 922  *
 923  * The longest of the anchored and floating substrings is called 'check'
 924  * and is checked first. The other is called 'other' and is checked
 925  * second. The 'other' substring may not be present.  For example,
 926  *
 927  *    /(abc|xyz)ABC\d{0,3}DEFG/
 928  *
 929  * will have
 930  *
 931  *   check substr (float)    = "DEFG", offset 6..9 chars
 932  *   other substr (anchored) = "ABC",  offset 3..3 chars
 933  *   stclass = [ax]
 934  *
 935  * Be aware that during the course of this function, sometimes 'anchored'
 936  * refers to a substring being anchored relative to the start of the
 937  * pattern, and sometimes to the pattern itself being anchored relative to
 938  * the string. For example:
 939  *
 940  *   /\dabc/:   "abc" is anchored to the pattern;
 941  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 942  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 943  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 944  *                    but the pattern is anchored to the string.
 945  */
 946
 947 char *
 948 Perl_re_intuit_start(pTHX_
 949                     REGEXP * const rx,
 950                     SV *sv,
 951                     const char * const strbeg,
 952                     char *strpos,
 953                     char *strend,
 954                     const U32 flags,
 955                     re_scream_pos_data *data)
 956 {
 957     struct regexp *const prog = ReANY(rx);
 958     SSize_t start_shift = prog->check_offset_min;
 959     /* Should be nonnegative! */
 960     SSize_t end_shift   = 0;
 961     /* current lowest pos in string where the regex can start matching */
 962     char *rx_origin = strpos;
 963     SV *check;
 964     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 965     U8   other_ix = 1 - prog->substrs->check_ix;
 966     bool ml_anch = 0;
 967     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 968     char *check_at = NULL;              /* check substr found at this pos */
 969     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 970     RXi_GET_DECL(prog,progi);
 971     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 972     regmatch_info *const reginfo = &reginfo_buf;
 973     GET_RE_DEBUG_FLAGS_DECL;
 974
 975     PERL_ARGS_ASSERT_RE_INTUIT_START;
 976     PERL_UNUSED_ARG(flags);
 977     PERL_UNUSED_ARG(data);
 978
 979     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 980                 "Intuit: trying to determine minimum start position...\n"));
 981
 982     /* for now, assume that all substr offsets are positive. If at some point
 983      * in the future someone wants to do clever things with lookbehind and
 984      * -ve offsets, they'll need to fix up any code in this function
 985      * which uses these offsets. See the thread beginning
 986      * <20140113145929.GF27210@iabyn.com>
 987      */
 988     assert(prog->substrs->data[0].min_offset >= 0);
 989     assert(prog->substrs->data[0].max_offset >= 0);
 990     assert(prog->substrs->data[1].min_offset >= 0);
 991     assert(prog->substrs->data[1].max_offset >= 0);
 992     assert(prog->substrs->data[2].min_offset >= 0);
 993     assert(prog->substrs->data[2].max_offset >= 0);
 994
 995     /* for now, assume that if both present, that the floating substring
 996      * doesn't start before the anchored substring.
 997      * If you break this assumption (e.g. doing better optimisations
 998      * with lookahead/behind), then you'll need to audit the code in this
 999      * function carefully first
1000      */
1001     assert(
1002             ! (  (prog->anchored_utf8 || prog->anchored_substr)
1003               && (prog->float_utf8    || prog->float_substr))
1004            || (prog->float_min_offset >= prog->anchored_offset));
1005
1006     /* byte rather than char calculation for efficiency. It fails
1007      * to quickly reject some cases that can't match, but will reject
1008      * them later after doing full char arithmetic */
1009     if (prog->minlen > strend - strpos) {
1010         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1011                               "  String too short...\n"));
1012         goto fail;
1013     }
1014
1015     RXp_MATCH_UTF8_set(prog, utf8_target);
1016     reginfo->is_utf8_target = cBOOL(utf8_target);
1017     reginfo->info_aux = NULL;
1018     reginfo->strbeg = strbeg;
1019     reginfo->strend = strend;
1020     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1021     reginfo->intuit = 1;
1022     /* not actually used within intuit, but zero for safety anyway */
1023     reginfo->poscache_maxiter = 0;
1024
1025     if (utf8_target) {
1026         if ((!prog->anchored_utf8 && prog->anchored_substr)
1027                 || (!prog->float_utf8 && prog->float_substr))
1028             to_utf8_substr(prog);
1029         check = prog->check_utf8;
1030     } else {
1031         if (!prog->check_substr && prog->check_utf8) {
1032             if (! to_byte_substr(prog)) {
1033                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1034             }
1035         }
1036         check = prog->check_substr;
1037     }
1038
1039     /* dump the various substring data */
1040     DEBUG_OPTIMISE_MORE_r({
1041         int i;
1042         for (i=0; i<=2; i++) {
1043             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1044                                   : prog->substrs->data[i].substr);
1045             if (!sv)
1046                 continue;
1047
1048             Perl_re_printf( aTHX_
1049                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1050                 " useful=%" IVdf " utf8=%d [%s]\n",
1051                 i,
1052                 (IV)prog->substrs->data[i].min_offset,
1053                 (IV)prog->substrs->data[i].max_offset,
1054                 (IV)prog->substrs->data[i].end_shift,
1055                 BmUSEFUL(sv),
1056                 utf8_target ? 1 : 0,
1057                 SvPEEK(sv));
1058         }
1059     });
1060
1061     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1062
1063         /* ml_anch: check after \n?
1064          *
1065          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1066          * with /.*.../, these flags will have been added by the
1067          * compiler:
1068          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1069          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1070          */
1071         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1072                    && !(prog->intflags & PREGf_IMPLICIT);
1073
1074         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1075             /* we are only allowed to match at BOS or \G */
1076
1077             /* trivially reject if there's a BOS anchor and we're not at BOS.
1078              *
1079              * Note that we don't try to do a similar quick reject for
1080              * \G, since generally the caller will have calculated strpos
1081              * based on pos() and gofs, so the string is already correctly
1082              * anchored by definition; and handling the exceptions would
1083              * be too fiddly (e.g. REXEC_IGNOREPOS).
1084              */
1085             if (   strpos != strbeg
1086                 && (prog->intflags & PREGf_ANCH_SBOL))
1087             {
1088                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1089                                 "  Not at start...\n"));
1090                 goto fail;
1091             }
1092
1093             /* in the presence of an anchor, the anchored (relative to the
1094              * start of the regex) substr must also be anchored relative
1095              * to strpos. So quickly reject if substr isn't found there.
1096              * This works for \G too, because the caller will already have
1097              * subtracted gofs from pos, and gofs is the offset from the
1098              * \G to the start of the regex. For example, in /.abc\Gdef/,
1099              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1100              * caller will have set strpos=pos()-4; we look for the substr
1101              * at position pos()-4+1, which lines up with the "a" */
1102
1103             if (prog->check_offset_min == prog->check_offset_max) {
1104                 /* Substring at constant offset from beg-of-str... */
1105                 SSize_t slen = SvCUR(check);
1106                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1107
1108                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1109                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1110                     (IV)prog->check_offset_min));
1111
1112                 if (SvTAIL(check)) {
1113                     /* In this case, the regex is anchored at the end too.
1114                      * Unless it's a multiline match, the lengths must match
1115                      * exactly, give or take a \n.  NB: slen >= 1 since
1116                      * the last char of check is \n */
1117                     if (!multiline
1118                         && (   strend - s > slen
1119                             || strend - s < slen - 1
1120                             || (strend - s == slen && strend[-1] != '\n')))
1121                     {
1122                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1123                                             "  String too long...\n"));
1124                         goto fail_finish;
1125                     }
1126                     /* Now should match s[0..slen-2] */
1127                     slen--;
1128                 }
1129                 if (slen && (strend - s < slen
1130                     || *SvPVX_const(check) != *s
1131                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1132                 {
1133                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1134                                     "  String not equal...\n"));
1135                     goto fail_finish;
1136                 }
1137
1138                 check_at = s;
1139                 goto success_at_start;
1140             }
1141         }
1142     }
1143
1144     end_shift = prog->check_end_shift;
1145
1146 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1147     if (end_shift < 0)
1148         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1149                    (IV)end_shift, RX_PRECOMP(rx));
1150 #endif
1151
1152   restart:
1153
1154     /* This is the (re)entry point of the main loop in this function.
1155      * The goal of this loop is to:
1156      * 1) find the "check" substring in the region rx_origin..strend
1157      *    (adjusted by start_shift / end_shift). If not found, reject
1158      *    immediately.
1159      * 2) If it exists, look for the "other" substr too if defined; for
1160      *    example, if the check substr maps to the anchored substr, then
1161      *    check the floating substr, and vice-versa. If not found, go
1162      *    back to (1) with rx_origin suitably incremented.
1163      * 3) If we find an rx_origin position that doesn't contradict
1164      *    either of the substrings, then check the possible additional
1165      *    constraints on rx_origin of /^.../m or a known start class.
1166      *    If these fail, then depending on which constraints fail, jump
1167      *    back to here, or to various other re-entry points further along
1168      *    that skip some of the first steps.
1169      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1170      *    substring. If the start position was determined to be at the
1171      *    beginning of the string  - so, not rejected, but not optimised,
1172      *    since we have to run regmatch from position 0 - decrement the
1173      *    BmUSEFUL() count. Otherwise increment it.
1174      */
1175
1176
1177     /* first, look for the 'check' substring */
1178
1179     {
1180         U8* start_point;
1181         U8* end_point;
1182
1183         DEBUG_OPTIMISE_MORE_r({
1184             Perl_re_printf( aTHX_
1185                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1186                 " Start shift: %" IVdf " End shift %" IVdf
1187                 " Real end Shift: %" IVdf "\n",
1188                 (IV)(rx_origin - strbeg),
1189                 (IV)prog->check_offset_min,
1190                 (IV)start_shift,
1191                 (IV)end_shift,
1192                 (IV)prog->check_end_shift);
1193         });
1194
1195         end_point = HOPBACK3(strend, end_shift, rx_origin);
1196         if (!end_point)
1197             goto fail_finish;
1198         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1199         if (!start_point)
1200             goto fail_finish;
1201
1202
1203         /* If the regex is absolutely anchored to either the start of the
1204          * string (SBOL) or to pos() (ANCH_GPOS), then
1205          * check_offset_max represents an upper bound on the string where
1206          * the substr could start. For the ANCH_GPOS case, we assume that
1207          * the caller of intuit will have already set strpos to
1208          * pos()-gofs, so in this case strpos + offset_max will still be
1209          * an upper bound on the substr.
1210          */
1211         if (!ml_anch
1212             && prog->intflags & PREGf_ANCH
1213             && prog->check_offset_max != SSize_t_MAX)
1214         {
1215             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1216             const char * const anchor =
1217                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1218             SSize_t targ_len = (char*)end_point - anchor;
1219
1220             if (check_len > targ_len) {
1221                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1222                               "Target string too short to match required substring...\n"));
1223                 goto fail_finish;
1224             }
1225
1226             /* do a bytes rather than chars comparison. It's conservative;
1227              * so it skips doing the HOP if the result can't possibly end
1228              * up earlier than the old value of end_point.
1229              */
1230             assert(anchor + check_len <= (char *)end_point);
1231             if (prog->check_offset_max + check_len < targ_len) {
1232                 end_point = HOP3lim((U8*)anchor,
1233                                 prog->check_offset_max,
1234                                 end_point - check_len
1235                             )
1236                             + check_len;
1237                 if (end_point < start_point)
1238                     goto fail_finish;
1239             }
1240         }
1241
1242         check_at = fbm_instr( start_point, end_point,
1243                       check, multiline ? FBMrf_MULTILINE : 0);
1244
1245         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1246             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1247             (IV)((char*)start_point - strbeg),
1248             (IV)((char*)end_point   - strbeg),
1249             (IV)(check_at ? check_at - strbeg : -1)
1250         ));
1251
1252         /* Update the count-of-usability, remove useless subpatterns,
1253             unshift s.  */
1254
1255         DEBUG_EXECUTE_r({
1256             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1257                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1258             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1259                               (check_at ? "Found" : "Did not find"),
1260                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1261                     ? "anchored" : "floating"),
1262                 quoted,
1263                 RE_SV_TAIL(check),
1264                 (check_at ? " at offset " : "...\n") );
1265         });
1266
1267         if (!check_at)
1268             goto fail_finish;
1269         /* set rx_origin to the minimum position where the regex could start
1270          * matching, given the constraint of the just-matched check substring.
1271          * But don't set it lower than previously.
1272          */
1273
1274         if (check_at - rx_origin > prog->check_offset_max)
1275             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1276         /* Finish the diagnostic message */
1277         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1278             "%ld (rx_origin now %" IVdf ")...\n",
1279             (long)(check_at - strbeg),
1280             (IV)(rx_origin - strbeg)
1281         ));
1282     }
1283
1284
1285     /* now look for the 'other' substring if defined */
1286
1287     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1288                     : prog->substrs->data[other_ix].substr)
1289     {
1290         /* Take into account the "other" substring. */
1291         char *last, *last1;
1292         char *s;
1293         SV* must;
1294         struct reg_substr_datum *other;
1295
1296       do_other_substr:
1297         other = &prog->substrs->data[other_ix];
1298
1299         /* if "other" is anchored:
1300          * we've previously found a floating substr starting at check_at.
1301          * This means that the regex origin must lie somewhere
1302          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1303          * and max:                 HOP3(check_at, -check_offset_min)
1304          * (except that min will be >= strpos)
1305          * So the fixed  substr must lie somewhere between
1306          *  HOP3(min, anchored_offset)
1307          *  HOP3(max, anchored_offset) + SvCUR(substr)
1308          */
1309
1310         /* if "other" is floating
1311          * Calculate last1, the absolute latest point where the
1312          * floating substr could start in the string, ignoring any
1313          * constraints from the earlier fixed match. It is calculated
1314          * as follows:
1315          *
1316          * strend - prog->minlen (in chars) is the absolute latest
1317          * position within the string where the origin of the regex
1318          * could appear. The latest start point for the floating
1319          * substr is float_min_offset(*) on from the start of the
1320          * regex.  last1 simply combines thee two offsets.
1321          *
1322          * (*) You might think the latest start point should be
1323          * float_max_offset from the regex origin, and technically
1324          * you'd be correct. However, consider
1325          *    /a\d{2,4}bcd\w/
1326          * Here, float min, max are 3,5 and minlen is 7.
1327          * This can match either
1328          *    /a\d\dbcd\w/
1329          *    /a\d\d\dbcd\w/
1330          *    /a\d\d\d\dbcd\w/
1331          * In the first case, the regex matches minlen chars; in the
1332          * second, minlen+1, in the third, minlen+2.
1333          * In the first case, the floating offset is 3 (which equals
1334          * float_min), in the second, 4, and in the third, 5 (which
1335          * equals float_max). In all cases, the floating string bcd
1336          * can never start more than 4 chars from the end of the
1337          * string, which equals minlen - float_min. As the substring
1338          * starts to match more than float_min from the start of the
1339          * regex, it makes the regex match more than minlen chars,
1340          * and the two cancel each other out. So we can always use
1341          * float_min - minlen, rather than float_max - minlen for the
1342          * latest position in the string.
1343          *
1344          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1345          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1346          */
1347
1348         assert(prog->minlen >= other->min_offset);
1349         last1 = HOP3c(strend,
1350                         other->min_offset - prog->minlen, strbeg);
1351
1352         if (other_ix) {/* i.e. if (other-is-float) */
1353             /* last is the latest point where the floating substr could
1354              * start, *given* any constraints from the earlier fixed
1355              * match. This constraint is that the floating string starts
1356              * <= float_max_offset chars from the regex origin (rx_origin).
1357              * If this value is less than last1, use it instead.
1358              */
1359             assert(rx_origin <= last1);
1360             last =
1361                 /* this condition handles the offset==infinity case, and
1362                  * is a short-cut otherwise. Although it's comparing a
1363                  * byte offset to a char length, it does so in a safe way,
1364                  * since 1 char always occupies 1 or more bytes,
1365                  * so if a string range is  (last1 - rx_origin) bytes,
1366                  * it will be less than or equal to  (last1 - rx_origin)
1367                  * chars; meaning it errs towards doing the accurate HOP3
1368                  * rather than just using last1 as a short-cut */
1369                 (last1 - rx_origin) < other->max_offset
1370                     ? last1
1371                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1372         }
1373         else {
1374             assert(strpos + start_shift <= check_at);
1375             last = HOP4c(check_at, other->min_offset - start_shift,
1376                         strbeg, strend);
1377         }
1378
1379         s = HOP3c(rx_origin, other->min_offset, strend);
1380         if (s < other_last)     /* These positions already checked */
1381             s = other_last;
1382
1383         must = utf8_target ? other->utf8_substr : other->substr;
1384         assert(SvPOK(must));
1385         {
1386             char *from = s;
1387             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1388
1389             if (to > strend)
1390                 to = strend;
1391             if (from > to) {
1392                 s = NULL;
1393                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1394                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1395                     (IV)(from - strbeg),
1396                     (IV)(to   - strbeg)
1397                 ));
1398             }
1399             else {
1400                 s = fbm_instr(
1401                     (unsigned char*)from,
1402                     (unsigned char*)to,
1403                     must,
1404                     multiline ? FBMrf_MULTILINE : 0
1405                 );
1406                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1407                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1408                     (IV)(from - strbeg),
1409                     (IV)(to   - strbeg),
1410                     (IV)(s ? s - strbeg : -1)
1411                 ));
1412             }
1413         }
1414
1415         DEBUG_EXECUTE_r({
1416             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1417                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1418             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1419                 s ? "Found" : "Contradicts",
1420                 other_ix ? "floating" : "anchored",
1421                 quoted, RE_SV_TAIL(must));
1422         });
1423
1424
1425         if (!s) {
1426             /* last1 is latest possible substr location. If we didn't
1427              * find it before there, we never will */
1428             if (last >= last1) {
1429                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1430                                         "; giving up...\n"));
1431                 goto fail_finish;
1432             }
1433
1434             /* try to find the check substr again at a later
1435              * position. Maybe next time we'll find the "other" substr
1436              * in range too */
1437             other_last = HOP3c(last, 1, strend) /* highest failure */;
1438             rx_origin =
1439                 other_ix /* i.e. if other-is-float */
1440                     ? HOP3c(rx_origin, 1, strend)
1441                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1442             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1443                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1444                 (other_ix ? "floating" : "anchored"),
1445                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1446                 (IV)(rx_origin - strbeg)
1447             ));
1448             goto restart;
1449         }
1450         else {
1451             if (other_ix) { /* if (other-is-float) */
1452                 /* other_last is set to s, not s+1, since its possible for
1453                  * a floating substr to fail first time, then succeed
1454                  * second time at the same floating position; e.g.:
1455                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1456                  * The first time round, anchored and float match at
1457                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1458                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1459                  */
1460                 other_last = s;
1461             }
1462             else {
1463                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1464                 other_last = HOP3c(s, 1, strend);
1465             }
1466             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1467                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1468                   (long)(s - strbeg),
1469                 (IV)(rx_origin - strbeg)
1470               ));
1471
1472         }
1473     }
1474     else {
1475         DEBUG_OPTIMISE_MORE_r(
1476             Perl_re_printf( aTHX_
1477                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1478                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1479                 " strend:%" IVdf "\n",
1480                 (IV)prog->check_offset_min,
1481                 (IV)prog->check_offset_max,
1482                 (IV)(check_at-strbeg),
1483                 (IV)(rx_origin-strbeg),
1484                 (IV)(rx_origin-check_at),
1485                 (IV)(strend-strbeg)
1486             )
1487         );
1488     }
1489
1490   postprocess_substr_matches:
1491
1492     /* handle the extra constraint of /^.../m if present */
1493
1494     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1495         char *s;
1496
1497         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1498                         "  looking for /^/m anchor"));
1499
1500         /* we have failed the constraint of a \n before rx_origin.
1501          * Find the next \n, if any, even if it's beyond the current
1502          * anchored and/or floating substrings. Whether we should be
1503          * scanning ahead for the next \n or the next substr is debatable.
1504          * On the one hand you'd expect rare substrings to appear less
1505          * often than \n's. On the other hand, searching for \n means
1506          * we're effectively flipping between check_substr and "\n" on each
1507          * iteration as the current "rarest" string candidate, which
1508          * means for example that we'll quickly reject the whole string if
1509          * hasn't got a \n, rather than trying every substr position
1510          * first
1511          */
1512
1513         s = HOP3c(strend, - prog->minlen, strpos);
1514         if (s <= rx_origin ||
1515             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1516         {
1517             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1518                             "  Did not find /%s^%s/m...\n",
1519                             PL_colors[0], PL_colors[1]));
1520             goto fail_finish;
1521         }
1522
1523         /* earliest possible origin is 1 char after the \n.
1524          * (since *rx_origin == '\n', it's safe to ++ here rather than
1525          * HOP(rx_origin, 1)) */
1526         rx_origin++;
1527
1528         if (prog->substrs->check_ix == 0  /* check is anchored */
1529             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1530         {
1531             /* Position contradicts check-string; either because
1532              * check was anchored (and thus has no wiggle room),
1533              * or check was float and rx_origin is above the float range */
1534             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1535                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1536                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1537             goto restart;
1538         }
1539
1540         /* if we get here, the check substr must have been float,
1541          * is in range, and we may or may not have had an anchored
1542          * "other" substr which still contradicts */
1543         assert(prog->substrs->check_ix); /* check is float */
1544
1545         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1546             /* whoops, the anchored "other" substr exists, so we still
1547              * contradict. On the other hand, the float "check" substr
1548              * didn't contradict, so just retry the anchored "other"
1549              * substr */
1550             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1551                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1552                 PL_colors[0], PL_colors[1],
1553                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1554                 (IV)(rx_origin - strbeg)
1555             ));
1556             goto do_other_substr;
1557         }
1558
1559         /* success: we don't contradict the found floating substring
1560          * (and there's no anchored substr). */
1561         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1562             "  Found /%s^%s/m with rx_origin %ld...\n",
1563             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1564     }
1565     else {
1566         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1567             "  (multiline anchor test skipped)\n"));
1568     }
1569
1570   success_at_start:
1571
1572
1573     /* if we have a starting character class, then test that extra constraint.
1574      * (trie stclasses are too expensive to use here, we are better off to
1575      * leave it to regmatch itself) */
1576
1577     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1578         const U8* const str = (U8*)STRING(progi->regstclass);
1579
1580         /* XXX this value could be pre-computed */
1581         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1582                     ?  (reginfo->is_utf8_pat
1583                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1584                         : STR_LEN(progi->regstclass))
1585                     : 1);
1586         char * endpos;
1587         char *s;
1588         /* latest pos that a matching float substr constrains rx start to */
1589         char *rx_max_float = NULL;
1590
1591         /* if the current rx_origin is anchored, either by satisfying an
1592          * anchored substring constraint, or a /^.../m constraint, then we
1593          * can reject the current origin if the start class isn't found
1594          * at the current position. If we have a float-only match, then
1595          * rx_origin is constrained to a range; so look for the start class
1596          * in that range. if neither, then look for the start class in the
1597          * whole rest of the string */
1598
1599         /* XXX DAPM it's not clear what the minlen test is for, and why
1600          * it's not used in the floating case. Nothing in the test suite
1601          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1602          * Here are some old comments, which may or may not be correct:
1603          *
1604          *   minlen == 0 is possible if regstclass is \b or \B,
1605          *   and the fixed substr is ''$.
1606          *   Since minlen is already taken into account, rx_origin+1 is
1607          *   before strend; accidentally, minlen >= 1 guaranties no false
1608          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1609          *   0) below assumes that regstclass does not come from lookahead...
1610          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1611          *   This leaves EXACTF-ish only, which are dealt with in
1612          *   find_byclass().
1613          */
1614
1615         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1616             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1617         else if (prog->float_substr || prog->float_utf8) {
1618             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1619             endpos = HOP3clim(rx_max_float, cl_l, strend);
1620         }
1621         else
1622             endpos= strend;
1623
1624         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1626             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1627               (IV)start_shift, (IV)(check_at - strbeg),
1628               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1629
1630         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1631                             reginfo);
1632         if (!s) {
1633             if (endpos == strend) {
1634                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1635                                 "  Could not match STCLASS...\n") );
1636                 goto fail;
1637             }
1638             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1639                                "  This position contradicts STCLASS...\n") );
1640             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1641                         && !(prog->intflags & PREGf_IMPLICIT))
1642                 goto fail;
1643
1644             /* Contradict one of substrings */
1645             if (prog->anchored_substr || prog->anchored_utf8) {
1646                 if (prog->substrs->check_ix == 1) { /* check is float */
1647                     /* Have both, check_string is floating */
1648                     assert(rx_origin + start_shift <= check_at);
1649                     if (rx_origin + start_shift != check_at) {
1650                         /* not at latest position float substr could match:
1651                          * Recheck anchored substring, but not floating.
1652                          * The condition above is in bytes rather than
1653                          * chars for efficiency. It's conservative, in
1654                          * that it errs on the side of doing 'goto
1655                          * do_other_substr'. In this case, at worst,
1656                          * an extra anchored search may get done, but in
1657                          * practice the extra fbm_instr() is likely to
1658                          * get skipped anyway. */
1659                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1660                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1661                             (long)(other_last - strbeg),
1662                             (IV)(rx_origin - strbeg)
1663                         ));
1664                         goto do_other_substr;
1665                     }
1666                 }
1667             }
1668             else {
1669                 /* float-only */
1670
1671                 if (ml_anch) {
1672                     /* In the presence of ml_anch, we might be able to
1673                      * find another \n without breaking the current float
1674                      * constraint. */
1675
1676                     /* strictly speaking this should be HOP3c(..., 1, ...),
1677                      * but since we goto a block of code that's going to
1678                      * search for the next \n if any, its safe here */
1679                     rx_origin++;
1680                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1681                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1682                               PL_colors[0], PL_colors[1],
1683                               (long)(rx_origin - strbeg)) );
1684                     goto postprocess_substr_matches;
1685                 }
1686
1687                 /* strictly speaking this can never be true; but might
1688                  * be if we ever allow intuit without substrings */
1689                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1690                     goto fail;
1691
1692                 rx_origin = rx_max_float;
1693             }
1694
1695             /* at this point, any matching substrings have been
1696              * contradicted. Start again... */
1697
1698             rx_origin = HOP3c(rx_origin, 1, strend);
1699
1700             /* uses bytes rather than char calculations for efficiency.
1701              * It's conservative: it errs on the side of doing 'goto restart',
1702              * where there is code that does a proper char-based test */
1703             if (rx_origin + start_shift + end_shift > strend) {
1704                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1705                                        "  Could not match STCLASS...\n") );
1706                 goto fail;
1707             }
1708             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1709                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1710                 (prog->substrs->check_ix ? "floating" : "anchored"),
1711                 (long)(rx_origin + start_shift - strbeg),
1712                 (IV)(rx_origin - strbeg)
1713             ));
1714             goto restart;
1715         }
1716
1717         /* Success !!! */
1718
1719         if (rx_origin != s) {
1720             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1721                         "  By STCLASS: moving %ld --> %ld\n",
1722                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1723                    );
1724         }
1725         else {
1726             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1727                                   "  Does not contradict STCLASS...\n");
1728                    );
1729         }
1730     }
1731
1732     /* Decide whether using the substrings helped */
1733
1734     if (rx_origin != strpos) {
1735         /* Fixed substring is found far enough so that the match
1736            cannot start at strpos. */
1737
1738         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1739         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1740     }
1741     else {
1742         /* The found rx_origin position does not prohibit matching at
1743          * strpos, so calling intuit didn't gain us anything. Decrement
1744          * the BmUSEFUL() count on the check substring, and if we reach
1745          * zero, free it.  */
1746         if (!(prog->intflags & PREGf_NAUGHTY)
1747             && (utf8_target ? (
1748                 prog->check_utf8                /* Could be deleted already */
1749                 && --BmUSEFUL(prog->check_utf8) < 0
1750                 && (prog->check_utf8 == prog->float_utf8)
1751             ) : (
1752                 prog->check_substr              /* Could be deleted already */
1753                 && --BmUSEFUL(prog->check_substr) < 0
1754                 && (prog->check_substr == prog->float_substr)
1755             )))
1756         {
1757             /* If flags & SOMETHING - do not do it many times on the same match */
1758             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1759             /* XXX Does the destruction order has to change with utf8_target? */
1760             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1761             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1762             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1763             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1764             check = NULL;                       /* abort */
1765             /* XXXX This is a remnant of the old implementation.  It
1766                     looks wasteful, since now INTUIT can use many
1767                     other heuristics. */
1768             prog->extflags &= ~RXf_USE_INTUIT;
1769         }
1770     }
1771
1772     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1773             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1774              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1775
1776     return rx_origin;
1777
1778   fail_finish:                          /* Substring not found */
1779     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1780         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1781   fail:
1782     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1783                           PL_colors[4], PL_colors[5]));
1784     return NULL;
1785 }
1786
1787
1788 #define DECL_TRIE_TYPE(scan) \
1789     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1790                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1791                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1792                     trie_type = ((scan->flags == EXACT)                             \
1793                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1794                                  : (scan->flags == EXACTL)                          \
1795                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1796                                     : (scan->flags == EXACTFAA)                     \
1797                                       ? (utf8_target                                \
1798                                          ? trie_utf8_exactfa_fold                   \
1799                                          : trie_latin_utf8_exactfa_fold)            \
1800                                       : (scan->flags == EXACTFLU8                   \
1801                                          ? (utf8_target                             \
1802                                            ? trie_flu8                              \
1803                                            : trie_flu8_latin)                       \
1804                                          : (utf8_target                             \
1805                                            ? trie_utf8_fold                         \
1806                                            : trie_latin_utf8_fold)))
1807
1808 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1809  * 'foldbuf+sizeof(foldbuf)' */
1810 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1811 STMT_START {                                                                        \
1812     STRLEN skiplen;                                                                 \
1813     U8 flags = FOLD_FLAGS_FULL;                                                     \
1814     switch (trie_type) {                                                            \
1815     case trie_flu8:                                                                 \
1816         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1817         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1818             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1819         }                                                                           \
1820         goto do_trie_utf8_fold;                                                     \
1821     case trie_utf8_exactfa_fold:                                                    \
1822         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1823         /* FALLTHROUGH */                                                           \
1824     case trie_utf8_fold:                                                            \
1825       do_trie_utf8_fold:                                                            \
1826         if ( foldlen>0 ) {                                                          \
1827             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1828             foldlen -= len;                                                         \
1829             uscan += len;                                                           \
1830             len=0;                                                                  \
1831         } else {                                                                    \
1832             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1833                                                                             flags); \
1834             len = UTF8SKIP(uc);                                                     \
1835             skiplen = UVCHR_SKIP( uvc );                                            \
1836             foldlen -= skiplen;                                                     \
1837             uscan = foldbuf + skiplen;                                              \
1838         }                                                                           \
1839         break;                                                                      \
1840     case trie_flu8_latin:                                                           \
1841         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1842         goto do_trie_latin_utf8_fold;                                               \
1843     case trie_latin_utf8_exactfa_fold:                                              \
1844         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1845         /* FALLTHROUGH */                                                           \
1846     case trie_latin_utf8_fold:                                                      \
1847       do_trie_latin_utf8_fold:                                                      \
1848         if ( foldlen>0 ) {                                                          \
1849             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1850             foldlen -= len;                                                         \
1851             uscan += len;                                                           \
1852             len=0;                                                                  \
1853         } else {                                                                    \
1854             len = 1;                                                                \
1855             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1856             skiplen = UVCHR_SKIP( uvc );                                            \
1857             foldlen -= skiplen;                                                     \
1858             uscan = foldbuf + skiplen;                                              \
1859         }                                                                           \
1860         break;                                                                      \
1861     case trie_utf8l:                                                                \
1862         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1863         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1864             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1865         }                                                                           \
1866         /* FALLTHROUGH */                                                           \
1867     case trie_utf8:                                                                 \
1868         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1869         break;                                                                      \
1870     case trie_plain:                                                                \
1871         uvc = (UV)*uc;                                                              \
1872         len = 1;                                                                    \
1873     }                                                                               \
1874     if (uvc < 256) {                                                                \
1875         charid = trie->charmap[ uvc ];                                              \
1876     }                                                                               \
1877     else {                                                                          \
1878         charid = 0;                                                                 \
1879         if (widecharmap) {                                                          \
1880             SV** const svpp = hv_fetch(widecharmap,                                 \
1881                         (char*)&uvc, sizeof(UV), 0);                                \
1882             if (svpp)                                                               \
1883                 charid = (U16)SvIV(*svpp);                                          \
1884         }                                                                           \
1885     }                                                                               \
1886 } STMT_END
1887
1888 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1889     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1890                 startpos, doutf8, depth)
1891
1892 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1893     STMT_START {                                            \
1894         while (s < strend) {                                \
1895             CODE                                            \
1896             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1897         }                                                   \
1898     } STMT_END
1899
1900 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1901     STMT_START {                                            \
1902         while (s < strend) {                                \
1903             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1904         }                                                   \
1905     } STMT_END
1906
1907 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1908     if (COND) {                                                \
1909         FBC_CHECK_AND_TRY                                      \
1910         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1911         previous_occurrence_end = s;                           \
1912     }                                                          \
1913     else {                                                     \
1914         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1915     }
1916
1917 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1918     if (utf8_target) {                                         \
1919         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1920     }                                                          \
1921     else {                                                     \
1922         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1923     }
1924
1925 /* We keep track of where the next character should start after an occurrence
1926  * of the one we're looking for.  Knowing that, we can see right away if the
1927  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1928  * don't accept the 2nd and succeeding adjacent occurrences */
1929 #define FBC_CHECK_AND_TRY                                      \
1930         if (   (   doevery                                     \
1931                 || s != previous_occurrence_end)               \
1932             && (reginfo->intuit || regtry(reginfo, &s)))       \
1933         {                                                      \
1934             goto got_it;                                       \
1935         }
1936
1937
1938 /* This differs from the above macros in that it calls a function which returns
1939  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1940  * there is no such occurrence. */
1941 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1942     while (s < strend) {                                    \
1943         s = (f);                                            \
1944         if (s >= strend) {                                  \
1945             break;                                          \
1946         }                                                   \
1947                                                             \
1948         FBC_CHECK_AND_TRY                                   \
1949         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1950         previous_occurrence_end = s;                        \
1951     }
1952
1953 /* The three macros below are slightly different versions of the same logic.
1954  *
1955  * The first is for /a and /aa when the target string is UTF-8.  This can only
1956  * match ascii, but it must advance based on UTF-8.   The other two handle the
1957  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1958  * for the boundary (or non-boundary) between a word and non-word character.
1959  * The utf8 and non-utf8 cases have the same logic, but the details must be
1960  * different.  Find the "wordness" of the character just prior to this one, and
1961  * compare it with the wordness of this one.  If they differ, we have a
1962  * boundary.  At the beginning of the string, pretend that the previous
1963  * character was a new-line.
1964  *
1965  * All these macros uncleanly have side-effects with each other and outside
1966  * variables.  So far it's been too much trouble to clean-up
1967  *
1968  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1969  *               a word character or not.
1970  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1971  *               word/non-word
1972  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1973  *
1974  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1975  * are looking for a boundary or for a non-boundary.  If we are looking for a
1976  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1977  * see if this tentative match actually works, and if so, to quit the loop
1978  * here.  And vice-versa if we are looking for a non-boundary.
1979  *
1980  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1981  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1982  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1983  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1984  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1985  * complement.  But in that branch we complement tmp, meaning that at the
1986  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1987  * which means at the top of the loop in the next iteration, it is
1988  * TEST_NON_UTF8(s-1) */
1989 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1990     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1991     tmp = TEST_NON_UTF8(tmp);                                                  \
1992     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1993         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1994             tmp = !tmp;                                                        \
1995             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1996         }                                                                      \
1997         else {                                                                 \
1998             IF_FAIL;                                                           \
1999         }                                                                      \
2000     );                                                                         \
2001
2002 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2003  * TEST_UTF8 is a macro that for the same input code points returns identically
2004  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
2005 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2006     if (s == reginfo->strbeg) {                                                \
2007         tmp = '\n';                                                            \
2008     }                                                                          \
2009     else { /* Back-up to the start of the previous character */                \
2010         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2011         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2012                                                        0, UTF8_ALLOW_DEFAULT); \
2013     }                                                                          \
2014     tmp = TEST_UV(tmp);                                                        \
2015     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2016         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2017             tmp = !tmp;                                                        \
2018             IF_SUCCESS;                                                        \
2019         }                                                                      \
2020         else {                                                                 \
2021             IF_FAIL;                                                           \
2022         }                                                                      \
2023     );
2024
2025 /* Like the above two macros.  UTF8_CODE is the complete code for handling
2026  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2027  * macros below */
2028 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
2029     if (utf8_target) {                                                         \
2030         UTF8_CODE                                                              \
2031     }                                                                          \
2032     else {  /* Not utf8 */                                                     \
2033         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
2034         tmp = TEST_NON_UTF8(tmp);                                              \
2035         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
2036             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
2037                 IF_SUCCESS;                                                    \
2038                 tmp = !tmp;                                                    \
2039             }                                                                  \
2040             else {                                                             \
2041                 IF_FAIL;                                                       \
2042             }                                                                  \
2043         );                                                                     \
2044     }                                                                          \
2045     /* Here, things have been set up by the previous code so that tmp is the   \
2046      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
2047      * utf8ness of the target).  We also have to check if this matches against \
2048      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
2049      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
2050      * string */                                                               \
2051     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2052         IF_SUCCESS;                                                            \
2053     }                                                                          \
2054     else {                                                                     \
2055         IF_FAIL;                                                               \
2056     }
2057
2058 /* This is the macro to use when we want to see if something that looks like it
2059  * could match, actually does, and if so exits the loop */
2060 #define REXEC_FBC_TRYIT                            \
2061     if ((reginfo->intuit || regtry(reginfo, &s)))  \
2062         goto got_it
2063
2064 /* The only difference between the BOUND and NBOUND cases is that
2065  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2066  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2067  * with the other one being empty (PLACEHOLDER is defined as empty).
2068  *
2069  * The TEST_FOO parameters are for operating on different forms of input, but
2070  * all should be ones that return identically for the same underlying code
2071  * points */
2072 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
2073     FBC_BOUND_COMMON(                                                          \
2074           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
2075           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2076
2077 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2078     FBC_BOUND_COMMON(                                                          \
2079             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2080             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2081
2082 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2083     FBC_BOUND_COMMON(                                                          \
2084           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2085           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2086
2087 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2088     FBC_BOUND_COMMON(                                                          \
2089             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2090             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2091
2092 #ifdef DEBUGGING
2093 static IV
2094 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2095   IV cp_out = _invlist_search(invlist, cp_in);
2096   assert(cp_out >= 0);
2097   return cp_out;
2098 }
2099 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2100         invmap[S_get_break_val_cp_checked(invlist, cp)]
2101 #else
2102 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2103         invmap[_invlist_search(invlist, cp)]
2104 #endif
2105
2106 /* Takes a pointer to an inversion list, a pointer to its corresponding
2107  * inversion map, and a code point, and returns the code point's value
2108  * according to the two arrays.  It assumes that all code points have a value.
2109  * This is used as the base macro for macros for particular properties */
2110 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2111         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2112
2113 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2114  * of a code point, returning the value for the first code point in the string.
2115  * And it takes the particular macro name that finds the desired value given a
2116  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2117 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2118              (__ASSERT_(pos < strend)                                          \
2119                  /* Note assumes is valid UTF-8 */                             \
2120              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2121
2122 /* Returns the GCB value for the input code point */
2123 #define getGCB_VAL_CP(cp)                                                      \
2124           _generic_GET_BREAK_VAL_CP(                                           \
2125                                     PL_GCB_invlist,                            \
2126                                     _Perl_GCB_invmap,                          \
2127                                     (cp))
2128
2129 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2130  * bounded by pos and strend */
2131 #define getGCB_VAL_UTF8(pos, strend)                                           \
2132     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2133
2134 /* Returns the LB value for the input code point */
2135 #define getLB_VAL_CP(cp)                                                       \
2136           _generic_GET_BREAK_VAL_CP(                                           \
2137                                     PL_LB_invlist,                             \
2138                                     _Perl_LB_invmap,                           \
2139                                     (cp))
2140
2141 /* Returns the LB value for the first code point in the UTF-8 encoded string
2142  * bounded by pos and strend */
2143 #define getLB_VAL_UTF8(pos, strend)                                            \
2144     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2145
2146
2147 /* Returns the SB value for the input code point */
2148 #define getSB_VAL_CP(cp)                                                       \
2149           _generic_GET_BREAK_VAL_CP(                                           \
2150                                     PL_SB_invlist,                             \
2151                                     _Perl_SB_invmap,                     \
2152                                     (cp))
2153
2154 /* Returns the SB value for the first code point in the UTF-8 encoded string
2155  * bounded by pos and strend */
2156 #define getSB_VAL_UTF8(pos, strend)                                            \
2157     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2158
2159 /* Returns the WB value for the input code point */
2160 #define getWB_VAL_CP(cp)                                                       \
2161           _generic_GET_BREAK_VAL_CP(                                           \
2162                                     PL_WB_invlist,                             \
2163                                     _Perl_WB_invmap,                         \
2164                                     (cp))
2165
2166 /* Returns the WB value for the first code point in the UTF-8 encoded string
2167  * bounded by pos and strend */
2168 #define getWB_VAL_UTF8(pos, strend)                                            \
2169     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2170
2171 /* We know what class REx starts with.  Try to find this position... */
2172 /* if reginfo->intuit, its a dryrun */
2173 /* annoyingly all the vars in this routine have different names from their counterparts
2174    in regmatch. /grrr */
2175 STATIC char *
2176 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2177     const char *strend, regmatch_info *reginfo)
2178 {
2179     dVAR;
2180
2181     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2182     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2183
2184     char *pat_string;   /* The pattern's exactish string */
2185     char *pat_end;          /* ptr to end char of pat_string */
2186     re_fold_t folder;   /* Function for computing non-utf8 folds */
2187     const U8 *fold_array;   /* array for folding ords < 256 */
2188     STRLEN ln;
2189     STRLEN lnc;
2190     U8 c1;
2191     U8 c2;
2192     char *e;
2193
2194     /* In some cases we accept only the first occurence of 'x' in a sequence of
2195      * them.  This variable points to just beyond the end of the previous
2196      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2197      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2198      * hop back.) */
2199     char * previous_occurrence_end = 0;
2200
2201     I32 tmp;            /* Scratch variable */
2202     const bool utf8_target = reginfo->is_utf8_target;
2203     UV utf8_fold_flags = 0;
2204     const bool is_utf8_pat = reginfo->is_utf8_pat;
2205     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2206                                    with a result inverts that result, as 0^1 =
2207                                    1 and 1^1 = 0 */
2208     _char_class_number classnum;
2209
2210     RXi_GET_DECL(prog,progi);
2211
2212     PERL_ARGS_ASSERT_FIND_BYCLASS;
2213
2214     /* We know what class it must start with. */
2215     switch (OP(c)) {
2216     case ANYOFPOSIXL:
2217     case ANYOFL:
2218         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2219
2220         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2221             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2222         }
2223
2224         /* FALLTHROUGH */
2225     case ANYOFD:
2226     case ANYOF:
2227         if (utf8_target) {
2228             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2229                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2230         }
2231         else if (ANYOF_FLAGS(c)) {
2232             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2233         }
2234         else {
2235             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2236         }
2237         break;
2238
2239     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2240         /* UTF-8ness doesn't matter, so use 0 */
2241         REXEC_FBC_FIND_NEXT_SCAN(0,
2242          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2243                                    (U8) ARG(c), FLAGS(c)));
2244         break;
2245
2246     case NANYOFM:
2247         REXEC_FBC_FIND_NEXT_SCAN(0,
2248          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2249                                    (U8) ARG(c), FLAGS(c)));
2250         break;
2251
2252     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2253         assert(! is_utf8_pat);
2254         /* FALLTHROUGH */
2255     case EXACTFAA:
2256         if (is_utf8_pat || utf8_target) {
2257             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2258             goto do_exactf_utf8;
2259         }
2260         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2261         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2262         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2263
2264     case EXACTF:   /* This node only generated for non-utf8 patterns */
2265         assert(! is_utf8_pat);
2266         if (utf8_target) {
2267             utf8_fold_flags = 0;
2268             goto do_exactf_utf8;
2269         }
2270         fold_array = PL_fold;
2271         folder = foldEQ;
2272         goto do_exactf_non_utf8;
2273
2274     case EXACTFL:
2275         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2276         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2277             utf8_fold_flags = FOLDEQ_LOCALE;
2278             goto do_exactf_utf8;
2279         }
2280         fold_array = PL_fold_locale;
2281         folder = foldEQ_locale;
2282         goto do_exactf_non_utf8;
2283
2284     case EXACTFU_SS:
2285         if (is_utf8_pat) {
2286             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2287         }
2288         goto do_exactf_utf8;
2289
2290     case EXACTFLU8:
2291             if (! utf8_target) {    /* All code points in this node require
2292                                        UTF-8 to express.  */
2293                 break;
2294             }
2295             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2296                                              | FOLDEQ_S2_FOLDS_SANE;
2297             goto do_exactf_utf8;
2298
2299     case EXACTFU:
2300         if (is_utf8_pat || utf8_target) {
2301             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2302             goto do_exactf_utf8;
2303         }
2304
2305         /* Any 'ss' in the pattern should have been replaced by regcomp,
2306          * so we don't have to worry here about this single special case
2307          * in the Latin1 range */
2308         fold_array = PL_fold_latin1;
2309         folder = foldEQ_latin1;
2310
2311         /* FALLTHROUGH */
2312
2313       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2314                            are no glitches with fold-length differences
2315                            between the target string and pattern */
2316
2317         /* The idea in the non-utf8 EXACTF* cases is to first find the
2318          * first character of the EXACTF* node and then, if necessary,
2319          * case-insensitively compare the full text of the node.  c1 is the
2320          * first character.  c2 is its fold.  This logic will not work for
2321          * Unicode semantics and the german sharp ss, which hence should
2322          * not be compiled into a node that gets here. */
2323         pat_string = STRING(c);
2324         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2325
2326         /* We know that we have to match at least 'ln' bytes (which is the
2327          * same as characters, since not utf8).  If we have to match 3
2328          * characters, and there are only 2 availabe, we know without
2329          * trying that it will fail; so don't start a match past the
2330          * required minimum number from the far end */
2331         e = HOP3c(strend, -((SSize_t)ln), s);
2332         if (e < s)
2333             break;
2334
2335         c1 = *pat_string;
2336         c2 = fold_array[c1];
2337         if (c1 == c2) { /* If char and fold are the same */
2338             while (s <= e) {
2339                 s = (char *) memchr(s, c1, e + 1 - s);
2340                 if (s == NULL) {
2341                     break;
2342                 }
2343
2344                 /* Check that the rest of the node matches */
2345                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2346                     && (reginfo->intuit || regtry(reginfo, &s)) )
2347                 {
2348                     goto got_it;
2349                 }
2350                 s++;
2351             }
2352         }
2353         else {
2354             U8 bits_differing = c1 ^ c2;
2355
2356             /* If the folds differ in one bit position only, we can mask to
2357              * match either of them, and can use this faster find method.  Both
2358              * ASCII and EBCDIC tend to have their case folds differ in only
2359              * one position, so this is very likely */
2360             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2361                 bits_differing = ~ bits_differing;
2362                 while (s <= e) {
2363                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2364                                         (c1 & bits_differing), bits_differing);
2365                     if (s > e) {
2366                         break;
2367                     }
2368
2369                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2370                         && (reginfo->intuit || regtry(reginfo, &s)) )
2371                     {
2372                         goto got_it;
2373                     }
2374                     s++;
2375                 }
2376             }
2377             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2378                        should actually happen only in EXACTFL nodes */
2379                 while (s <= e) {
2380                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2381                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2382                         && (reginfo->intuit || regtry(reginfo, &s)) )
2383                     {
2384                         goto got_it;
2385                     }
2386                     s++;
2387                 }
2388             }
2389         }
2390         break;
2391
2392       do_exactf_utf8:
2393       {
2394         unsigned expansion;
2395
2396         /* If one of the operands is in utf8, we can't use the simpler folding
2397          * above, due to the fact that many different characters can have the
2398          * same fold, or portion of a fold, or different- length fold */
2399         pat_string = STRING(c);
2400         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2401         pat_end = pat_string + ln;
2402         lnc = is_utf8_pat       /* length to match in characters */
2403                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2404                 : ln;
2405
2406         /* We have 'lnc' characters to match in the pattern, but because of
2407          * multi-character folding, each character in the target can match
2408          * up to 3 characters (Unicode guarantees it will never exceed
2409          * this) if it is utf8-encoded; and up to 2 if not (based on the
2410          * fact that the Latin 1 folds are already determined, and the
2411          * only multi-char fold in that range is the sharp-s folding to
2412          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2413          * string character.  Adjust lnc accordingly, rounding up, so that
2414          * if we need to match at least 4+1/3 chars, that really is 5. */
2415         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2416         lnc = (lnc + expansion - 1) / expansion;
2417
2418         /* As in the non-UTF8 case, if we have to match 3 characters, and
2419          * only 2 are left, it's guaranteed to fail, so don't start a
2420          * match that would require us to go beyond the end of the string
2421          */
2422         e = HOP3c(strend, -((SSize_t)lnc), s);
2423
2424         /* XXX Note that we could recalculate e to stop the loop earlier,
2425          * as the worst case expansion above will rarely be met, and as we
2426          * go along we would usually find that e moves further to the left.
2427          * This would happen only after we reached the point in the loop
2428          * where if there were no expansion we should fail.  Unclear if
2429          * worth the expense */
2430
2431         while (s <= e) {
2432             char *my_strend= (char *)strend;
2433             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2434                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2435                 && (reginfo->intuit || regtry(reginfo, &s)) )
2436             {
2437                 goto got_it;
2438             }
2439             s += (utf8_target) ? UTF8SKIP(s) : 1;
2440         }
2441         break;
2442     }
2443
2444     case BOUNDL:
2445         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2446         if (FLAGS(c) != TRADITIONAL_BOUND) {
2447             if (! IN_UTF8_CTYPE_LOCALE) {
2448                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2449                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2450             }
2451             goto do_boundu;
2452         }
2453
2454         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2455         break;
2456
2457     case NBOUNDL:
2458         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2459         if (FLAGS(c) != TRADITIONAL_BOUND) {
2460             if (! IN_UTF8_CTYPE_LOCALE) {
2461                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2462                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2463             }
2464             goto do_nboundu;
2465         }
2466
2467         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2468         break;
2469
2470     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2471                    meaning */
2472         assert(FLAGS(c) == TRADITIONAL_BOUND);
2473
2474         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2475         break;
2476
2477     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2478                    meaning */
2479         assert(FLAGS(c) == TRADITIONAL_BOUND);
2480
2481         FBC_BOUND_A(isWORDCHAR_A);
2482         break;
2483
2484     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2485                    meaning */
2486         assert(FLAGS(c) == TRADITIONAL_BOUND);
2487
2488         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2489         break;
2490
2491     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2492                    meaning */
2493         assert(FLAGS(c) == TRADITIONAL_BOUND);
2494
2495         FBC_NBOUND_A(isWORDCHAR_A);
2496         break;
2497
2498     case NBOUNDU:
2499         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2500             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2501             break;
2502         }
2503
2504       do_nboundu:
2505
2506         to_complement = 1;
2507         /* FALLTHROUGH */
2508
2509     case BOUNDU:
2510       do_boundu:
2511         switch((bound_type) FLAGS(c)) {
2512             case TRADITIONAL_BOUND:
2513                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2514                 break;
2515             case GCB_BOUND:
2516                 if (s == reginfo->strbeg) {
2517                     if (reginfo->intuit || regtry(reginfo, &s))
2518                     {
2519                         goto got_it;
2520                     }
2521
2522                     /* Didn't match.  Try at the next position (if there is one) */
2523                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2524                     if (UNLIKELY(s >= reginfo->strend)) {
2525                         break;
2526                     }
2527                 }
2528
2529                 if (utf8_target) {
2530                     GCB_enum before = getGCB_VAL_UTF8(
2531                                                reghop3((U8*)s, -1,
2532                                                        (U8*)(reginfo->strbeg)),
2533                                                (U8*) reginfo->strend);
2534                     while (s < strend) {
2535                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2536                                                         (U8*) reginfo->strend);
2537                         if (   (to_complement ^ isGCB(before,
2538                                                       after,
2539                                                       (U8*) reginfo->strbeg,
2540                                                       (U8*) s,
2541                                                       utf8_target))
2542                             && (reginfo->intuit || regtry(reginfo, &s)))
2543                         {
2544                             goto got_it;
2545                         }
2546                         before = after;
2547                         s += UTF8SKIP(s);
2548                     }
2549                 }
2550                 else {  /* Not utf8.  Everything is a GCB except between CR and
2551                            LF */
2552                     while (s < strend) {
2553                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2554                                               || UCHARAT(s) != '\n'))
2555                             && (reginfo->intuit || regtry(reginfo, &s)))
2556                         {
2557                             goto got_it;
2558                         }
2559                         s++;
2560                     }
2561                 }
2562
2563                 /* And, since this is a bound, it can match after the final
2564                  * character in the string */
2565                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2566                     goto got_it;
2567                 }
2568                 break;
2569
2570             case LB_BOUND:
2571                 if (s == reginfo->strbeg) {
2572                     if (reginfo->intuit || regtry(reginfo, &s)) {
2573                         goto got_it;
2574                     }
2575                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2576                     if (UNLIKELY(s >= reginfo->strend)) {
2577                         break;
2578                     }
2579                 }
2580
2581                 if (utf8_target) {
2582                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2583                                                                -1,
2584                                                                (U8*)(reginfo->strbeg)),
2585                                                        (U8*) reginfo->strend);
2586                     while (s < strend) {
2587                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2588                         if (to_complement ^ isLB(before,
2589                                                  after,
2590                                                  (U8*) reginfo->strbeg,
2591                                                  (U8*) s,
2592                                                  (U8*) reginfo->strend,
2593                                                  utf8_target)
2594                             && (reginfo->intuit || regtry(reginfo, &s)))
2595                         {
2596                             goto got_it;
2597                         }
2598                         before = after;
2599                         s += UTF8SKIP(s);
2600                     }
2601                 }
2602                 else {  /* Not utf8. */
2603                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2604                     while (s < strend) {
2605                         LB_enum after = getLB_VAL_CP((U8) *s);
2606                         if (to_complement ^ isLB(before,
2607                                                  after,
2608                                                  (U8*) reginfo->strbeg,
2609                                                  (U8*) s,
2610                                                  (U8*) reginfo->strend,
2611                                                  utf8_target)
2612                             && (reginfo->intuit || regtry(reginfo, &s)))
2613                         {
2614                             goto got_it;
2615                         }
2616                         before = after;
2617                         s++;
2618                     }
2619                 }
2620
2621                 if (reginfo->intuit || regtry(reginfo, &s)) {
2622                     goto got_it;
2623                 }
2624
2625                 break;
2626
2627             case SB_BOUND:
2628                 if (s == reginfo->strbeg) {
2629                     if (reginfo->intuit || regtry(reginfo, &s)) {
2630                         goto got_it;
2631                     }
2632                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2633                     if (UNLIKELY(s >= reginfo->strend)) {
2634                         break;
2635                     }
2636                 }
2637
2638                 if (utf8_target) {
2639                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2640                                                         -1,
2641                                                         (U8*)(reginfo->strbeg)),
2642                                                       (U8*) reginfo->strend);
2643                     while (s < strend) {
2644                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2645                                                          (U8*) reginfo->strend);
2646                         if ((to_complement ^ isSB(before,
2647                                                   after,
2648                                                   (U8*) reginfo->strbeg,
2649                                                   (U8*) s,
2650                                                   (U8*) reginfo->strend,
2651                                                   utf8_target))
2652                             && (reginfo->intuit || regtry(reginfo, &s)))
2653                         {
2654                             goto got_it;
2655                         }
2656                         before = after;
2657                         s += UTF8SKIP(s);
2658                     }
2659                 }
2660                 else {  /* Not utf8. */
2661                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2662                     while (s < strend) {
2663                         SB_enum after = getSB_VAL_CP((U8) *s);
2664                         if ((to_complement ^ isSB(before,
2665                                                   after,
2666                                                   (U8*) reginfo->strbeg,
2667                                                   (U8*) s,
2668                                                   (U8*) reginfo->strend,
2669                                                   utf8_target))
2670                             && (reginfo->intuit || regtry(reginfo, &s)))
2671                         {
2672                             goto got_it;
2673                         }
2674                         before = after;
2675                         s++;
2676                     }
2677                 }
2678
2679                 /* Here are at the final position in the target string.  The SB
2680                  * value is always true here, so matches, depending on other
2681                  * constraints */
2682                 if (reginfo->intuit || regtry(reginfo, &s)) {
2683                     goto got_it;
2684                 }
2685
2686                 break;
2687
2688             case WB_BOUND:
2689                 if (s == reginfo->strbeg) {
2690                     if (reginfo->intuit || regtry(reginfo, &s)) {
2691                         goto got_it;
2692                     }
2693                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2694                     if (UNLIKELY(s >= reginfo->strend)) {
2695                         break;
2696                     }
2697                 }
2698
2699                 if (utf8_target) {
2700                     /* We are at a boundary between char_sub_0 and char_sub_1.
2701                      * We also keep track of the value for char_sub_-1 as we
2702                      * loop through the line.   Context may be needed to make a
2703                      * determination, and if so, this can save having to
2704                      * recalculate it */
2705                     WB_enum previous = WB_UNKNOWN;
2706                     WB_enum before = getWB_VAL_UTF8(
2707                                               reghop3((U8*)s,
2708                                                       -1,
2709                                                       (U8*)(reginfo->strbeg)),
2710                                               (U8*) reginfo->strend);
2711                     while (s < strend) {
2712                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2713                                                         (U8*) reginfo->strend);
2714                         if ((to_complement ^ isWB(previous,
2715                                                   before,
2716                                                   after,
2717                                                   (U8*) reginfo->strbeg,
2718                                                   (U8*) s,
2719                                                   (U8*) reginfo->strend,
2720                                                   utf8_target))
2721                             && (reginfo->intuit || regtry(reginfo, &s)))
2722                         {
2723                             goto got_it;
2724                         }
2725                         previous = before;
2726                         before = after;
2727                         s += UTF8SKIP(s);
2728                     }
2729                 }
2730                 else {  /* Not utf8. */
2731                     WB_enum previous = WB_UNKNOWN;
2732                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2733                     while (s < strend) {
2734                         WB_enum after = getWB_VAL_CP((U8) *s);
2735                         if ((to_complement ^ isWB(previous,
2736                                                   before,
2737                                                   after,
2738                                                   (U8*) reginfo->strbeg,
2739                                                   (U8*) s,
2740                                                   (U8*) reginfo->strend,
2741                                                   utf8_target))
2742                             && (reginfo->intuit || regtry(reginfo, &s)))
2743                         {
2744                             goto got_it;
2745                         }
2746                         previous = before;
2747                         before = after;
2748                         s++;
2749                     }
2750                 }
2751
2752                 if (reginfo->intuit || regtry(reginfo, &s)) {
2753                     goto got_it;
2754                 }
2755         }
2756         break;
2757
2758     case LNBREAK:
2759         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2760                         is_LNBREAK_latin1_safe(s, strend)
2761         );
2762         break;
2763
2764     case ASCII:
2765         REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2766         break;
2767
2768     case NASCII:
2769         if (utf8_target) {
2770             REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2771                                                             utf8_target));
2772         }
2773         else {
2774             REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2775                                                             utf8_target));
2776         }
2777
2778         break;
2779
2780     /* The argument to all the POSIX node types is the class number to pass to
2781      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2782
2783     case NPOSIXL:
2784         to_complement = 1;
2785         /* FALLTHROUGH */
2786
2787     case POSIXL:
2788         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2789         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2790                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2791         break;
2792
2793     case NPOSIXD:
2794         to_complement = 1;
2795         /* FALLTHROUGH */
2796
2797     case POSIXD:
2798         if (utf8_target) {
2799             goto posix_utf8;
2800         }
2801         goto posixa;
2802
2803     case NPOSIXA:
2804         if (utf8_target) {
2805             /* The complement of something that matches only ASCII matches all
2806              * non-ASCII, plus everything in ASCII that isn't in the class. */
2807             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2808                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2809             break;
2810         }
2811
2812         to_complement = 1;
2813         goto posixa;
2814
2815     case POSIXA:
2816         /* Don't need to worry about utf8, as it can match only a single
2817          * byte invariant character.  But we do anyway for performance reasons,
2818          * as otherwise we would have to examine all the continuation
2819          * characters */
2820         if (utf8_target) {
2821             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2822             break;
2823         }
2824
2825       posixa:
2826         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2827                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2828         break;
2829
2830     case NPOSIXU:
2831         to_complement = 1;
2832         /* FALLTHROUGH */
2833
2834     case POSIXU:
2835         if (! utf8_target) {
2836             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2837                                  to_complement ^ cBOOL(_generic_isCC(*s,
2838                                                                     FLAGS(c))));
2839         }
2840         else {
2841
2842           posix_utf8:
2843             classnum = (_char_class_number) FLAGS(c);
2844             switch (classnum) {
2845                 default:
2846                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2847                         to_complement ^ cBOOL(_invlist_contains_cp(
2848                                               PL_XPosix_ptrs[classnum],
2849                                               utf8_to_uvchr_buf((U8 *) s,
2850                                                                 (U8 *) strend,
2851                                                                 NULL))));
2852                     break;
2853                 case _CC_ENUM_SPACE:
2854                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2855                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2856                     break;
2857
2858                 case _CC_ENUM_BLANK:
2859                     REXEC_FBC_CLASS_SCAN(1,
2860                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2861                     break;
2862
2863                 case _CC_ENUM_XDIGIT:
2864                     REXEC_FBC_CLASS_SCAN(1,
2865                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2866                     break;
2867
2868                 case _CC_ENUM_VERTSPACE:
2869                     REXEC_FBC_CLASS_SCAN(1,
2870                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2871                     break;
2872
2873                 case _CC_ENUM_CNTRL:
2874                     REXEC_FBC_CLASS_SCAN(1,
2875                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2876                     break;
2877             }
2878         }
2879         break;
2880
2881     case AHOCORASICKC:
2882     case AHOCORASICK:
2883         {
2884             DECL_TRIE_TYPE(c);
2885             /* what trie are we using right now */
2886             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2887             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2888             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2889
2890             const char *last_start = strend - trie->minlen;
2891 #ifdef DEBUGGING
2892             const char *real_start = s;
2893 #endif
2894             STRLEN maxlen = trie->maxlen;
2895             SV *sv_points;
2896             U8 **points; /* map of where we were in the input string
2897                             when reading a given char. For ASCII this
2898                             is unnecessary overhead as the relationship
2899                             is always 1:1, but for Unicode, especially
2900                             case folded Unicode this is not true. */
2901             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2902             U8 *bitmap=NULL;
2903
2904
2905             GET_RE_DEBUG_FLAGS_DECL;
2906
2907             /* We can't just allocate points here. We need to wrap it in
2908              * an SV so it gets freed properly if there is a croak while
2909              * running the match */
2910             ENTER;
2911             SAVETMPS;
2912             sv_points=newSV(maxlen * sizeof(U8 *));
2913             SvCUR_set(sv_points,
2914                 maxlen * sizeof(U8 *));
2915             SvPOK_on(sv_points);
2916             sv_2mortal(sv_points);
2917             points=(U8**)SvPV_nolen(sv_points );
2918             if ( trie_type != trie_utf8_fold
2919                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2920             {
2921                 if (trie->bitmap)
2922                     bitmap=(U8*)trie->bitmap;
2923                 else
2924                     bitmap=(U8*)ANYOF_BITMAP(c);
2925             }
2926             /* this is the Aho-Corasick algorithm modified a touch
2927                to include special handling for long "unknown char" sequences.
2928                The basic idea being that we use AC as long as we are dealing
2929                with a possible matching char, when we encounter an unknown char
2930                (and we have not encountered an accepting state) we scan forward
2931                until we find a legal starting char.
2932                AC matching is basically that of trie matching, except that when
2933                we encounter a failing transition, we fall back to the current
2934                states "fail state", and try the current char again, a process
2935                we repeat until we reach the root state, state 1, or a legal
2936                transition. If we fail on the root state then we can either
2937                terminate if we have reached an accepting state previously, or
2938                restart the entire process from the beginning if we have not.
2939
2940              */
2941             while (s <= last_start) {
2942                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2943                 U8 *uc = (U8*)s;
2944                 U16 charid = 0;
2945                 U32 base = 1;
2946                 U32 state = 1;
2947                 UV uvc = 0;
2948                 STRLEN len = 0;
2949                 STRLEN foldlen = 0;
2950                 U8 *uscan = (U8*)NULL;
2951                 U8 *leftmost = NULL;
2952 #ifdef DEBUGGING
2953                 U32 accepted_word= 0;
2954 #endif
2955                 U32 pointpos = 0;
2956
2957                 while ( state && uc <= (U8*)strend ) {
2958                     int failed=0;
2959                     U32 word = aho->states[ state ].wordnum;
2960
2961                     if( state==1 ) {
2962                         if ( bitmap ) {
2963                             DEBUG_TRIE_EXECUTE_r(
2964                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2965                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2966                                         (char *)uc, utf8_target, 0 );
2967                                     Perl_re_printf( aTHX_
2968                                         " Scanning for legal start char...\n");
2969                                 }
2970                             );
2971                             if (utf8_target) {
2972                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2973                                     uc += UTF8SKIP(uc);
2974                                 }
2975                             } else {
2976                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2977                                     uc++;
2978                                 }
2979                             }
2980                             s= (char *)uc;
2981                         }
2982                         if (uc >(U8*)last_start) break;
2983                     }
2984
2985                     if ( word ) {
2986                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2987                         if (!leftmost || lpos < leftmost) {
2988                             DEBUG_r(accepted_word=word);
2989                             leftmost= lpos;
2990                         }
2991                         if (base==0) break;
2992
2993                     }
2994                     points[pointpos++ % maxlen]= uc;
2995                     if (foldlen || uc < (U8*)strend) {
2996                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2997                                              (U8 *) strend, uscan, len, uvc,
2998                                              charid, foldlen, foldbuf,
2999                                              uniflags);
3000                         DEBUG_TRIE_EXECUTE_r({
3001                             dump_exec_pos( (char *)uc, c, strend,
3002                                         real_start, s, utf8_target, 0);
3003                             Perl_re_printf( aTHX_
3004                                 " Charid:%3u CP:%4" UVxf " ",
3005                                  charid, uvc);
3006                         });
3007                     }
3008                     else {
3009                         len = 0;
3010                         charid = 0;
3011                     }
3012
3013
3014                     do {
3015 #ifdef DEBUGGING
3016                         word = aho->states[ state ].wordnum;
3017 #endif
3018                         base = aho->states[ state ].trans.base;
3019
3020                         DEBUG_TRIE_EXECUTE_r({
3021                             if (failed)
3022                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3023                                     s,   utf8_target, 0 );
3024                             Perl_re_printf( aTHX_
3025                                 "%sState: %4" UVxf ", word=%" UVxf,
3026                                 failed ? " Fail transition to " : "",
3027                                 (UV)state, (UV)word);
3028                         });
3029                         if ( base ) {
3030                             U32 tmp;
3031                             I32 offset;
3032                             if (charid &&
3033                                  ( ((offset = base + charid
3034                                     - 1 - trie->uniquecharcount)) >= 0)
3035                                  && ((U32)offset < trie->lasttrans)
3036                                  && trie->trans[offset].check == state
3037                                  && (tmp=trie->trans[offset].next))
3038                             {
3039                                 DEBUG_TRIE_EXECUTE_r(
3040                                     Perl_re_printf( aTHX_ " - legal\n"));
3041                                 state = tmp;
3042                                 break;
3043                             }
3044                             else {
3045                                 DEBUG_TRIE_EXECUTE_r(
3046                                     Perl_re_printf( aTHX_ " - fail\n"));
3047                                 failed = 1;
3048                                 state = aho->fail[state];
3049                             }
3050                         }
3051                         else {
3052                             /* we must be accepting here */
3053                             DEBUG_TRIE_EXECUTE_r(
3054                                     Perl_re_printf( aTHX_ " - accepting\n"));
3055                             failed = 1;
3056                             break;
3057                         }
3058                     } while(state);
3059                     uc += len;
3060                     if (failed) {
3061                         if (leftmost)
3062                             break;
3063                         if (!state) state = 1;
3064                     }
3065                 }
3066                 if ( aho->states[ state ].wordnum ) {
3067                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3068                     if (!leftmost || lpos < leftmost) {
3069                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3070                         leftmost = lpos;
3071                     }
3072                 }
3073                 if (leftmost) {
3074                     s = (char*)leftmost;
3075                     DEBUG_TRIE_EXECUTE_r({
3076                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3077                             (UV)accepted_word, (IV)(s - real_start)
3078                         );
3079                     });
3080                     if (reginfo->intuit || regtry(reginfo, &s)) {
3081                         FREETMPS;
3082                         LEAVE;
3083                         goto got_it;
3084                     }
3085                     s = HOPc(s,1);
3086                     DEBUG_TRIE_EXECUTE_r({
3087                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3088                     });
3089                 } else {
3090                     DEBUG_TRIE_EXECUTE_r(
3091                         Perl_re_printf( aTHX_ "No match.\n"));
3092                     break;
3093                 }
3094             }
3095             FREETMPS;
3096             LEAVE;
3097         }
3098         break;
3099     default:
3100         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3101     }
3102     return 0;
3103   got_it:
3104     return s;
3105 }
3106
3107 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3108  * flags have same meanings as with regexec_flags() */
3109
3110 static void
3111 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3112                             char *strbeg,
3113                             char *strend,
3114                             SV *sv,
3115                             U32 flags,
3116                             bool utf8_target)
3117 {
3118     struct regexp *const prog = ReANY(rx);
3119
3120     if (flags & REXEC_COPY_STR) {
3121 #ifdef PERL_ANY_COW
3122         if (SvCANCOW(sv)) {
3123             DEBUG_C(Perl_re_printf( aTHX_
3124                               "Copy on write: regexp capture, type %d\n",
3125                                     (int) SvTYPE(sv)));
3126             /* Create a new COW SV to share the match string and store
3127              * in saved_copy, unless the current COW SV in saved_copy
3128              * is valid and suitable for our purpose */
3129             if ((   prog->saved_copy
3130                  && SvIsCOW(prog->saved_copy)
3131                  && SvPOKp(prog->saved_copy)
3132                  && SvIsCOW(sv)
3133                  && SvPOKp(sv)
3134                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3135             {
3136                 /* just reuse saved_copy SV */
3137                 if (RXp_MATCH_COPIED(prog)) {
3138                     Safefree(prog->subbeg);
3139                     RXp_MATCH_COPIED_off(prog);
3140                 }
3141             }
3142             else {
3143                 /* create new COW SV to share string */
3144                 RXp_MATCH_COPY_FREE(prog);
3145                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3146             }
3147             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3148             assert (SvPOKp(prog->saved_copy));
3149             prog->sublen  = strend - strbeg;
3150             prog->suboffset = 0;
3151             prog->subcoffset = 0;
3152         } else
3153 #endif
3154         {
3155             SSize_t min = 0;
3156             SSize_t max = strend - strbeg;
3157             SSize_t sublen;
3158
3159             if (    (flags & REXEC_COPY_SKIP_POST)
3160                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3161                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3162             ) { /* don't copy $' part of string */
3163                 U32 n = 0;
3164                 max = -1;
3165                 /* calculate the right-most part of the string covered
3166                  * by a capture. Due to lookahead, this may be to
3167                  * the right of $&, so we have to scan all captures */
3168                 while (n <= prog->lastparen) {
3169                     if (prog->offs[n].end > max)
3170                         max = prog->offs[n].end;
3171                     n++;
3172                 }
3173                 if (max == -1)
3174                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3175                             ? prog->offs[0].start
3176                             : 0;
3177                 assert(max >= 0 && max <= strend - strbeg);
3178             }
3179
3180             if (    (flags & REXEC_COPY_SKIP_PRE)
3181                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3182                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3183             ) { /* don't copy $` part of string */
3184                 U32 n = 0;
3185                 min = max;
3186                 /* calculate the left-most part of the string covered
3187                  * by a capture. Due to lookbehind, this may be to
3188                  * the left of $&, so we have to scan all captures */
3189                 while (min && n <= prog->lastparen) {
3190                     if (   prog->offs[n].start != -1
3191                         && prog->offs[n].start < min)
3192                     {
3193                         min = prog->offs[n].start;
3194                     }
3195                     n++;
3196                 }
3197                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3198                     && min >  prog->offs[0].end
3199                 )
3200                     min = prog->offs[0].end;
3201
3202             }
3203
3204             assert(min >= 0 && min <= max && min <= strend - strbeg);
3205             sublen = max - min;
3206
3207             if (RXp_MATCH_COPIED(prog)) {
3208                 if (sublen > prog->sublen)
3209                     prog->subbeg =
3210                             (char*)saferealloc(prog->subbeg, sublen+1);
3211             }
3212             else
3213                 prog->subbeg = (char*)safemalloc(sublen+1);
3214             Copy(strbeg + min, prog->subbeg, sublen, char);
3215             prog->subbeg[sublen] = '\0';
3216             prog->suboffset = min;
3217             prog->sublen = sublen;
3218             RXp_MATCH_COPIED_on(prog);
3219         }
3220         prog->subcoffset = prog->suboffset;
3221         if (prog->suboffset && utf8_target) {
3222             /* Convert byte offset to chars.
3223              * XXX ideally should only compute this if @-/@+
3224              * has been seen, a la PL_sawampersand ??? */
3225
3226             /* If there's a direct correspondence between the
3227              * string which we're matching and the original SV,
3228              * then we can use the utf8 len cache associated with
3229              * the SV. In particular, it means that under //g,
3230              * sv_pos_b2u() will use the previously cached
3231              * position to speed up working out the new length of
3232              * subcoffset, rather than counting from the start of
3233              * the string each time. This stops
3234              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3235              * from going quadratic */
3236             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3237                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3238                                                 SV_GMAGIC|SV_CONST_RETURN);
3239             else
3240                 prog->subcoffset = utf8_length((U8*)strbeg,
3241                                     (U8*)(strbeg+prog->suboffset));
3242         }
3243     }
3244     else {
3245         RXp_MATCH_COPY_FREE(prog);
3246         prog->subbeg = strbeg;
3247         prog->suboffset = 0;
3248         prog->subcoffset = 0;
3249         prog->sublen = strend - strbeg;
3250     }
3251 }
3252
3253
3254
3255
3256 /*
3257  - regexec_flags - match a regexp against a string
3258  */
3259 I32
3260 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3261               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3262 /* stringarg: the point in the string at which to begin matching */
3263 /* strend:    pointer to null at end of string */
3264 /* strbeg:    real beginning of string */
3265 /* minend:    end of match must be >= minend bytes after stringarg. */
3266 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3267  *            itself is accessed via the pointers above */
3268 /* data:      May be used for some additional optimizations.
3269               Currently unused. */
3270 /* flags:     For optimizations. See REXEC_* in regexp.h */
3271
3272 {
3273     struct regexp *const prog = ReANY(rx);
3274     char *s;
3275     regnode *c;
3276     char *startpos;
3277     SSize_t minlen;             /* must match at least this many chars */
3278     SSize_t dontbother = 0;     /* how many characters not to try at end */
3279     const bool utf8_target = cBOOL(DO_UTF8(sv));
3280     I32 multiline;
3281     RXi_GET_DECL(prog,progi);
3282     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3283     regmatch_info *const reginfo = &reginfo_buf;
3284     regexp_paren_pair *swap = NULL;
3285     I32 oldsave;
3286     GET_RE_DEBUG_FLAGS_DECL;
3287
3288     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3289     PERL_UNUSED_ARG(data);
3290
3291     /* Be paranoid... */
3292     if (prog == NULL) {
3293         Perl_croak(aTHX_ "NULL regexp parameter");
3294     }
3295
3296     DEBUG_EXECUTE_r(
3297         debug_start_match(rx, utf8_target, stringarg, strend,
3298         "Matching");
3299     );
3300
3301     startpos = stringarg;
3302
3303     /* set these early as they may be used by the HOP macros below */
3304     reginfo->strbeg = strbeg;
3305     reginfo->strend = strend;
3306     reginfo->is_utf8_target = cBOOL(utf8_target);
3307
3308     if (prog->intflags & PREGf_GPOS_SEEN) {
3309         MAGIC *mg;
3310
3311         /* set reginfo->ganch, the position where \G can match */
3312
3313         reginfo->ganch =
3314             (flags & REXEC_IGNOREPOS)
3315             ? stringarg /* use start pos rather than pos() */
3316             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3317               /* Defined pos(): */
3318             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3319             : strbeg; /* pos() not defined; use start of string */
3320
3321         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3322             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3323
3324         /* in the presence of \G, we may need to start looking earlier in
3325          * the string than the suggested start point of stringarg:
3326          * if prog->gofs is set, then that's a known, fixed minimum
3327          * offset, such as
3328          * /..\G/:   gofs = 2
3329          * /ab|c\G/: gofs = 1
3330          * or if the minimum offset isn't known, then we have to go back
3331          * to the start of the string, e.g. /w+\G/
3332          */
3333
3334         if (prog->intflags & PREGf_ANCH_GPOS) {
3335             if (prog->gofs) {
3336                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3337                 if (!startpos ||
3338                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3339                 {
3340                     DEBUG_r(Perl_re_printf( aTHX_
3341                             "fail: ganch-gofs before earliest possible start\n"));
3342                     return 0;
3343                 }
3344             }
3345             else
3346                 startpos = reginfo->ganch;
3347         }
3348         else if (prog->gofs) {
3349             startpos = HOPBACKc(startpos, prog->gofs);
3350             if (!startpos)
3351                 startpos = strbeg;
3352         }
3353         else if (prog->intflags & PREGf_GPOS_FLOAT)
3354             startpos = strbeg;
3355     }
3356
3357     minlen = prog->minlen;
3358     if ((startpos + minlen) > strend || startpos < strbeg) {
3359         DEBUG_r(Perl_re_printf( aTHX_
3360                     "Regex match can't succeed, so not even tried\n"));
3361         return 0;
3362     }
3363
3364     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3365      * which will call destuctors to reset PL_regmatch_state, free higher
3366      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3367      * regmatch_info_aux_eval */
3368
3369     oldsave = PL_savestack_ix;
3370
3371     s = startpos;
3372
3373     if ((prog->extflags & RXf_USE_INTUIT)
3374         && !(flags & REXEC_CHECKED))
3375     {
3376         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3377                                     flags, NULL);
3378         if (!s)
3379             return 0;
3380
3381         if (prog->extflags & RXf_CHECK_ALL) {
3382             /* we can match based purely on the result of INTUIT.
3383              * Set up captures etc just for $& and $-[0]
3384              * (an intuit-only match wont have $1,$2,..) */
3385             assert(!prog->nparens);
3386
3387             /* s/// doesn't like it if $& is earlier than where we asked it to
3388              * start searching (which can happen on something like /.\G/) */
3389             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3390                     && (s < stringarg))
3391             {
3392                 /* this should only be possible under \G */
3393                 assert(prog->intflags & PREGf_GPOS_SEEN);
3394                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3395                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3396                 goto phooey;
3397             }
3398
3399             /* match via INTUIT shouldn't have any captures.
3400              * Let @-, @+, $^N know */
3401             prog->lastparen = prog->lastcloseparen = 0;
3402             RXp_MATCH_UTF8_set(prog, utf8_target);
3403             prog->offs[0].start = s - strbeg;
3404             prog->offs[0].end = utf8_target
3405                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3406                 : s - strbeg + prog->minlenret;
3407             if ( !(flags & REXEC_NOT_FIRST) )
3408                 S_reg_set_capture_string(aTHX_ rx,
3409                                         strbeg, strend,
3410                                         sv, flags, utf8_target);
3411
3412             return 1;
3413         }
3414     }
3415
3416     multiline = prog->extflags & RXf_PMf_MULTILINE;
3417
3418     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3419         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3420                               "String too short [regexec_flags]...\n"));
3421         goto phooey;
3422     }
3423
3424     /* Check validity of program. */
3425     if (UCHARAT(progi->program) != REG_MAGIC) {
3426         Perl_croak(aTHX_ "corrupted regexp program");
3427     }
3428
3429     RXp_MATCH_TAINTED_off(prog);
3430     RXp_MATCH_UTF8_set(prog, utf8_target);
3431
3432     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3433     reginfo->intuit = 0;
3434     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3435     reginfo->warned = FALSE;
3436     reginfo->sv = sv;
3437     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3438     /* see how far we have to get to not match where we matched before */
3439     reginfo->till = stringarg + minend;
3440
3441     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3442         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3443            S_cleanup_regmatch_info_aux has executed (registered by
3444            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3445            magic belonging to this SV.
3446            Not newSVsv, either, as it does not COW.
3447         */
3448         reginfo->sv = newSV(0);
3449         SvSetSV_nosteal(reginfo->sv, sv);
3450         SAVEFREESV(reginfo->sv);
3451     }
3452
3453     /* reserve next 2 or 3 slots in PL_regmatch_state:
3454      * slot N+0: may currently be in use: skip it
3455      * slot N+1: use for regmatch_info_aux struct
3456      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3457      * slot N+3: ready for use by regmatch()
3458      */
3459
3460     {
3461         regmatch_state *old_regmatch_state;
3462         regmatch_slab  *old_regmatch_slab;
3463         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3464
3465         /* on first ever match, allocate first slab */
3466         if (!PL_regmatch_slab) {
3467             Newx(PL_regmatch_slab, 1, regmatch_slab);
3468             PL_regmatch_slab->prev = NULL;
3469             PL_regmatch_slab->next = NULL;
3470             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3471         }
3472
3473         old_regmatch_state = PL_regmatch_state;
3474         old_regmatch_slab  = PL_regmatch_slab;
3475
3476         for (i=0; i <= max; i++) {
3477             if (i == 1)
3478                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3479             else if (i ==2)
3480                 reginfo->info_aux_eval =
3481                 reginfo->info_aux->info_aux_eval =
3482                             &(PL_regmatch_state->u.info_aux_eval);
3483
3484             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3485                 PL_regmatch_state = S_push_slab(aTHX);
3486         }
3487
3488         /* note initial PL_regmatch_state position; at end of match we'll
3489          * pop back to there and free any higher slabs */
3490
3491         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3492         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3493         reginfo->info_aux->poscache = NULL;
3494
3495         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3496
3497         if ((prog->extflags & RXf_EVAL_SEEN))
3498             S_setup_eval_state(aTHX_ reginfo);
3499         else
3500             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3501     }
3502
3503     /* If there is a "must appear" string, look for it. */
3504
3505     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3506         /* We have to be careful. If the previous successful match
3507            was from this regex we don't want a subsequent partially
3508            successful match to clobber the old results.
3509            So when we detect this possibility we add a swap buffer
3510            to the re, and switch the buffer each match. If we fail,
3511            we switch it back; otherwise we leave it swapped.
3512         */
3513         swap = prog->offs;
3514         /* do we need a save destructor here for eval dies? */
3515         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3516         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3517             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3518             0,
3519             PTR2UV(prog),
3520             PTR2UV(swap),
3521             PTR2UV(prog->offs)
3522         ));
3523     }
3524
3525     if (prog->recurse_locinput)
3526         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3527
3528     /* Simplest case: anchored match need be tried only once, or with
3529      * MBOL, only at the beginning of each line.
3530      *
3531      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3532      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3533      * match at the start of the string then it won't match anywhere else
3534      * either; while with /.*.../, if it doesn't match at the beginning,
3535      * the earliest it could match is at the start of the next line */
3536
3537     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3538         char *end;
3539
3540         if (regtry(reginfo, &s))
3541             goto got_it;
3542
3543         if (!(prog->intflags & PREGf_ANCH_MBOL))
3544             goto phooey;
3545
3546         /* didn't match at start, try at other newline positions */
3547
3548         if (minlen)
3549             dontbother = minlen - 1;
3550         end = HOP3c(strend, -dontbother, strbeg) - 1;
3551
3552         /* skip to next newline */
3553
3554         while (s <= end) { /* note it could be possible to match at the end of the string */
3555             /* NB: newlines are the same in unicode as they are in latin */
3556             if (*s++ != '\n')
3557                 continue;
3558             if (prog->check_substr || prog->check_utf8) {
3559             /* note that with PREGf_IMPLICIT, intuit can only fail
3560              * or return the start position, so it's of limited utility.
3561              * Nevertheless, I made the decision that the potential for
3562              * quick fail was still worth it - DAPM */
3563                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3564                 if (!s)
3565                     goto phooey;
3566             }
3567             if (regtry(reginfo, &s))
3568                 goto got_it;
3569         }
3570         goto phooey;
3571     } /* end anchored search */
3572
3573     if (prog->intflags & PREGf_ANCH_GPOS)
3574     {
3575         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3576         assert(prog->intflags & PREGf_GPOS_SEEN);
3577         /* For anchored \G, the only position it can match from is
3578          * (ganch-gofs); we already set startpos to this above; if intuit
3579          * moved us on from there, we can't possibly succeed */
3580         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3581         if (s == startpos && regtry(reginfo, &s))
3582             goto got_it;
3583         goto phooey;
3584     }
3585
3586     /* Messy cases:  unanchored match. */
3587     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3588         /* we have /x+whatever/ */
3589         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3590         char ch;
3591 #ifdef DEBUGGING
3592         int did_match = 0;
3593 #endif
3594         if (utf8_target) {
3595             if (! prog->anchored_utf8) {
3596                 to_utf8_substr(prog);
3597             }
3598             ch = SvPVX_const(prog->anchored_utf8)[0];
3599             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3600                 if (*s == ch) {
3601                     DEBUG_EXECUTE_r( did_match = 1 );
3602                     if (regtry(reginfo, &s)) goto got_it;
3603                     s += UTF8SKIP(s);
3604                     while (s < strend && *s == ch)
3605                         s += UTF8SKIP(s);
3606                 }
3607             );
3608
3609         }
3610         else {
3611             if (! prog->anchored_substr) {
3612                 if (! to_byte_substr(prog)) {
3613                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3614                 }
3615             }
3616             ch = SvPVX_const(prog->anchored_substr)[0];
3617             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3618                 if (*s == ch) {
3619                     DEBUG_EXECUTE_r( did_match = 1 );
3620                     if (regtry(reginfo, &s)) goto got_it;
3621                     s++;
3622                     while (s < strend && *s == ch)
3623                         s++;
3624                 }
3625             );
3626         }
3627         DEBUG_EXECUTE_r(if (!did_match)
3628                 Perl_re_printf( aTHX_
3629                                   "Did not find anchored character...\n")
3630                );
3631     }
3632     else if (prog->anchored_substr != NULL
3633               || prog->anchored_utf8 != NULL
3634               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3635                   && prog->float_max_offset < strend - s)) {
3636         SV *must;
3637         SSize_t back_max;
3638         SSize_t back_min;
3639         char *last;
3640         char *last1;            /* Last position checked before */
3641 #ifdef DEBUGGING
3642         int did_match = 0;
3643 #endif
3644         if (prog->anchored_substr || prog->anchored_utf8) {
3645             if (utf8_target) {
3646                 if (! prog->anchored_utf8) {
3647                     to_utf8_substr(prog);
3648                 }
3649                 must = prog->anchored_utf8;
3650             }
3651             else {
3652                 if (! prog->anchored_substr) {
3653                     if (! to_byte_substr(prog)) {
3654                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3655                     }
3656                 }
3657                 must = prog->anchored_substr;
3658             }
3659             back_max = back_min = prog->anchored_offset;
3660         } else {
3661             if (utf8_target) {
3662                 if (! prog->float_utf8) {
3663                     to_utf8_substr(prog);
3664                 }
3665                 must = prog->float_utf8;
3666             }
3667             else {
3668                 if (! prog->float_substr) {
3669                     if (! to_byte_substr(prog)) {
3670                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3671                     }
3672                 }
3673                 must = prog->float_substr;
3674             }
3675             back_max = prog->float_max_offset;
3676             back_min = prog->float_min_offset;
3677         }
3678
3679         if (back_min<0) {
3680             last = strend;
3681         } else {
3682             last = HOP3c(strend,        /* Cannot start after this */
3683                   -(SSize_t)(CHR_SVLEN(must)
3684                          - (SvTAIL(must) != 0) + back_min), strbeg);
3685         }
3686         if (s > reginfo->strbeg)
3687             last1 = HOPc(s, -1);
3688         else
3689             last1 = s - 1;      /* bogus */
3690
3691         /* XXXX check_substr already used to find "s", can optimize if
3692            check_substr==must. */
3693         dontbother = 0;
3694         strend = HOPc(strend, -dontbother);
3695         while ( (s <= last) &&
3696                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3697                                   (unsigned char*)strend, must,
3698                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3699             DEBUG_EXECUTE_r( did_match = 1 );
3700             if (HOPc(s, -back_max) > last1) {
3701                 last1 = HOPc(s, -back_min);
3702                 s = HOPc(s, -back_max);
3703             }
3704             else {
3705                 char * const t = (last1 >= reginfo->strbeg)
3706                                     ? HOPc(last1, 1) : last1 + 1;
3707
3708                 last1 = HOPc(s, -back_min);
3709                 s = t;
3710             }
3711             if (utf8_target) {
3712                 while (s <= last1) {
3713                     if (regtry(reginfo, &s))
3714                         goto got_it;
3715                     if (s >= last1) {
3716                         s++; /* to break out of outer loop */
3717                         break;
3718                     }
3719                     s += UTF8SKIP(s);
3720                 }
3721             }
3722             else {
3723                 while (s <= last1) {
3724                     if (regtry(reginfo, &s))
3725                         goto got_it;
3726                     s++;
3727                 }
3728             }
3729         }
3730         DEBUG_EXECUTE_r(if (!did_match) {
3731             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3732                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3733             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3734                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3735                                ? "anchored" : "floating"),
3736                 quoted, RE_SV_TAIL(must));
3737         });
3738         goto phooey;
3739     }
3740     else if ( (c = progi->regstclass) ) {
3741         if (minlen) {
3742             const OPCODE op = OP(progi->regstclass);
3743             /* don't bother with what can't match */
3744             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3745                 strend = HOPc(strend, -(minlen - 1));
3746         }
3747         DEBUG_EXECUTE_r({
3748             SV * const prop = sv_newmortal();
3749             regprop(prog, prop, c, reginfo, NULL);
3750             {
3751                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3752                     s,strend-s,PL_dump_re_max_len);
3753                 Perl_re_printf( aTHX_
3754                     "Matching stclass %.*s against %s (%d bytes)\n",
3755                     (int)SvCUR(prop), SvPVX_const(prop),
3756                      quoted, (int)(strend - s));
3757             }
3758         });
3759         if (find_byclass(prog, c, s, strend, reginfo))
3760             goto got_it;
3761         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3762     }
3763     else {
3764         dontbother = 0;
3765         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3766             /* Trim the end. */
3767             char *last= NULL;
3768             SV* float_real;
3769             STRLEN len;
3770             const char *little;
3771
3772             if (utf8_target) {
3773                 if (! prog->float_utf8) {
3774                     to_utf8_substr(prog);
3775                 }
3776                 float_real = prog->float_utf8;
3777             }
3778             else {
3779                 if (! prog->float_substr) {
3780                     if (! to_byte_substr(prog)) {
3781                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3782                     }
3783                 }
3784                 float_real = prog->float_substr;
3785             }
3786
3787             little = SvPV_const(float_real, len);
3788             if (SvTAIL(float_real)) {
3789                     /* This means that float_real contains an artificial \n on
3790                      * the end due to the presence of something like this:
3791                      * /foo$/ where we can match both "foo" and "foo\n" at the
3792                      * end of the string.  So we have to compare the end of the
3793                      * string first against the float_real without the \n and
3794                      * then against the full float_real with the string.  We
3795                      * have to watch out for cases where the string might be
3796                      * smaller than the float_real or the float_real without
3797                      * the \n. */
3798                     char *checkpos= strend - len;
3799                     DEBUG_OPTIMISE_r(
3800                         Perl_re_printf( aTHX_
3801                             "%sChecking for float_real.%s\n",
3802                             PL_colors[4], PL_colors[5]));
3803                     if (checkpos + 1 < strbeg) {
3804                         /* can't match, even if we remove the trailing \n
3805                          * string is too short to match */
3806                         DEBUG_EXECUTE_r(
3807                             Perl_re_printf( aTHX_
3808                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3809                                 PL_colors[4], PL_colors[5]));
3810                         goto phooey;
3811                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3812                         /* can match, the end of the string matches without the
3813                          * "\n" */
3814                         last = checkpos + 1;
3815                     } else if (checkpos < strbeg) {
3816                         /* cant match, string is too short when the "\n" is
3817                          * included */
3818                         DEBUG_EXECUTE_r(
3819                             Perl_re_printf( aTHX_
3820                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3821                                 PL_colors[4], PL_colors[5]));
3822                         goto phooey;
3823                     } else if (!multiline) {
3824                         /* non multiline match, so compare with the "\n" at the
3825                          * end of the string */
3826                         if (memEQ(checkpos, little, len)) {
3827                             last= checkpos;
3828                         } else {
3829                             DEBUG_EXECUTE_r(
3830                                 Perl_re_printf( aTHX_
3831                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3832                                     PL_colors[4], PL_colors[5]));
3833                             goto phooey;
3834                         }
3835                     } else {
3836                         /* multiline match, so we have to search for a place
3837                          * where the full string is located */
3838                         goto find_last;
3839                     }
3840             } else {
3841                   find_last:
3842                     if (len)
3843                         last = rninstr(s, strend, little, little + len);
3844                     else
3845                         last = strend;  /* matching "$" */
3846             }
3847             if (!last) {
3848                 /* at one point this block contained a comment which was
3849                  * probably incorrect, which said that this was a "should not
3850                  * happen" case.  Even if it was true when it was written I am
3851                  * pretty sure it is not anymore, so I have removed the comment
3852                  * and replaced it with this one. Yves */
3853                 DEBUG_EXECUTE_r(
3854                     Perl_re_printf( aTHX_
3855                         "%sString does not contain required substring, cannot match.%s\n",
3856                         PL_colors[4], PL_colors[5]
3857                     ));
3858                 goto phooey;
3859             }
3860             dontbother = strend - last + prog->float_min_offset;
3861         }
3862         if (minlen && (dontbother < minlen))
3863             dontbother = minlen - 1;
3864         strend -= dontbother;              /* this one's always in bytes! */
3865         /* We don't know much -- general case. */
3866         if (utf8_target) {
3867             for (;;) {
3868                 if (regtry(reginfo, &s))
3869                     goto got_it;
3870                 if (s >= strend)
3871                     break;
3872                 s += UTF8SKIP(s);
3873             };
3874         }
3875         else {
3876             do {
3877                 if (regtry(reginfo, &s))
3878                     goto got_it;
3879             } while (s++ < strend);
3880         }
3881     }
3882
3883     /* Failure. */
3884     goto phooey;
3885
3886   got_it:
3887     /* s/// doesn't like it if $& is earlier than where we asked it to
3888      * start searching (which can happen on something like /.\G/) */
3889     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3890             && (prog->offs[0].start < stringarg - strbeg))
3891     {
3892         /* this should only be possible under \G */
3893         assert(prog->intflags & PREGf_GPOS_SEEN);
3894         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3895             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3896         goto phooey;
3897     }
3898
3899     DEBUG_BUFFERS_r(
3900         if (swap)
3901             Perl_re_exec_indentf( aTHX_
3902                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3903                 0,
3904                 PTR2UV(prog),
3905                 PTR2UV(swap)
3906             );
3907     );
3908     Safefree(swap);
3909
3910     /* clean up; this will trigger destructors that will free all slabs
3911      * above the current one, and cleanup the regmatch_info_aux
3912      * and regmatch_info_aux_eval sructs */
3913
3914     LEAVE_SCOPE(oldsave);
3915
3916     if (RXp_PAREN_NAMES(prog))
3917         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3918
3919     /* make sure $`, $&, $', and $digit will work later */
3920     if ( !(flags & REXEC_NOT_FIRST) )
3921         S_reg_set_capture_string(aTHX_ rx,
3922                                     strbeg, reginfo->strend,
3923                                     sv, flags, utf8_target);
3924
3925     return 1;
3926
3927   phooey:
3928     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3929                           PL_colors[4], PL_colors[5]));
3930
3931     /* clean up; this will trigger destructors that will free all slabs
3932      * above the current one, and cleanup the regmatch_info_aux
3933      * and regmatch_info_aux_eval sructs */
3934
3935     LEAVE_SCOPE(oldsave);
3936
3937     if (swap) {
3938         /* we failed :-( roll it back */
3939         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3940             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3941             0,
3942             PTR2UV(prog),
3943             PTR2UV(prog->offs),
3944             PTR2UV(swap)
3945         ));
3946         Safefree(prog->offs);
3947         prog->offs = swap;
3948     }
3949     return 0;
3950 }
3951
3952
3953 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3954  * Do inc before dec, in case old and new rex are the same */
3955 #define SET_reg_curpm(Re2)                          \
3956     if (reginfo->info_aux_eval) {                   \
3957         (void)ReREFCNT_inc(Re2);                    \
3958         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3959         PM_SETRE((PL_reg_curpm), (Re2));            \
3960     }
3961
3962
3963 /*
3964  - regtry - try match at specific point
3965  */
3966 STATIC bool                     /* 0 failure, 1 success */
3967 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3968 {
3969     CHECKPOINT lastcp;
3970     REGEXP *const rx = reginfo->prog;
3971     regexp *const prog = ReANY(rx);
3972     SSize_t result;
3973 #ifdef DEBUGGING
3974     U32 depth = 0; /* used by REGCP_SET */
3975 #endif
3976     RXi_GET_DECL(prog,progi);
3977     GET_RE_DEBUG_FLAGS_DECL;
3978
3979     PERL_ARGS_ASSERT_REGTRY;
3980
3981     reginfo->cutpoint=NULL;
3982
3983     prog->offs[0].start = *startposp - reginfo->strbeg;
3984     prog->lastparen = 0;
3985     prog->lastcloseparen = 0;
3986
3987     /* XXXX What this code is doing here?!!!  There should be no need
3988        to do this again and again, prog->lastparen should take care of
3989        this!  --ilya*/
3990
3991     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3992      * Actually, the code in regcppop() (which Ilya may be meaning by
3993      * prog->lastparen), is not needed at all by the test suite
3994      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3995      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3996      * Meanwhile, this code *is* needed for the
3997      * above-mentioned test suite tests to succeed.  The common theme
3998      * on those tests seems to be returning null fields from matches.
3999      * --jhi updated by dapm */
4000
4001     /* After encountering a variant of the issue mentioned above I think
4002      * the point Ilya was making is that if we properly unwind whenever
4003      * we set lastparen to a smaller value then we should not need to do
4004      * this every time, only when needed. So if we have tests that fail if
4005      * we remove this, then it suggests somewhere else we are improperly
4006      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4007      * places it is called, and related regcp() routines. - Yves */
4008 #if 1
4009     if (prog->nparens) {
4010         regexp_paren_pair *pp = prog->offs;
4011         I32 i;
4012         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4013             ++pp;
4014             pp->start = -1;
4015             pp->end = -1;
4016         }
4017     }
4018 #endif
4019     REGCP_SET(lastcp);
4020     result = regmatch(reginfo, *startposp, progi->program + 1);
4021     if (result != -1) {
4022         prog->offs[0].end = result;
4023         return 1;
4024     }
4025     if (reginfo->cutpoint)
4026         *startposp= reginfo->cutpoint;
4027     REGCP_UNWIND(lastcp);
4028     return 0;
4029 }
4030
4031
4032 #define sayYES goto yes
4033 #define sayNO goto no
4034 #define sayNO_SILENT goto no_silent
4035
4036 /* we dont use STMT_START/END here because it leads to
4037    "unreachable code" warnings, which are bogus, but distracting. */
4038 #define CACHEsayNO \
4039     if (ST.cache_mask) \
4040        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4041     sayNO
4042
4043 /* this is used to determine how far from the left messages like
4044    'failed...' are printed in regexec.c. It should be set such that
4045    messages are inline with the regop output that created them.
4046 */
4047 #define REPORT_CODE_OFF 29
4048 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4049 #ifdef DEBUGGING
4050 int
4051 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4052 {
4053     va_list ap;
4054     int result;
4055     PerlIO *f= Perl_debug_log;
4056     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4057     va_start(ap, depth);
4058     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4059     result = PerlIO_vprintf(f, fmt, ap);
4060     va_end(ap);
4061     return result;
4062 }
4063 #endif /* DEBUGGING */
4064
4065
4066 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4067 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4068 #define CHRTEST_NOT_A_CP_1 -999
4069 #define CHRTEST_NOT_A_CP_2 -998
4070
4071 /* grab a new slab and return the first slot in it */
4072
4073 STATIC regmatch_state *
4074 S_push_slab(pTHX)
4075 {
4076     regmatch_slab *s = PL_regmatch_slab->next;
4077     if (!s) {
4078         Newx(s, 1, regmatch_slab);
4079         s->prev = PL_regmatch_slab;
4080         s->next = NULL;
4081         PL_regmatch_slab->next = s;
4082     }
4083     PL_regmatch_slab = s;
4084     return SLAB_FIRST(s);
4085 }
4086
4087
4088 /* push a new state then goto it */
4089
4090 #define PUSH_STATE_GOTO(state, node, input) \
4091     pushinput = input; \
4092     scan = node; \
4093     st->resume_state = state; \
4094     goto push_state;
4095
4096 /* push a new state with success backtracking, then goto it */
4097
4098 #define PUSH_YES_STATE_GOTO(state, node, input) \
4099     pushinput = input; \
4100     scan = node; \
4101     st->resume_state = state; \
4102     goto push_yes_state;
4103
4104
4105
4106
4107 /*
4108
4109 regmatch() - main matching routine
4110
4111 This is basically one big switch statement in a loop. We execute an op,
4112 set 'next' to point the next op, and continue. If we come to a point which
4113 we may need to backtrack to on failure such as (A|B|C), we push a
4114 backtrack state onto the backtrack stack. On failure, we pop the top
4115 state, and re-enter the loop at the state indicated. If there are no more
4116 states to pop, we return failure.
4117
4118 Sometimes we also need to backtrack on success; for example /A+/, where
4119 after successfully matching one A, we need to go back and try to
4120 match another one; similarly for lookahead assertions: if the assertion
4121 completes successfully, we backtrack to the state just before the assertion
4122 and then carry on.  In these cases, the pushed state is marked as
4123 'backtrack on success too'. This marking is in fact done by a chain of
4124 pointers, each pointing to the previous 'yes' state. On success, we pop to
4125 the nearest yes state, discarding any intermediate failure-only states.
4126 Sometimes a yes state is pushed just to force some cleanup code to be
4127 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4128 it to free the inner regex.
4129
4130 Note that failure backtracking rewinds the cursor position, while
4131 success backtracking leaves it alone.
4132
4133 A pattern is complete when the END op is executed, while a subpattern
4134 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4135 ops trigger the "pop to last yes state if any, otherwise return true"
4136 behaviour.
4137
4138 A common convention in this function is to use A and B to refer to the two
4139 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4140 the subpattern to be matched possibly multiple times, while B is the entire
4141 rest of the pattern. Variable and state names reflect this convention.
4142
4143 The states in the main switch are the union of ops and failure/success of
4144 substates associated with with that op.  For example, IFMATCH is the op
4145 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4146 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4147 successfully matched A and IFMATCH_A_fail is a state saying that we have
4148 just failed to match A. Resume states always come in pairs. The backtrack
4149 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4150 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4151 on success or failure.
4152
4153 The struct that holds a backtracking state is actually a big union, with
4154 one variant for each major type of op. The variable st points to the
4155 top-most backtrack struct. To make the code clearer, within each
4156 block of code we #define ST to alias the relevant union.
4157
4158 Here's a concrete example of a (vastly oversimplified) IFMATCH
4159 implementation:
4160
4161     switch (state) {
4162     ....
4163
4164 #define ST st->u.ifmatch
4165
4166     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4167         ST.foo = ...; // some state we wish to save
4168         ...
4169         // push a yes backtrack state with a resume value of
4170         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4171         // first node of A:
4172         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4173         // NOTREACHED
4174
4175     case IFMATCH_A: // we have successfully executed A; now continue with B
4176         next = B;
4177         bar = ST.foo; // do something with the preserved value
4178         break;
4179
4180     case IFMATCH_A_fail: // A failed, so the assertion failed
4181         ...;   // do some housekeeping, then ...
4182         sayNO; // propagate the failure
4183
4184 #undef ST
4185
4186     ...
4187     }
4188
4189 For any old-timers reading this who are familiar with the old recursive
4190 approach, the code above is equivalent to:
4191
4192     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4193     {
4194         int foo = ...
4195         ...
4196         if (regmatch(A)) {
4197             next = B;
4198             bar = foo;
4199             break;
4200         }
4201         ...;   // do some housekeeping, then ...
4202         sayNO; // propagate the failure
4203     }
4204
4205 The topmost backtrack state, pointed to by st, is usually free. If you
4206 want to claim it, populate any ST.foo fields in it with values you wish to
4207 save, then do one of
4208
4209         PUSH_STATE_GOTO(resume_state, node, newinput);
4210         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4211
4212 which sets that backtrack state's resume value to 'resume_state', pushes a
4213 new free entry to the top of the backtrack stack, then goes to 'node'.
4214 On backtracking, the free slot is popped, and the saved state becomes the
4215 new free state. An ST.foo field in this new top state can be temporarily
4216 accessed to retrieve values, but once the main loop is re-entered, it
4217 becomes available for reuse.
4218
4219 Note that the depth of the backtrack stack constantly increases during the
4220 left-to-right execution of the pattern, rather than going up and down with
4221 the pattern nesting. For example the stack is at its maximum at Z at the
4222 end of the pattern, rather than at X in the following:
4223
4224     /(((X)+)+)+....(Y)+....Z/
4225
4226 The only exceptions to this are lookahead/behind assertions and the cut,
4227 (?>A), which pop all the backtrack states associated with A before
4228 continuing.
4229
4230 Backtrack state structs are allocated in slabs of about 4K in size.
4231 PL_regmatch_state and st always point to the currently active state,
4232 and PL_regmatch_slab points to the slab currently containing
4233 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4234 allocated, and is never freed until interpreter destruction. When the slab
4235 is full, a new one is allocated and chained to the end. At exit from
4236 regmatch(), slabs allocated since entry are freed.
4237
4238 */
4239
4240
4241 #define DEBUG_STATE_pp(pp)                                  \
4242     DEBUG_STATE_r({                                         \
4243         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4244         Perl_re_printf( aTHX_                                           \
4245             "%*s" pp " %s%s%s%s%s\n",                       \
4246             INDENT_CHARS(depth), "",                        \
4247             PL_reg_name[st->resume_state],                  \
4248             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4249             ((st==yes_state) ? "Y" : ""),                   \
4250             ((st==mark_state) ? "M" : ""),                  \
4251             ((st==yes_state||st==mark_state) ? "]" : "")    \
4252         );                                                  \
4253     });
4254
4255
4256 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4257
4258 #ifdef DEBUGGING
4259
4260 STATIC void
4261 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4262     const char *start, const char *end, const char *blurb)
4263 {
4264     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4265
4266     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4267
4268     if (!PL_colorset)
4269             reginitcolors();
4270     {
4271         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4272             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4273
4274         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4275             start, end - start, PL_dump_re_max_len);
4276
4277         Perl_re_printf( aTHX_
4278             "%s%s REx%s %s against %s\n",
4279                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4280
4281         if (utf8_target||utf8_pat)
4282             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4283                 utf8_pat ? "pattern" : "",
4284                 utf8_pat && utf8_target ? " and " : "",
4285                 utf8_target ? "string" : ""
4286             );
4287     }
4288 }
4289
4290 STATIC void
4291 S_dump_exec_pos(pTHX_ const char *locinput,
4292                       const regnode *scan,
4293                       const char *loc_regeol,
4294                       const char *loc_bostr,
4295                       const char *loc_reg_starttry,
4296                       const bool utf8_target,
4297                       const U32 depth
4298                 )
4299 {
4300     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4301     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4302     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4303     /* The part of the string before starttry has one color
4304        (pref0_len chars), between starttry and current
4305        position another one (pref_len - pref0_len chars),
4306        after the current position the third one.
4307        We assume that pref0_len <= pref_len, otherwise we
4308        decrease pref0_len.  */
4309     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4310         ? (5 + taill) - l : locinput - loc_bostr;
4311     int pref0_len;
4312
4313     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4314
4315     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4316         pref_len++;
4317     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4318     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4319         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4320               ? (5 + taill) - pref_len : loc_regeol - locinput);
4321     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4322         l--;
4323     if (pref0_len < 0)
4324         pref0_len = 0;
4325     if (pref0_len > pref_len)
4326         pref0_len = pref_len;
4327     {
4328         const int is_uni = utf8_target ? 1 : 0;
4329
4330         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4331             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4332
4333         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4334                     (locinput - pref_len + pref0_len),
4335                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4336
4337         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4338                     locinput, loc_regeol - locinput, 10, 0, 1);
4339
4340         const STRLEN tlen=len0+len1+len2;
4341         Perl_re_printf( aTHX_
4342                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4343                     (IV)(locinput - loc_bostr),
4344                     len0, s0,
4345                     len1, s1,
4346                     (docolor ? "" : "> <"),
4347                     len2, s2,
4348                     (int)(tlen > 19 ? 0 :  19 - tlen),
4349                     "",
4350                     depth);
4351     }
4352 }
4353
4354 #endif
4355
4356 /* reg_check_named_buff_matched()
4357  * Checks to see if a named buffer has matched. The data array of
4358  * buffer numbers corresponding to the buffer is expected to reside
4359  * in the regexp->data->data array in the slot stored in the ARG() of
4360  * node involved. Note that this routine doesn't actually care about the
4361  * name, that information is not preserved from compilation to execution.
4362  * Returns the index of the leftmost defined buffer with the given name
4363  * or 0 if non of the buffers matched.
4364  */
4365 STATIC I32
4366 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4367 {
4368     I32 n;
4369     RXi_GET_DECL(rex,rexi);
4370     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4371     I32 *nums=(I32*)SvPVX(sv_dat);
4372
4373     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4374
4375     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4376         if ((I32)rex->lastparen >= nums[n] &&
4377             rex->offs[nums[n]].end != -1)
4378         {
4379             return nums[n];
4380         }
4381     }
4382     return 0;
4383 }
4384
4385
4386 static bool
4387 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4388         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4389 {
4390     /* This function determines if there are one or two characters that match
4391      * the first character of the passed-in EXACTish node <text_node>, and if
4392      * so, returns them in the passed-in pointers.
4393      *
4394      * If it determines that no possible character in the target string can
4395      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4396      * the first character in <text_node> requires UTF-8 to represent, and the
4397      * target string isn't in UTF-8.)
4398      *
4399      * If there are more than two characters that could match the beginning of
4400      * <text_node>, or if more context is required to determine a match or not,
4401      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4402      *
4403      * The motiviation behind this function is to allow the caller to set up
4404      * tight loops for matching.  If <text_node> is of type EXACT, there is
4405      * only one possible character that can match its first character, and so
4406      * the situation is quite simple.  But things get much more complicated if
4407      * folding is involved.  It may be that the first character of an EXACTFish
4408      * node doesn't participate in any possible fold, e.g., punctuation, so it
4409      * can be matched only by itself.  The vast majority of characters that are
4410      * in folds match just two things, their lower and upper-case equivalents.
4411      * But not all are like that; some have multiple possible matches, or match
4412      * sequences of more than one character.  This function sorts all that out.
4413      *
4414      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4415      * loop of trying to match A*, we know we can't exit where the thing
4416      * following it isn't a B.  And something can't be a B unless it is the
4417      * beginning of B.  By putting a quick test for that beginning in a tight
4418      * loop, we can rule out things that can't possibly be B without having to
4419      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4420      * character, we can make a tight loop matching A*, using the outputs of
4421      * this function.
4422      *
4423      * If the target string to match isn't in UTF-8, and there aren't
4424      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4425      * the one or two possible octets (which are characters in this situation)
4426      * that can match.  In all cases, if there is only one character that can
4427      * match, *<c1p> and *<c2p> will be identical.
4428      *
4429      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4430      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4431      * can match the beginning of <text_node>.  They should be declared with at
4432      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4433      * undefined what these contain.)  If one or both of the buffers are
4434      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4435      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4436      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4437      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4438      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4439
4440     const bool utf8_target = reginfo->is_utf8_target;
4441
4442     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4443     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4444     bool use_chrtest_void = FALSE;
4445     const bool is_utf8_pat = reginfo->is_utf8_pat;
4446
4447     /* Used when we have both utf8 input and utf8 output, to avoid converting
4448      * to/from code points */
4449     bool utf8_has_been_setup = FALSE;
4450
4451     dVAR;
4452
4453     U8 *pat = (U8*)STRING(text_node);
4454     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4455
4456     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4457
4458         /* In an exact node, only one thing can be matched, that first
4459          * character.  If both the pat and the target are UTF-8, we can just
4460          * copy the input to the output, avoiding finding the code point of
4461          * that character */
4462         if (!is_utf8_pat) {
4463             c2 = c1 = *pat;
4464         }
4465         else if (utf8_target) {
4466             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4467             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4468             utf8_has_been_setup = TRUE;
4469         }
4470         else {
4471             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4472         }
4473     }
4474     else { /* an EXACTFish node */
4475         U8 *pat_end = pat + STR_LEN(text_node);
4476
4477         /* An EXACTFL node has at least some characters unfolded, because what
4478          * they match is not known until now.  So, now is the time to fold
4479          * the first few of them, as many as are needed to determine 'c1' and
4480          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4481          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4482          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4483          * need to fold as many characters as a single character can fold to,
4484          * so that later we can check if the first ones are such a multi-char
4485          * fold.  But, in such a pattern only locale-problematic characters
4486          * aren't folded, so we can skip this completely if the first character
4487          * in the node isn't one of the tricky ones */
4488         if (OP(text_node) == EXACTFL) {
4489
4490             if (! is_utf8_pat) {
4491                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4492                 {
4493                     folded[0] = folded[1] = 's';
4494                     pat = folded;
4495                     pat_end = folded + 2;
4496                 }
4497             }
4498             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4499                 U8 *s = pat;
4500                 U8 *d = folded;
4501                 int i;
4502
4503                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4504                     if (isASCII(*s)) {
4505                         *(d++) = (U8) toFOLD_LC(*s);
4506                         s++;
4507                     }
4508                     else {
4509                         STRLEN len;
4510                         _toFOLD_utf8_flags(s,
4511                                            pat_end,
4512                                            d,
4513                                            &len,
4514                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4515                         d += len;
4516                         s += UTF8SKIP(s);
4517                     }
4518                 }
4519
4520                 pat = folded;
4521                 pat_end = d;
4522             }
4523         }
4524
4525         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4526              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4527         {
4528             /* Multi-character folds require more context to sort out.  Also
4529              * PL_utf8_foldclosures used below doesn't handle them, so have to
4530              * be handled outside this routine */
4531             use_chrtest_void = TRUE;
4532         }
4533         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4534             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4535             if (c1 > 255) {
4536                 const unsigned int * remaining_folds_to_list;
4537                 unsigned int first_folds_to;
4538
4539                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4540                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4541                 Size_t folds_to_count = _inverse_folds(c1,
4542                                                      &first_folds_to,
4543                                                      &remaining_folds_to_list);
4544                 if (folds_to_count == 0) {
4545                     c2 = c1;    /* there is only a single character that could
4546                                    match */
4547                 }
4548                 else if (folds_to_count != 1) {
4549                     /* If there aren't exactly two folds to this (itself and
4550                      * another), it is outside the scope of this function */
4551                     use_chrtest_void = TRUE;
4552                 }
4553                 else {  /* There are two.  We already have one, get the other */
4554                     c2 = first_folds_to;
4555
4556                     /* Folds that cross the 255/256 boundary are forbidden if
4557                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4558                      * ASCIII.  The only other match to c1 is c2, and since c1
4559                      * is above 255, c2 better be as well under these
4560                      * circumstances.  If it isn't, it means the only legal
4561                      * match of c1 is itself. */
4562                     if (    c2 < 256
4563                         && (   (   OP(text_node) == EXACTFL
4564                                 && ! IN_UTF8_CTYPE_LOCALE)
4565                             || ((     OP(text_node) == EXACTFAA
4566                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4567                                 && (isASCII(c1) || isASCII(c2)))))
4568                     {
4569                         c2 = c1;
4570                     }
4571                 }
4572             }
4573             else /* Here, c1 is <= 255 */
4574                 if (utf8_target
4575                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4576                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4577                     && ((OP(text_node) != EXACTFAA
4578                         && OP(text_node) != EXACTFAA_NO_TRIE)
4579                         || ! isASCII(c1)))
4580             {
4581                 /* Here, there could be something above Latin1 in the target
4582                  * which folds to this character in the pattern.  All such
4583                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4584                  * than two characters involved in their folds, so are outside
4585                  * the scope of this function */
4586                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4587                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4588                 }
4589                 else {
4590                     use_chrtest_void = TRUE;
4591                 }
4592             }
4593             else { /* Here nothing above Latin1 can fold to the pattern
4594                       character */
4595                 switch (OP(text_node)) {
4596
4597                     case EXACTFL:   /* /l rules */
4598                         c2 = PL_fold_locale[c1];
4599                         break;
4600
4601                     case EXACTF:   /* This node only generated for non-utf8
4602                                     patterns */
4603                         assert(! is_utf8_pat);
4604                         if (! utf8_target) {    /* /d rules */
4605                             c2 = PL_fold[c1];
4606                             break;
4607                         }
4608                         /* FALLTHROUGH */
4609                         /* /u rules for all these.  This happens to work for
4610                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4611                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4612                                                 non-utf8 patterns */
4613                         assert(! is_utf8_pat);
4614                         /* FALLTHROUGH */
4615                     case EXACTFAA:
4616                     case EXACTFU_SS:
4617                     case EXACTFU:
4618                         c2 = PL_fold_latin1[c1];
4619                         break;
4620
4621                     default:
4622                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4623                         NOT_REACHED; /* NOTREACHED */
4624                 }
4625             }
4626         }
4627     }
4628
4629     /* Here have figured things out.  Set up the returns */
4630     if (use_chrtest_void) {
4631         *c2p = *c1p = CHRTEST_VOID;
4632     }
4633     else if (utf8_target) {
4634         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4635             uvchr_to_utf8(c1_utf8, c1);
4636             uvchr_to_utf8(c2_utf8, c2);
4637         }
4638
4639         /* Invariants are stored in both the utf8 and byte outputs; Use
4640          * negative numbers otherwise for the byte ones.  Make sure that the
4641          * byte ones are the same iff the utf8 ones are the same */
4642         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4643         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4644                 ? *c2_utf8
4645                 : (c1 == c2)
4646                   ? CHRTEST_NOT_A_CP_1
4647                   : CHRTEST_NOT_A_CP_2;
4648     }
4649     else if (c1 > 255) {
4650        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4651                            can represent */
4652            return FALSE;
4653        }
4654
4655        *c1p = *c2p = c2;    /* c2 is the only representable value */
4656     }
4657     else {  /* c1 is representable; see about c2 */
4658        *c1p = c1;
4659        *c2p = (c2 < 256) ? c2 : c1;
4660     }
4661
4662     return TRUE;
4663 }
4664
4665 STATIC bool
4666 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4667 {
4668     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4669      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4670
4671     PERL_ARGS_ASSERT_ISGCB;
4672
4673     switch (GCB_table[before][after]) {
4674         case GCB_BREAKABLE:
4675             return TRUE;
4676
4677         case GCB_NOBREAK:
4678             return FALSE;
4679
4680         case GCB_RI_then_RI:
4681             {
4682                 int RI_count = 1;
4683                 U8 * temp_pos = (U8 *) curpos;
4684
4685                 /* Do not break within emoji flag sequences. That is, do not
4686                  * break between regional indicator (RI) symbols if there is an
4687                  * odd number of RI characters before the break point.
4688                  *  GB12   sot (RI RI)* RI × RI
4689                  *  GB13 [^RI] (RI RI)* RI × RI */
4690
4691                 while (backup_one_GCB(strbeg,
4692                                     &temp_pos,
4693                                     utf8_target) == GCB_Regional_Indicator)
4694                 {
4695                     RI_count++;
4696                 }
4697
4698                 return RI_count % 2 != 1;
4699             }
4700
4701         case GCB_EX_then_EM:
4702
4703             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4704             {
4705                 U8 * temp_pos = (U8 *) curpos;
4706                 GCB_enum prev;
4707
4708                 do {
4709                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4710                 }
4711                 while (prev == GCB_Extend);
4712
4713                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4714             }
4715
4716         case GCB_Maybe_Emoji_NonBreak:
4717
4718             {
4719
4720             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4721               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4722               */
4723                 U8 * temp_pos = (U8 *) curpos;
4724                 GCB_enum prev;
4725
4726                 do {
4727                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4728                 }
4729                 while (prev == GCB_Extend);
4730
4731                 return prev != GCB_XPG_XX;
4732             }
4733
4734         default:
4735             break;
4736     }
4737
4738 #ifdef DEBUGGING
4739     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4740                                   before, after, GCB_table[before][after]);
4741     assert(0);
4742 #endif
4743     return TRUE;
4744 }
4745
4746 STATIC GCB_enum
4747 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4748 {
4749     GCB_enum gcb;
4750
4751     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4752
4753     if (*curpos < strbeg) {
4754         return GCB_EDGE;
4755     }
4756
4757     if (utf8_target) {
4758         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4759         U8 * prev_prev_char_pos;
4760
4761         if (! prev_char_pos) {
4762             return GCB_EDGE;
4763         }
4764
4765         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4766             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4767             *curpos = prev_char_pos;
4768             prev_char_pos = prev_prev_char_pos;
4769         }
4770         else {
4771             *curpos = (U8 *) strbeg;
4772             return GCB_EDGE;
4773         }
4774     }
4775     else {
4776         if (*curpos - 2 < strbeg) {
4777             *curpos = (U8 *) strbeg;
4778             return GCB_EDGE;
4779         }
4780         (*curpos)--;
4781         gcb = getGCB_VAL_CP(*(*curpos - 1));
4782     }
4783
4784     return gcb;
4785 }
4786
4787 /* Combining marks attach to most classes that precede them, but this defines
4788  * the exceptions (from TR14) */
4789 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4790                                      || prev == LB_Mandatory_Break      \
4791                                      || prev == LB_Carriage_Return      \
4792                                      || prev == LB_Line_Feed            \
4793                                      || prev == LB_Next_Line            \
4794                                      || prev == LB_Space                \
4795                                      || prev == LB_ZWSpace))
4796
4797 STATIC bool
4798 S_isLB(pTHX_ LB_enum before,
4799              LB_enum after,
4800              const U8 * const strbeg,
4801              const U8 * const curpos,
4802              const U8 * const strend,
4803              const bool utf8_target)
4804 {
4805     U8 * temp_pos = (U8 *) curpos;
4806     LB_enum prev = before;
4807
4808     /* Is the boundary between 'before' and 'after' line-breakable?
4809      * Most of this is just a table lookup of a generated table from Unicode
4810      * rules.  But some rules require context to decide, and so have to be
4811      * implemented in code */
4812
4813     PERL_ARGS_ASSERT_ISLB;
4814
4815     /* Rule numbers in the comments below are as of Unicode 9.0 */
4816
4817   redo:
4818     before = prev;
4819     switch (LB_table[before][after]) {
4820         case LB_BREAKABLE:
4821             return TRUE;
4822
4823         case LB_NOBREAK:
4824         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4825             return FALSE;
4826
4827         case LB_SP_foo + LB_BREAKABLE:
4828         case LB_SP_foo + LB_NOBREAK:
4829         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4830
4831             /* When we have something following a SP, we have to look at the
4832              * context in order to know what to do.
4833              *
4834              * SP SP should not reach here because LB7: Do not break before
4835              * spaces.  (For two spaces in a row there is nothing that
4836              * overrides that) */
4837             assert(after != LB_Space);
4838
4839             /* Here we have a space followed by a non-space.  Mostly this is a
4840              * case of LB18: "Break after spaces".  But there are complications
4841              * as the handling of spaces is somewhat tricky.  They are in a
4842              * number of rules, which have to be applied in priority order, but
4843              * something earlier in the string can cause a rule to be skipped
4844              * and a lower priority rule invoked.  A prime example is LB7 which
4845              * says don't break before a space.  But rule LB8 (lower priority)
4846              * says that the first break opportunity after a ZW is after any
4847              * span of spaces immediately after it.  If a ZW comes before a SP
4848              * in the input, rule LB8 applies, and not LB7.  Other such rules
4849              * involve combining marks which are rules 9 and 10, but they may
4850              * override higher priority rules if they come earlier in the
4851              * string.  Since we're doing random access into the middle of the
4852              * string, we have to look for rules that should get applied based
4853              * on both string position and priority.  Combining marks do not
4854              * attach to either ZW nor SP, so we don't have to consider them
4855              * until later.
4856              *
4857              * To check for LB8, we have to find the first non-space character
4858              * before this span of spaces */
4859             do {
4860                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4861             }
4862             while (prev == LB_Space);
4863
4864             /* LB8 Break before any character following a zero-width space,
4865              * even if one or more spaces intervene.
4866              *      ZW SP* ÷
4867              * So if we have a ZW just before this span, and to get here this
4868              * is the final space in the span. */
4869             if (prev == LB_ZWSpace) {
4870                 return TRUE;
4871             }
4872
4873             /* Here, not ZW SP+.  There are several rules that have higher
4874              * priority than LB18 and can be resolved now, as they don't depend
4875              * on anything earlier in the string (except ZW, which we have
4876              * already handled).  One of these rules is LB11 Do not break
4877              * before Word joiner, but we have specially encoded that in the
4878              * lookup table so it is caught by the single test below which
4879              * catches the other ones. */
4880             if (LB_table[LB_Space][after] - LB_SP_foo
4881                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4882             {
4883                 return FALSE;
4884             }
4885
4886             /* If we get here, we have to XXX consider combining marks. */
4887             if (prev == LB_Combining_Mark) {
4888
4889                 /* What happens with these depends on the character they
4890                  * follow.  */
4891                 do {
4892                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4893                 }
4894                 while (prev == LB_Combining_Mark);
4895
4896                 /* Most times these attach to and inherit the characteristics
4897                  * of that character, but not always, and when not, they are to
4898                  * be treated as AL by rule LB10. */
4899                 if (! LB_CM_ATTACHES_TO(prev)) {
4900                     prev = LB_Alphabetic;
4901                 }
4902             }
4903
4904             /* Here, we have the character preceding the span of spaces all set
4905              * up.  We follow LB18: "Break after spaces" unless the table shows
4906              * that is overriden */
4907             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4908
4909         case LB_CM_ZWJ_foo:
4910
4911             /* We don't know how to treat the CM except by looking at the first
4912              * non-CM character preceding it.  ZWJ is treated as CM */
4913             do {
4914                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4915             }
4916             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4917
4918             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4919              * attatches to it, then it inherits the behavior of 'prev'.  If it
4920              * doesn't attach, it is to be treated as an AL */
4921             if (! LB_CM_ATTACHES_TO(prev)) {
4922                 prev = LB_Alphabetic;
4923             }
4924
4925             goto redo;
4926
4927         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4928         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4929
4930             /* LB21a Don't break after Hebrew + Hyphen.
4931              * HL (HY | BA) × */
4932
4933             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4934                                                           == LB_Hebrew_Letter)
4935             {
4936                 return FALSE;
4937             }
4938
4939             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4940
4941         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4942         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4943
4944             /* LB25a (PR | PO) × ( OP | HY )? NU */
4945             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4946                 return FALSE;
4947             }
4948
4949             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4950                                                                 == LB_BREAKABLE;
4951
4952         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4953         case LB_SY_or_IS_then_various + LB_NOBREAK:
4954         {
4955             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4956
4957             LB_enum temp = prev;
4958             do {
4959                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4960             }
4961             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4962             if (temp == LB_Numeric) {
4963                 return FALSE;
4964             }
4965
4966             return LB_table[prev][after] - LB_SY_or_IS_then_various
4967                                                                == LB_BREAKABLE;
4968         }
4969
4970         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4971         case LB_various_then_PO_or_PR + LB_NOBREAK:
4972         {
4973             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4974
4975             LB_enum temp = prev;
4976             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4977             {
4978                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4979             }
4980             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4981                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4982             }
4983             if (temp == LB_Numeric) {
4984                 return FALSE;
4985             }
4986             return LB_various_then_PO_or_PR;
4987         }
4988
4989         case LB_RI_then_RI + LB_NOBREAK:
4990         case LB_RI_then_RI + LB_BREAKABLE:
4991             {
4992                 int RI_count = 1;
4993
4994                 /* LB30a Break between two regional indicator symbols if and
4995                  * only if there are an even number of regional indicators
4996                  * preceding the position of the break.
4997                  *
4998                  *    sot (RI RI)* RI × RI
4999                  *  [^RI] (RI RI)* RI × RI */
5000
5001                 while (backup_one_LB(strbeg,
5002                                      &temp_pos,
5003                                      utf8_target) == LB_Regional_Indicator)
5004                 {
5005                     RI_count++;
5006                 }
5007
5008                 return RI_count % 2 == 0;
5009             }
5010
5011         default:
5012             break;
5013     }
5014
5015 #ifdef DEBUGGING
5016     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5017                                   before, after, LB_table[before][after]);
5018     assert(0);
5019 #endif
5020     return TRUE;
5021 }
5022
5023 STATIC LB_enum
5024 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5025 {
5026     LB_enum lb;
5027
5028     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5029
5030     if (*curpos >= strend) {
5031         return LB_EDGE;
5032     }
5033
5034     if (utf8_target) {
5035         *curpos += UTF8SKIP(*curpos);
5036         if (*curpos >= strend) {
5037             return LB_EDGE;
5038         }
5039         lb = getLB_VAL_UTF8(*curpos, strend);
5040     }
5041     else {
5042         (*curpos)++;
5043         if (*curpos >= strend) {
5044             return LB_EDGE;
5045         }
5046         lb = getLB_VAL_CP(**curpos);
5047     }
5048
5049     return lb;
5050 }
5051
5052 STATIC LB_enum
5053 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5054 {
5055     LB_enum lb;
5056
5057     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5058
5059     if (*curpos < strbeg) {
5060         return LB_EDGE;
5061     }
5062
5063     if (utf8_target) {
5064         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5065         U8 * prev_prev_char_pos;
5066
5067         if (! prev_char_pos) {
5068             return LB_EDGE;
5069         }
5070
5071         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5072             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5073             *curpos = prev_char_pos;
5074             prev_char_pos = prev_prev_char_pos;
5075         }
5076         else {
5077             *curpos = (U8 *) strbeg;
5078             return LB_EDGE;
5079         }
5080     }
5081     else {
5082         if (*curpos - 2 < strbeg) {
5083             *curpos = (U8 *) strbeg;
5084             return LB_EDGE;
5085         }
5086         (*curpos)--;
5087         lb = getLB_VAL_CP(*(*curpos - 1));
5088     }
5089
5090     return lb;
5091 }
5092
5093 STATIC bool
5094 S_isSB(pTHX_ SB_enum before,
5095              SB_enum after,
5096              const U8 * const strbeg,
5097              const U8 * const curpos,
5098              const U8 * const strend,
5099              const bool utf8_target)
5100 {
5101     /* returns a boolean indicating if there is a Sentence Boundary Break
5102      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
5103
5104     U8 * lpos = (U8 *) curpos;
5105     bool has_para_sep = FALSE;
5106     bool has_sp = FALSE;
5107
5108     PERL_ARGS_ASSERT_ISSB;
5109
5110     /* Break at the start and end of text.
5111         SB1.  sot  ÷
5112         SB2.  ÷  eot
5113       But unstated in Unicode is don't break if the text is empty */
5114     if (before == SB_EDGE || after == SB_EDGE) {
5115         return before != after;
5116     }
5117
5118     /* SB 3: Do not break within CRLF. */
5119     if (before == SB_CR && after == SB_LF) {
5120         return FALSE;
5121     }
5122
5123     /* Break after paragraph separators.  CR and LF are considered
5124      * so because Unicode views text as like word processing text where there
5125      * are no newlines except between paragraphs, and the word processor takes
5126      * care of wrapping without there being hard line-breaks in the text *./
5127        SB4.  Sep | CR | LF  ÷ */
5128     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5129         return TRUE;
5130     }
5131
5132     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5133      * (See Section 6.2, Replacing Ignore Rules.)
5134         SB5.  X (Extend | Format)*  →  X */
5135     if (after == SB_Extend || after == SB_Format) {
5136
5137         /* Implied is that the these characters attach to everything
5138          * immediately prior to them except for those separator-type
5139          * characters.  And the rules earlier have already handled the case
5140          * when one of those immediately precedes the extend char */
5141         return FALSE;
5142     }
5143
5144     if (before == SB_Extend || before == SB_Format) {
5145         U8 * temp_pos = lpos;
5146         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5147         if (   backup != SB_EDGE
5148             && backup != SB_Sep
5149             && backup != SB_CR
5150             && backup != SB_LF)
5151         {
5152             before = backup;
5153             lpos = temp_pos;
5154         }
5155
5156         /* Here, both 'before' and 'backup' are these types; implied is that we
5157          * don't break between them */
5158         if (backup == SB_Extend || backup == SB_Format) {
5159             return FALSE;
5160         }
5161     }
5162
5163     /* Do not break after ambiguous terminators like period, if they are
5164      * immediately followed by a number or lowercase letter, if they are
5165      * between uppercase letters, if the first following letter (optionally
5166      * after certain punctuation) is lowercase, or if they are followed by
5167      * "continuation" punctuation such as comma, colon, or semicolon. For
5168      * example, a period may be an abbreviation or numeric period, and thus may
5169      * not mark the end of a sentence.
5170
5171      * SB6. ATerm  ×  Numeric */
5172     if (before == SB_ATerm && after == SB_Numeric) {
5173         return FALSE;
5174     }
5175
5176     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5177     if (before == SB_ATerm && after == SB_Upper) {
5178         U8 * temp_pos = lpos;
5179         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5180         if (backup == SB_Upper || backup == SB_Lower) {
5181             return FALSE;
5182         }
5183     }
5184
5185     /* The remaining rules that aren't the final one, all require an STerm or
5186      * an ATerm after having backed up over some Close* Sp*, and in one case an
5187      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5188      * So do that backup now, setting flags if either Sp or a paragraph
5189      * separator are found */
5190
5191     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5192         has_para_sep = TRUE;
5193         before = backup_one_SB(strbeg, &lpos, utf8_target);
5194     }
5195
5196     if (before == SB_Sp) {
5197         has_sp = TRUE;
5198         do {
5199             before = backup_one_SB(strbeg, &lpos, utf8_target);
5200         }
5201         while (before == SB_Sp);
5202     }
5203
5204     while (before == SB_Close) {
5205         before = backup_one_SB(strbeg, &lpos, utf8_target);
5206     }
5207
5208     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5209      * most cases an STerm */
5210     if (before == SB_STerm || before == SB_ATerm) {
5211
5212         /* So, here the lhs matches
5213          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5214          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5215          * The rules that apply here are:
5216          *
5217          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5218                                            | LF | STerm | ATerm) )* Lower
5219            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5220            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5221            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5222            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5223          */
5224
5225         /* And all but SB11 forbid having seen a paragraph separator */
5226         if (! has_para_sep) {
5227             if (before == SB_ATerm) {          /* SB8 */
5228                 U8 * rpos = (U8 *) curpos;
5229                 SB_enum later = after;
5230
5231                 while (    later != SB_OLetter
5232                         && later != SB_Upper
5233                         && later != SB_Lower
5234                         && later != SB_Sep
5235                         && later != SB_CR
5236                         && later != SB_LF
5237                         && later != SB_STerm
5238                         && later != SB_ATerm
5239                         && later != SB_EDGE)
5240                 {
5241                     later = advance_one_SB(&rpos, strend, utf8_target);
5242                 }
5243                 if (later == SB_Lower) {
5244                     return FALSE;
5245                 }
5246             }
5247
5248             if (   after == SB_SContinue    /* SB8a */
5249                 || after == SB_STerm
5250                 || after == SB_ATerm)
5251             {
5252                 return FALSE;
5253             }
5254
5255             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5256                 if (   after == SB_Close
5257                     || after == SB_Sp
5258                     || after == SB_Sep
5259                     || after == SB_CR
5260                     || after == SB_LF)
5261                 {
5262                     return FALSE;
5263                 }
5264             }
5265
5266             /* SB10.  This and SB9 could probably be combined some way, but khw
5267              * has decided to follow the Unicode rule book precisely for
5268              * simplified maintenance */
5269             if (   after == SB_Sp
5270                 || after == SB_Sep
5271                 || after == SB_CR
5272                 || after == SB_LF)
5273             {
5274                 return FALSE;
5275             }
5276         }
5277
5278         /* SB11.  */
5279         return TRUE;
5280     }
5281
5282     /* Otherwise, do not break.
5283     SB12.  Any  ×  Any */
5284
5285     return FALSE;
5286 }
5287
5288 STATIC SB_enum
5289 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5290 {
5291     SB_enum sb;
5292
5293     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5294
5295     if (*curpos >= strend) {
5296         return SB_EDGE;
5297     }
5298
5299     if (utf8_target) {
5300         do {
5301             *curpos += UTF8SKIP(*curpos);
5302             if (*curpos >= strend) {
5303                 return SB_EDGE;
5304             }
5305             sb = getSB_VAL_UTF8(*curpos, strend);
5306         } while (sb == SB_Extend || sb == SB_Format);
5307     }
5308     else {
5309         do {
5310             (*curpos)++;
5311             if (*curpos >= strend) {
5312                 return SB_EDGE;
5313             }
5314             sb = getSB_VAL_CP(**curpos);
5315         } while (sb == SB_Extend || sb == SB_Format);
5316     }
5317
5318     return sb;
5319 }
5320
5321 STATIC SB_enum
5322 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5323 {
5324     SB_enum sb;
5325
5326     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5327
5328     if (*curpos < strbeg) {
5329         return SB_EDGE;
5330     }
5331
5332     if (utf8_target) {
5333         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5334         if (! prev_char_pos) {
5335             return SB_EDGE;
5336         }
5337
5338         /* Back up over Extend and Format.  curpos is always just to the right
5339          * of the characater whose value we are getting */
5340         do {
5341             U8 * prev_prev_char_pos;
5342             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5343                                                                       strbeg)))
5344             {
5345                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5346                 *curpos = prev_char_pos;
5347                 prev_char_pos = prev_prev_char_pos;
5348             }
5349             else {
5350                 *curpos = (U8 *) strbeg;
5351                 return SB_EDGE;
5352             }
5353         } while (sb == SB_Extend || sb == SB_Format);
5354     }
5355     else {
5356         do {
5357             if (*curpos - 2 < strbeg) {
5358                 *curpos = (U8 *) strbeg;
5359                 return SB_EDGE;
5360             }
5361             (*curpos)--;
5362             sb = getSB_VAL_CP(*(*curpos - 1));
5363         } while (sb == SB_Extend || sb == SB_Format);
5364     }
5365
5366     return sb;
5367 }
5368
5369 STATIC bool
5370 S_isWB(pTHX_ WB_enum previous,
5371              WB_enum before,
5372              WB_enum after,
5373              const U8 * const strbeg,
5374              const U8 * const curpos,
5375              const U8 * const strend,
5376              const bool utf8_target)
5377 {
5378     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5379      *  a Unicode word break, using their published algorithm, but tailored for
5380      *  Perl by treating spans of white space as one unit.  Context may be
5381      *  needed to make this determination.  If the value for the character
5382      *  before 'before' is known, it is passed as 'previous'; otherwise that
5383      *  should be set to WB_UNKNOWN.  The other input parameters give the
5384      *  boundaries and current position in the matching of the string.  That
5385      *  is, 'curpos' marks the position where the character whose wb value is
5386      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5387
5388     U8 * before_pos = (U8 *) curpos;
5389     U8 * after_pos = (U8 *) curpos;
5390     WB_enum prev = before;
5391     WB_enum next;
5392
5393     PERL_ARGS_ASSERT_ISWB;
5394
5395     /* Rule numbers in the comments below are as of Unicode 9.0 */
5396
5397   redo:
5398     before = prev;
5399     switch (WB_table[before][after]) {
5400         case WB_BREAKABLE:
5401             return TRUE;
5402
5403         case WB_NOBREAK:
5404             return FALSE;
5405
5406         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5407             next = advance_one_WB(&after_pos, strend, utf8_target,
5408                                  FALSE /* Don't skip Extend nor Format */ );
5409             /* A space immediately preceeding an Extend or Format is attached
5410              * to by them, and hence gets separated from previous spaces.
5411              * Otherwise don't break between horizontal white space */
5412             return next == WB_Extend || next == WB_Format;
5413
5414         /* WB4 Ignore Format and Extend characters, except when they appear at
5415          * the beginning of a region of text.  This code currently isn't
5416          * general purpose, but it works as the rules are currently and likely
5417          * to be laid out.  The reason it works is that when 'they appear at
5418          * the beginning of a region of text', the rule is to break before
5419          * them, just like any other character.  Therefore, the default rule
5420          * applies and we don't have to look in more depth.  Should this ever
5421          * change, we would have to have 2 'case' statements, like in the rules
5422          * below, and backup a single character (not spacing over the extend
5423          * ones) and then see if that is one of the region-end characters and
5424          * go from there */
5425         case WB_Ex_or_FO_or_ZWJ_then_foo:
5426             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5427             goto redo;
5428
5429         case WB_DQ_then_HL + WB_BREAKABLE:
5430         case WB_DQ_then_HL + WB_NOBREAK:
5431
5432             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5433
5434             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5435                                                             == WB_Hebrew_Letter)
5436             {
5437                 return FALSE;
5438             }
5439
5440              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5441
5442         case WB_HL_then_DQ + WB_BREAKABLE:
5443         case WB_HL_then_DQ + WB_NOBREAK:
5444
5445             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5446
5447             if (advance_one_WB(&after_pos, strend, utf8_target,
5448                                        TRUE /* Do skip Extend and Format */ )
5449                                                             == WB_Hebrew_Letter)
5450             {
5451                 return FALSE;
5452             }
5453
5454             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5455
5456         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5457         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5458
5459             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5460              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5461
5462             next = advance_one_WB(&after_pos, strend, utf8_target,
5463                                        TRUE /* Do skip Extend and Format */ );
5464
5465             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5466             {
5467                 return FALSE;
5468             }
5469
5470             return WB_table[before][after]
5471                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5472
5473         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5474         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5475
5476             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5477              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5478
5479             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5480             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5481             {
5482                 return FALSE;
5483             }
5484
5485             return WB_table[before][after]
5486                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5487
5488         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5489         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5490
5491             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5492              * */
5493
5494             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5495                                                             == WB_Numeric)
5496             {
5497                 return FALSE;
5498             }
5499
5500             return WB_table[before][after]
5501                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5502
5503         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5504         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5505
5506             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5507
5508             if (advance_one_WB(&after_pos, strend, utf8_target,
5509                                        TRUE /* Do skip Extend and Format */ )
5510                                                             == WB_Numeric)
5511             {
5512                 return FALSE;
5513             }
5514
5515             return WB_table[before][after]
5516                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5517
5518         case WB_RI_then_RI + WB_NOBREAK:
5519         case WB_RI_then_RI + WB_BREAKABLE:
5520             {
5521                 int RI_count = 1;
5522
5523                 /* Do not break within emoji flag sequences. That is, do not
5524                  * break between regional indicator (RI) symbols if there is an
5525                  * odd number of RI characters before the potential break
5526                  * point.
5527                  *
5528                  * WB15   sot (RI RI)* RI × RI
5529                  * WB16 [^RI] (RI RI)* RI × RI */
5530
5531                 while (backup_one_WB(&previous,
5532                                      strbeg,
5533                                      &before_pos,
5534                                      utf8_target) == WB_Regional_Indicator)
5535                 {
5536                     RI_count++;
5537                 }
5538
5539                 return RI_count % 2 != 1;
5540             }
5541
5542         default:
5543             break;
5544     }
5545
5546 #ifdef DEBUGGING
5547     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5548                                   before, after, WB_table[before][after]);
5549     assert(0);
5550 #endif
5551     return TRUE;
5552 }
5553
5554 STATIC WB_enum
5555 S_advance_one_WB(pTHX_ U8 ** curpos,
5556                        const U8 * const strend,
5557                        const bool utf8_target,
5558                        const bool skip_Extend_Format)
5559 {
5560     WB_enum wb;
5561
5562     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5563
5564     if (*curpos >= strend) {
5565         return WB_EDGE;
5566     }
5567
5568     if (utf8_target) {
5569
5570         /* Advance over Extend and Format */
5571         do {
5572             *curpos += UTF8SKIP(*curpos);
5573             if (*curpos >= strend) {
5574                 return WB_EDGE;
5575             }
5576             wb = getWB_VAL_UTF8(*curpos, strend);
5577         } while (    skip_Extend_Format
5578                  && (wb == WB_Extend || wb == WB_Format));
5579     }
5580     else {
5581         do {
5582             (*curpos)++;
5583             if (*curpos >= strend) {
5584                 return WB_EDGE;
5585             }
5586             wb = getWB_VAL_CP(**curpos);
5587         } while (    skip_Extend_Format
5588                  && (wb == WB_Extend || wb == WB_Format));
5589     }
5590
5591     return wb;
5592 }
5593
5594 STATIC WB_enum
5595 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5596 {
5597     WB_enum wb;
5598
5599     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5600
5601     /* If we know what the previous character's break value is, don't have
5602         * to look it up */
5603     if (*previous != WB_UNKNOWN) {
5604         wb = *previous;
5605
5606         /* But we need to move backwards by one */
5607         if (utf8_target) {
5608             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5609             if (! *curpos) {
5610                 *previous = WB_EDGE;
5611                 *curpos = (U8 *) strbeg;
5612             }
5613             else {
5614                 *previous = WB_UNKNOWN;
5615             }
5616         }
5617         else {
5618             (*curpos)--;
5619             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5620         }
5621
5622         /* And we always back up over these three types */
5623         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5624             return wb;
5625         }
5626     }
5627
5628     if (*curpos < strbeg) {
5629         return WB_EDGE;
5630     }
5631
5632     if (utf8_target) {
5633         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5634         if (! prev_char_pos) {
5635             return WB_EDGE;
5636         }
5637
5638         /* Back up over Extend and Format.  curpos is always just to the right
5639          * of the characater whose value we are getting */
5640         do {
5641             U8 * prev_prev_char_pos;
5642             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5643                                                    -1,
5644                                                    strbeg)))
5645             {
5646                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5647                 *curpos = prev_char_pos;
5648                 prev_char_pos = prev_prev_char_pos;
5649             }
5650             else {
5651                 *curpos = (U8 *) strbeg;
5652                 return WB_EDGE;
5653             }
5654         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5655     }
5656     else {
5657         do {
5658             if (*curpos - 2 < strbeg) {
5659                 *curpos = (U8 *) strbeg;
5660                 return WB_EDGE;
5661             }
5662             (*curpos)--;
5663             wb = getWB_VAL_CP(*(*curpos - 1));
5664         } while (wb == WB_Extend || wb == WB_Format);
5665     }
5666
5667     return wb;
5668 }
5669
5670 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5671 (                                                           \
5672     (   ( st )                                         ) && \
5673     (   ( st )->u.eval.close_paren                     ) && \
5674     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5675 )
5676
5677 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5678 (                                                           \
5679     (   ( st )                                         ) && \
5680     (   ( st )->u.eval.close_paren                     ) && \
5681     (   ( expr )                                       ) && \
5682     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5683 )
5684
5685
5686 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5687     (st)->u.eval.close_paren = ( (expr) + 1 )
5688
5689 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5690     (st)->u.eval.close_paren = 0
5691
5692 /* returns -1 on failure, $+[0] on success */
5693 STATIC SSize_t
5694 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5695 {
5696     dVAR;
5697     const bool utf8_target = reginfo->is_utf8_target;
5698     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5699     REGEXP *rex_sv = reginfo->prog;
5700     regexp *rex = ReANY(rex_sv);
5701     RXi_GET_DECL(rex,rexi);
5702     /* the current state. This is a cached copy of PL_regmatch_state */
5703     regmatch_state *st;
5704     /* cache heavy used fields of st in registers */
5705     regnode *scan;
5706     regnode *next;
5707     U32 n = 0;  /* general value; init to avoid compiler warning */
5708     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5709     SSize_t endref = 0; /* offset of end of backref when ln is start */
5710     char *locinput = startpos;
5711     char *pushinput; /* where to continue after a PUSH */
5712     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5713
5714     bool result = 0;        /* return value of S_regmatch */
5715     U32 depth = 0;            /* depth of backtrack stack */
5716     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5717     const U32 max_nochange_depth =
5718         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5719         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5720     regmatch_state *yes_state = NULL; /* state to pop to on success of
5721                                                             subpattern */
5722     /* mark_state piggy backs on the yes_state logic so that when we unwind
5723        the stack on success we can update the mark_state as we go */
5724     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5725     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5726     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5727     U32 state_num;
5728     bool no_final = 0;      /* prevent failure from backtracking? */
5729     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5730     char *startpoint = locinput;
5731     SV *popmark = NULL;     /* are we looking for a mark? */
5732     SV *sv_commit = NULL;   /* last mark name seen in failure */
5733     SV *sv_yes_mark = NULL; /* last mark name we have seen
5734                                during a successful match */
5735     U32 lastopen = 0;       /* last open we saw */
5736     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5737     SV* const oreplsv = GvSVn(PL_replgv);
5738     /* these three flags are set by various ops to signal information to
5739      * the very next op. They have a useful lifetime of exactly one loop
5740      * iteration, and are not preserved or restored by state pushes/pops
5741      */
5742     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5743     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5744     int logical = 0;        /* the following EVAL is:
5745                                 0: (?{...})
5746                                 1: (?(?{...})X|Y)
5747                                 2: (??{...})
5748                                or the following IFMATCH/UNLESSM is:
5749                                 false: plain (?=foo)
5750                                 true:  used as a condition: (?(?=foo))
5751                             */
5752     PAD* last_pad = NULL;
5753     dMULTICALL;
5754     U8 gimme = G_SCALAR;
5755     CV *caller_cv = NULL;       /* who called us */
5756     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5757     U32 maxopenparen = 0;       /* max '(' index seen so far */
5758     int to_complement;  /* Invert the result? */
5759     _char_class_number classnum;
5760     bool is_utf8_pat = reginfo->is_utf8_pat;
5761     bool match = FALSE;
5762     I32 orig_savestack_ix = PL_savestack_ix;
5763     U8 * script_run_begin = NULL;
5764
5765 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5766 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5767 #  define SOLARIS_BAD_OPTIMIZER
5768     const U32 *pl_charclass_dup = PL_charclass;
5769 #  define PL_charclass pl_charclass_dup
5770 #endif
5771
5772 #ifdef DEBUGGING
5773     GET_RE_DEBUG_FLAGS_DECL;
5774 #endif
5775
5776     /* protect against undef(*^R) */
5777     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5778
5779     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5780     multicall_oldcatch = 0;
5781     PERL_UNUSED_VAR(multicall_cop);
5782
5783     PERL_ARGS_ASSERT_REGMATCH;
5784
5785     st = PL_regmatch_state;
5786
5787     /* Note that nextchr is a byte even in UTF */
5788     SET_nextchr;
5789     scan = prog;
5790
5791     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5792             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5793             Perl_re_printf( aTHX_ "regmatch start\n" );
5794     }));
5795
5796     while (scan != NULL) {
5797         next = scan + NEXT_OFF(scan);
5798         if (next == scan)
5799             next = NULL;
5800         state_num = OP(scan);
5801
5802       reenter_switch:
5803         DEBUG_EXECUTE_r(
5804             if (state_num <= REGNODE_MAX) {
5805                 SV * const prop = sv_newmortal();
5806                 regnode *rnext = regnext(scan);
5807
5808                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5809                 regprop(rex, prop, scan, reginfo, NULL);
5810                 Perl_re_printf( aTHX_
5811                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5812                     INDENT_CHARS(depth), "",
5813                     (IV)(scan - rexi->program),
5814                     SvPVX_const(prop),
5815                     (PL_regkind[OP(scan)] == END || !rnext) ?
5816                         0 : (IV)(rnext - rexi->program));
5817             }
5818         );
5819
5820         to_complement = 0;
5821
5822         SET_nextchr;
5823         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5824
5825         switch (state_num) {
5826         case SBOL: /*  /^../ and /\A../  */
5827             if (locinput == reginfo->strbeg)
5828                 break;
5829             sayNO;
5830
5831         case MBOL: /*  /^../m  */
5832             if (locinput == reginfo->strbeg ||
5833                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5834             {
5835                 break;
5836             }
5837             sayNO;
5838
5839         case GPOS: /*  \G  */
5840             if (locinput == reginfo->ganch)
5841                 break;
5842             sayNO;
5843
5844         case KEEPS: /*   \K  */
5845             /* update the startpoint */
5846             st->u.keeper.val = rex->offs[0].start;
5847             rex->offs[0].start = locinput - reginfo->strbeg;
5848             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5849             NOT_REACHED; /* NOTREACHED */
5850
5851         case KEEPS_next_fail:
5852             /* rollback the start point change */
5853             rex->offs[0].start = st->u.keeper.val;
5854             sayNO_SILENT;
5855             NOT_REACHED; /* NOTREACHED */
5856
5857         case MEOL: /* /..$/m  */
5858             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5859                 sayNO;
5860             break;
5861
5862         case SEOL: /* /..$/  */
5863             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5864                 sayNO;
5865             if (reginfo->strend - locinput > 1)
5866                 sayNO;
5867             break;
5868
5869         case EOS: /*  \z  */
5870             if (!NEXTCHR_IS_EOS)
5871                 sayNO;
5872             break;
5873
5874         case SANY: /*  /./s  */
5875             if (NEXTCHR_IS_EOS)
5876                 sayNO;
5877             goto increment_locinput;
5878
5879         case REG_ANY: /*  /./  */
5880             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5881                 sayNO;
5882             goto increment_locinput;
5883
5884
5885 #undef  ST
5886 #define ST st->u.trie
5887         case TRIEC: /* (ab|cd) with known charclass */
5888             /* In this case the charclass data is available inline so
5889                we can fail fast without a lot of extra overhead.
5890              */
5891             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5892                 DEBUG_EXECUTE_r(
5893                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5894                               depth, PL_colors[4], PL_colors[5])
5895                 );
5896                 sayNO_SILENT;
5897                 NOT_REACHED; /* NOTREACHED */
5898             }
5899             /* FALLTHROUGH */
5900         case TRIE:  /* (ab|cd)  */
5901             /* the basic plan of execution of the trie is:
5902              * At the beginning, run though all the states, and
5903              * find the longest-matching word. Also remember the position
5904              * of the shortest matching word. For example, this pattern:
5905              *    1  2 3 4    5
5906              *    ab|a|x|abcd|abc
5907              * when matched against the string "abcde", will generate
5908              * accept states for all words except 3, with the longest
5909              * matching word being 4, and the shortest being 2 (with
5910              * the position being after char 1 of the string).
5911              *
5912              * Then for each matching word, in word order (i.e. 1,2,4,5),
5913              * we run the remainder of the pattern; on each try setting
5914              * the current position to the character following the word,
5915              * returning to try the next word on failure.
5916              *
5917              * We avoid having to build a list of words at runtime by
5918              * using a compile-time structure, wordinfo[].prev, which
5919              * gives, for each word, the previous accepting word (if any).
5920              * In the case above it would contain the mappings 1->2, 2->0,
5921              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5922              * the longest word (4 above), a list of all words, by
5923              * following the list of prev pointers; this gives us the
5924              * unordered list 4,5,1,2. Then given the current word we have
5925              * just tried, we can go through the list and find the
5926              * next-biggest word to try (so if we just failed on word 2,
5927              * the next in the list is 4).
5928              *
5929              * Since at runtime we don't record the matching position in
5930              * the string for each word, we have to work that out for
5931              * each word we're about to process. The wordinfo table holds
5932              * the character length of each word; given that we recorded
5933              * at the start: the position of the shortest word and its
5934              * length in chars, we just need to move the pointer the
5935              * difference between the two char lengths. Depending on
5936              * Unicode status and folding, that's cheap or expensive.
5937              *
5938              * This algorithm is optimised for the case where are only a
5939              * small number of accept states, i.e. 0,1, or maybe 2.
5940              * With lots of accepts states, and having to try all of them,
5941              * it becomes quadratic on number of accept states to find all
5942              * the next words.
5943              */
5944
5945             {
5946                 /* what type of TRIE am I? (utf8 makes this contextual) */
5947                 DECL_TRIE_TYPE(scan);
5948
5949                 /* what trie are we using right now */
5950                 reg_trie_data * const trie
5951                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5952                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5953                 U32 state = trie->startstate;
5954
5955                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5956                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5957                     if (utf8_target
5958                         && ! NEXTCHR_IS_EOS
5959                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5960                         && scan->flags == EXACTL)
5961                     {
5962                         /* We only output for EXACTL, as we let the folder
5963                          * output this message for EXACTFLU8 to avoid
5964                          * duplication */
5965                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5966                                                                reginfo->strend);
5967                     }
5968                 }
5969                 if (   trie->bitmap
5970                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5971                 {
5972                     if (trie->states[ state ].wordnum) {
5973                          DEBUG_EXECUTE_r(
5974                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5975                                           depth, PL_colors[4], PL_colors[5])
5976                         );
5977                         if (!trie->jump)
5978                             break;
5979                     } else {
5980                         DEBUG_EXECUTE_r(
5981                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5982                                           depth, PL_colors[4], PL_colors[5])
5983                         );
5984                         sayNO_SILENT;
5985                    }
5986                 }
5987
5988             {
5989                 U8 *uc = ( U8* )locinput;
5990
5991                 STRLEN len = 0;
5992                 STRLEN foldlen = 0;
5993                 U8 *uscan = (U8*)NULL;
5994                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5995                 U32 charcount = 0; /* how many input chars we have matched */
5996                 U32 accepted = 0; /* have we seen any accepting states? */
5997
5998                 ST.jump = trie->jump;
5999                 ST.me = scan;
6000                 ST.firstpos = NULL;
6001                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6002                 ST.nextword = 0;
6003
6004                 /* fully traverse the TRIE; note the position of the
6005                    shortest accept state and the wordnum of the longest
6006                    accept state */
6007
6008                 while ( state && uc <= (U8*)(reginfo->strend) ) {
6009                     U32 base = trie->states[ state ].trans.base;
6010                     UV uvc = 0;
6011                     U16 charid = 0;
6012                     U16 wordnum;
6013                     wordnum = trie->states[ state ].wordnum;
6014
6015                     if (wordnum) { /* it's an accept state */
6016                         if (!accepted) {
6017                             accepted = 1;
6018                             /* record first match position */
6019                             if (ST.longfold) {
6020                                 ST.firstpos = (U8*)locinput;
6021                                 ST.firstchars = 0;
6022                             }
6023                             else {
6024                                 ST.firstpos = uc;
6025                                 ST.firstchars = charcount;
6026                             }
6027                         }
6028                         if (!ST.nextword || wordnum < ST.nextword)
6029                             ST.nextword = wordnum;
6030                         ST.topword = wordnum;
6031                     }
6032
6033                     DEBUG_TRIE_EXECUTE_r({
6034                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6035                                 /* HERE */
6036                                 PerlIO_printf( Perl_debug_log,
6037                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6038                                     INDENT_CHARS(depth), "", PL_colors[4],
6039                                     (UV)state, (accepted ? 'Y' : 'N'));
6040                     });
6041
6042                     /* read a char and goto next state */
6043                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6044                         I32 offset;
6045                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6046                                              (U8 *) reginfo->strend, uscan,
6047                                              len, uvc, charid, foldlen,
6048                                              foldbuf, uniflags);
6049                         charcount++;
6050                         if (foldlen>0)
6051                             ST.longfold = TRUE;
6052                         if (charid &&
6053                              ( ((offset =
6054                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6055
6056                              && ((U32)offset < trie->lasttrans)
6057                              && trie->trans[offset].check == state)
6058                         {
6059                             state = trie->trans[offset].next;
6060                         }
6061                         else {
6062                             state = 0;
6063                         }
6064                         uc += len;
6065
6066                     }
6067                     else {
6068                         state = 0;
6069                     }
6070                     DEBUG_TRIE_EXECUTE_r(
6071                         Perl_re_printf( aTHX_
6072                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6073                             charid, uvc, (UV)state, PL_colors[5] );
6074                     );
6075                 }
6076                 if (!accepted)
6077                    sayNO;
6078
6079                 /* calculate total number of accept states */
6080                 {
6081                     U16 w = ST.topword;
6082                     accepted = 0;
6083                     while (w) {
6084                         w = trie->wordinfo[w].prev;
6085                         accepted++;
6086                     }
6087                     ST.accepted = accepted;
6088                 }
6089
6090                 DEBUG_EXECUTE_r(
6091                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6092                         depth,
6093                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6094                 );
6095                 goto trie_first_try; /* jump into the fail handler */
6096             }}
6097             NOT_REACHED; /* NOTREACHED */
6098
6099         case TRIE_next_fail: /* we failed - try next alternative */
6100         {
6101             U8 *uc;
6102             if ( ST.jump ) {
6103                 /* undo any captures done in the tail part of a branch,
6104                  * e.g.
6105                  *    /(?:X(.)(.)|Y(.)).../
6106                  * where the trie just matches X then calls out to do the
6107                  * rest of the branch */
6108                 REGCP_UNWIND(ST.cp);
6109                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6110             }
6111             if (!--ST.accepted) {
6112                 DEBUG_EXECUTE_r({
6113                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6114                         depth,
6115                         PL_colors[4],
6116                         PL_colors[5] );
6117                 });
6118                 sayNO_SILENT;
6119             }
6120             {
6121                 /* Find next-highest word to process.  Note that this code
6122                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6123                 U16 min = 0;
6124                 U16 word;
6125                 U16 const nextword = ST.nextword;
6126                 reg_trie_wordinfo * const wordinfo
6127                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6128                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6129                     if (word > nextword && (!min || word < min))
6130                         min = word;
6131                 }
6132                 ST.nextword = min;
6133             }
6134
6135           trie_first_try:
6136             if (do_cutgroup) {
6137                 do_cutgroup = 0;
6138                 no_final = 0;
6139             }
6140
6141             if ( ST.jump ) {
6142                 ST.lastparen = rex->lastparen;
6143                 ST.lastcloseparen = rex->lastcloseparen;
6144                 REGCP_SET(ST.cp);
6145             }
6146
6147             /* find start char of end of current word */
6148             {
6149                 U32 chars; /* how many chars to skip */
6150                 reg_trie_data * const trie
6151                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6152
6153                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6154                             >=  ST.firstchars);
6155                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6156                             - ST.firstchars;
6157                 uc = ST.firstpos;
6158
6159                 if (ST.longfold) {
6160                     /* the hard option - fold each char in turn and find
6161                      * its folded length (which may be different */
6162                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6163                     STRLEN foldlen;
6164                     STRLEN len;
6165                     UV uvc;
6166                     U8 *uscan;
6167
6168                     while (chars) {
6169                         if (utf8_target) {
6170                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6171                                                     uniflags);
6172                             uc += len;
6173                         }
6174                         else {
6175                             uvc = *uc;
6176                             uc++;
6177                         }
6178                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6179                         uscan = foldbuf;
6180                         while (foldlen) {
6181                             if (!--chars)
6182                                 break;
6183                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6184                                                  uniflags);
6185                             uscan += len;
6186                             foldlen -= len;
6187                         }
6188                     }
6189                 }
6190                 else {
6191                     if (utf8_target)
6192                         while (chars--)
6193                             uc += UTF8SKIP(uc);
6194                     else
6195                         uc += chars;
6196                 }
6197             }
6198
6199             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6200                             ? ST.jump[ST.nextword]
6201                             : NEXT_OFF(ST.me));
6202
6203             DEBUG_EXECUTE_r({
6204                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6205                     depth,
6206                     PL_colors[4],
6207                     ST.nextword,
6208                     PL_colors[5]
6209                     );
6210             });
6211
6212             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6213                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6214                 NOT_REACHED; /* NOTREACHED */
6215             }
6216             /* only one choice left - just continue */
6217             DEBUG_EXECUTE_r({
6218                 AV *const trie_words
6219                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6220                 SV ** const tmp = trie_words
6221                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6222                 SV *sv= tmp ? sv_newmortal() : NULL;
6223
6224                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6225                     depth, PL_colors[4],
6226                     ST.nextword,
6227                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6228                             PL_colors[0], PL_colors[1],
6229                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6230                         )
6231                     : "not compiled under -Dr",
6232                     PL_colors[5] );
6233             });
6234
6235             locinput = (char*)uc;
6236             continue; /* execute rest of RE */
6237             /* NOTREACHED */
6238         }
6239 #undef  ST
6240
6241         case EXACTL:             /*  /abc/l       */
6242             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6243
6244             /* Complete checking would involve going through every character
6245              * matched by the string to see if any is above latin1.  But the
6246              * comparision otherwise might very well be a fast assembly
6247              * language routine, and I (khw) don't think slowing things down
6248              * just to check for this warning is worth it.  So this just checks
6249              * the first character */
6250             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6251                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6252             }
6253             /* FALLTHROUGH */
6254         case EXACT: {            /*  /abc/        */
6255             char *s = STRING(scan);
6256             ln = STR_LEN(scan);
6257             if (utf8_target != is_utf8_pat) {
6258                 /* The target and the pattern have differing utf8ness. */
6259                 char *l = locinput;
6260                 const char * const e = s + ln;
6261
6262                 if (utf8_target) {
6263                     /* The target is utf8, the pattern is not utf8.
6264                      * Above-Latin1 code points can't match the pattern;
6265                      * invariants match exactly, and the other Latin1 ones need
6266                      * to be downgraded to a single byte in order to do the
6267                      * comparison.  (If we could be confident that the target
6268                      * is not malformed, this could be refactored to have fewer
6269                      * tests by just assuming that if the first bytes match, it
6270                      * is an invariant, but there are tests in the test suite
6271                      * dealing with (??{...}) which violate this) */
6272                     while (s < e) {
6273                         if (l >= reginfo->strend
6274                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6275                         {
6276                             sayNO;
6277                         }
6278                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6279                             if (*l != *s) {
6280                                 sayNO;
6281                             }
6282                             l++;
6283                         }
6284                         else {
6285                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6286                             {
6287                                 sayNO;
6288                             }
6289                             l += 2;
6290                         }
6291                         s++;
6292                     }
6293                 }
6294                 else {
6295                     /* The target is not utf8, the pattern is utf8. */
6296                     while (s < e) {
6297                         if (l >= reginfo->strend
6298                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6299                         {
6300                             sayNO;
6301                         }
6302                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6303                             if (*s != *l) {
6304                                 sayNO;
6305                             }
6306                             s++;
6307                         }
6308                         else {
6309                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6310                             {
6311                                 sayNO;
6312                             }
6313                             s += 2;
6314                         }
6315                         l++;
6316                     }
6317                 }
6318                 locinput = l;
6319             }
6320             else {
6321                 /* The target and the pattern have the same utf8ness. */
6322                 /* Inline the first character, for speed. */
6323                 if (reginfo->strend - locinput < ln
6324                     || UCHARAT(s) != nextchr
6325                     || (ln > 1 && memNE(s, locinput, ln)))
6326                 {
6327                     sayNO;
6328                 }
6329                 locinput += ln;
6330             }
6331             break;
6332             }
6333
6334         case EXACTFL: {          /*  /abc/il      */
6335             re_fold_t folder;
6336             const U8 * fold_array;
6337             const char * s;
6338             U32 fold_utf8_flags;
6339
6340             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6341             folder = foldEQ_locale;
6342             fold_array = PL_fold_locale;
6343             fold_utf8_flags = FOLDEQ_LOCALE;
6344             goto do_exactf;
6345
6346         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6347                                       is effectively /u; hence to match, target
6348                                       must be UTF-8. */
6349             if (! utf8_target) {
6350                 sayNO;
6351             }
6352             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6353                                              | FOLDEQ_S1_FOLDS_SANE;
6354             folder = foldEQ_latin1;
6355             fold_array = PL_fold_latin1;
6356             goto do_exactf;
6357
6358         case EXACTFU_SS:         /*  /\x{df}/iu   */
6359         case EXACTFU:            /*  /abc/iu      */
6360             folder = foldEQ_latin1;
6361             fold_array = PL_fold_latin1;
6362             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6363             goto do_exactf;
6364
6365         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6366                                    patterns */
6367             assert(! is_utf8_pat);
6368             /* FALLTHROUGH */
6369         case EXACTFAA:            /*  /abc/iaa     */
6370             folder = foldEQ_latin1;
6371             fold_array = PL_fold_latin1;
6372             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6373             goto do_exactf;
6374
6375         case EXACTF:             /*  /abc/i    This node only generated for
6376                                                non-utf8 patterns */
6377             assert(! is_utf8_pat);
6378             folder = foldEQ;
6379             fold_array = PL_fold;
6380             fold_utf8_flags = 0;
6381
6382           do_exactf:
6383             s = STRING(scan);
6384             ln = STR_LEN(scan);
6385
6386             if (utf8_target
6387                 || is_utf8_pat
6388                 || state_num == EXACTFU_SS
6389                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6390             {
6391               /* Either target or the pattern are utf8, or has the issue where
6392                * the fold lengths may differ. */
6393                 const char * const l = locinput;
6394                 char *e = reginfo->strend;
6395
6396                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6397                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6398                 {
6399                     sayNO;
6400                 }
6401                 locinput = e;
6402                 break;
6403             }
6404
6405             /* Neither the target nor the pattern are utf8 */
6406             if (UCHARAT(s) != nextchr
6407                 && !NEXTCHR_IS_EOS
6408                 && UCHARAT(s) != fold_array[nextchr])
6409             {
6410                 sayNO;
6411             }
6412             if (reginfo->strend - locinput < ln)
6413                 sayNO;
6414             if (ln > 1 && ! folder(s, locinput, ln))
6415                 sayNO;
6416             locinput += ln;
6417             break;
6418         }
6419
6420         case NBOUNDL: /*  /\B/l  */
6421             to_complement = 1;
6422             /* FALLTHROUGH */
6423
6424         case BOUNDL:  /*  /\b/l  */
6425         {
6426             bool b1, b2;
6427             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6428
6429             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6430                 if (! IN_UTF8_CTYPE_LOCALE) {
6431                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6432                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6433                 }
6434                 goto boundu;
6435             }
6436
6437             if (utf8_target) {
6438                 if (locinput == reginfo->strbeg)
6439                     b1 = isWORDCHAR_LC('\n');
6440                 else {
6441                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6442                                                         (U8*)(reginfo->strbeg)),
6443                                                  (U8*)(reginfo->strend));
6444                 }
6445                 b2 = (NEXTCHR_IS_EOS)
6446                     ? isWORDCHAR_LC('\n')
6447                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6448                                               (U8*) reginfo->strend);
6449             }
6450             else { /* Here the string isn't utf8 */
6451                 b1 = (locinput == reginfo->strbeg)
6452                      ? isWORDCHAR_LC('\n')
6453                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6454                 b2 = (NEXTCHR_IS_EOS)
6455                     ? isWORDCHAR_LC('\n')
6456                     : isWORDCHAR_LC(nextchr);
6457             }
6458             if (to_complement ^ (b1 == b2)) {
6459                 sayNO;
6460             }
6461             break;
6462         }
6463
6464         case NBOUND:  /*  /\B/   */
6465             to_complement = 1;
6466             /* FALLTHROUGH */
6467
6468         case BOUND:   /*  /\b/   */
6469             if (utf8_target) {
6470                 goto bound_utf8;
6471             }
6472             goto bound_ascii_match_only;
6473
6474         case NBOUNDA: /*  /\B/a  */
6475             to_complement = 1;
6476             /* FALLTHROUGH */
6477
6478         case BOUNDA:  /*  /\b/a  */
6479         {
6480             bool b1, b2;
6481
6482           bound_ascii_match_only:
6483             /* Here the string isn't utf8, or is utf8 and only ascii characters
6484              * are to match \w.  In the latter case looking at the byte just
6485              * prior to the current one may be just the final byte of a
6486              * multi-byte character.  This is ok.  There are two cases:
6487              * 1) it is a single byte character, and then the test is doing
6488              *    just what it's supposed to.
6489              * 2) it is a multi-byte character, in which case the final byte is
6490              *    never mistakable for ASCII, and so the test will say it is
6491              *    not a word character, which is the correct answer. */
6492             b1 = (locinput == reginfo->strbeg)
6493                  ? isWORDCHAR_A('\n')
6494                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6495             b2 = (NEXTCHR_IS_EOS)
6496                 ? isWORDCHAR_A('\n')
6497                 : isWORDCHAR_A(nextchr);
6498             if (to_complement ^ (b1 == b2)) {
6499                 sayNO;
6500             }
6501             break;
6502         }
6503
6504         case NBOUNDU: /*  /\B/u  */
6505             to_complement = 1;
6506             /* FALLTHROUGH */
6507
6508         case BOUNDU:  /*  /\b/u  */
6509
6510           boundu:
6511             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6512                 match = FALSE;
6513             }
6514             else if (utf8_target) {
6515               bound_utf8:
6516                 switch((bound_type) FLAGS(scan)) {
6517                     case TRADITIONAL_BOUND:
6518                     {
6519                         bool b1, b2;
6520                         b1 = (locinput == reginfo->strbeg)
6521                              ? 0 /* isWORDCHAR_L1('\n') */
6522                              : isWORDCHAR_utf8_safe(
6523                                                reghop3((U8*)locinput,
6524                                                        -1,
6525                                                        (U8*)(reginfo->strbeg)),
6526                                                     (U8*) reginfo->strend);
6527                         b2 = (NEXTCHR_IS_EOS)
6528                             ? 0 /* isWORDCHAR_L1('\n') */
6529                             : isWORDCHAR_utf8_safe((U8*)locinput,
6530                                                    (U8*) reginfo->strend);
6531                         match = cBOOL(b1 != b2);
6532                         break;
6533                     }
6534                     case GCB_BOUND:
6535                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6536                             match = TRUE; /* GCB always matches at begin and
6537                                              end */
6538                         }
6539                         else {
6540                             /* Find the gcb values of previous and current
6541                              * chars, then see if is a break point */
6542                             match = isGCB(getGCB_VAL_UTF8(
6543                                                 reghop3((U8*)locinput,
6544                                                         -1,
6545                                                         (U8*)(reginfo->strbeg)),
6546                                                 (U8*) reginfo->strend),
6547                                           getGCB_VAL_UTF8((U8*) locinput,
6548                                                         (U8*) reginfo->strend),
6549                                           (U8*) reginfo->strbeg,
6550                                           (U8*) locinput,
6551                                           utf8_target);
6552                         }
6553                         break;
6554
6555                     case LB_BOUND:
6556                         if (locinput == reginfo->strbeg) {
6557                             match = FALSE;
6558                         }
6559                         else if (NEXTCHR_IS_EOS) {
6560                             match = TRUE;
6561                         }
6562                         else {
6563                             match = isLB(getLB_VAL_UTF8(
6564                                                 reghop3((U8*)locinput,
6565                                                         -1,
6566                                                         (U8*)(reginfo->strbeg)),
6567                                                 (U8*) reginfo->strend),
6568                                           getLB_VAL_UTF8((U8*) locinput,
6569                                                         (U8*) reginfo->strend),
6570                                           (U8*) reginfo->strbeg,
6571                                           (U8*) locinput,
6572                                           (U8*) reginfo->strend,
6573                                           utf8_target);
6574                         }
6575                         break;
6576
6577                     case SB_BOUND: /* Always matches at begin and end */
6578                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6579                             match = TRUE;
6580                         }
6581                         else {
6582                             match = isSB(getSB_VAL_UTF8(
6583                                                 reghop3((U8*)locinput,
6584                                                         -1,
6585                                                         (U8*)(reginfo->strbeg)),
6586                                                 (U8*) reginfo->strend),
6587                                           getSB_VAL_UTF8((U8*) locinput,
6588                                                         (U8*) reginfo->strend),
6589                                           (U8*) reginfo->strbeg,
6590                                           (U8*) locinput,
6591                                           (U8*) reginfo->strend,
6592                                           utf8_target);
6593                         }
6594                         break;
6595
6596                     case WB_BOUND:
6597                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6598                             match = TRUE;
6599                         }
6600                         else {
6601                             match = isWB(WB_UNKNOWN,
6602                                          getWB_VAL_UTF8(
6603                                                 reghop3((U8*)locinput,
6604                                                         -1,
6605                                                         (U8*)(reginfo->strbeg)),
6606                                                 (U8*) reginfo->strend),
6607                                           getWB_VAL_UTF8((U8*) locinput,
6608                                                         (U8*) reginfo->strend),
6609                                           (U8*) reginfo->strbeg,
6610                                           (U8*) locinput,
6611                                           (U8*) reginfo->strend,
6612                                           utf8_target);
6613                         }
6614                         break;
6615                 }
6616             }
6617             else {  /* Not utf8 target */
6618                 switch((bound_type) FLAGS(scan)) {
6619                     case TRADITIONAL_BOUND:
6620                     {
6621                         bool b1, b2;
6622                         b1 = (locinput == reginfo->strbeg)
6623                             ? 0 /* isWORDCHAR_L1('\n') */
6624                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6625                         b2 = (NEXTCHR_IS_EOS)
6626                             ? 0 /* isWORDCHAR_L1('\n') */
6627                             : isWORDCHAR_L1(nextchr);
6628                         match = cBOOL(b1 != b2);
6629                         break;
6630                     }
6631
6632                     case GCB_BOUND:
6633                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6634                             match = TRUE; /* GCB always matches at begin and
6635                                              end */
6636                         }
6637                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6638                                    range */
6639                             match =    UCHARAT(locinput - 1) != '\r'
6640                                     || UCHARAT(locinput) != '\n';
6641                         }
6642                         break;
6643
6644                     case LB_BOUND:
6645                         if (locinput == reginfo->strbeg) {
6646                             match = FALSE;
6647                         }
6648                         else if (NEXTCHR_IS_EOS) {
6649                             match = TRUE;
6650                         }
6651                         else {
6652                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6653                                          getLB_VAL_CP(UCHARAT(locinput)),
6654                                          (U8*) reginfo->strbeg,
6655                                          (U8*) locinput,
6656                                          (U8*) reginfo->strend,
6657                                          utf8_target);
6658                         }
6659                         break;
6660
6661                     case SB_BOUND: /* Always matches at begin and end */
6662                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6663                             match = TRUE;
6664                         }
6665                         else {
6666                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6667                                          getSB_VAL_CP(UCHARAT(locinput)),
6668                                          (U8*) reginfo->strbeg,
6669                                          (U8*) locinput,
6670                                          (U8*) reginfo->strend,
6671                                          utf8_target);
6672                         }
6673                         break;
6674
6675                     case WB_BOUND:
6676                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6677                             match = TRUE;
6678                         }
6679                         else {
6680                             match = isWB(WB_UNKNOWN,
6681                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6682                                          getWB_VAL_CP(UCHARAT(locinput)),
6683                                          (U8*) reginfo->strbeg,
6684                                          (U8*) locinput,
6685                                          (U8*) reginfo->strend,
6686                                          utf8_target);
6687                         }
6688                         break;
6689                 }
6690             }
6691
6692             if (to_complement ^ ! match) {
6693                 sayNO;
6694             }
6695             break;
6696
6697         case ANYOFPOSIXL:
6698         case ANYOFL:  /*  /[abc]/l      */
6699             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6700
6701             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6702             {
6703               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6704             }
6705             /* FALLTHROUGH */
6706         case ANYOFD:  /*   /[abc]/d       */
6707         case ANYOF:  /*   /[abc]/       */
6708             if (NEXTCHR_IS_EOS)
6709                 sayNO;
6710             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6711                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6712                                                                    utf8_target))
6713                     sayNO;
6714                 locinput += UTF8SKIP(locinput);
6715             }
6716             else {
6717                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6718                     sayNO;
6719                 locinput++;
6720             }
6721             break;
6722
6723         case ANYOFM:
6724             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6725                 sayNO;
6726             }
6727             locinput++; /* ANYOFM is always single byte */
6728             break;
6729
6730         case NANYOFM:
6731             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)) {
6732                 sayNO;
6733             }
6734             goto increment_locinput;
6735             break;
6736
6737         case ASCII:
6738             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6739                 sayNO;
6740             }
6741
6742             locinput++;     /* ASCII is always single byte */
6743             break;
6744
6745         case NASCII:
6746             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6747                 sayNO;
6748             }
6749
6750             goto increment_locinput;
6751             break;
6752
6753         /* The argument (FLAGS) to all the POSIX node types is the class number
6754          * */
6755
6756         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6757             to_complement = 1;
6758             /* FALLTHROUGH */
6759
6760         case POSIXL:    /* \w or [:punct:] etc. under /l */
6761             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6762             if (NEXTCHR_IS_EOS)
6763                 sayNO;
6764
6765             /* Use isFOO_lc() for characters within Latin1.  (Note that
6766              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6767              * wouldn't be invariant) */
6768             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6769                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6770                     sayNO;
6771                 }
6772
6773                 locinput++;
6774                 break;
6775             }
6776
6777             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6778                 /* An above Latin-1 code point, or malformed */
6779                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6780                                                        reginfo->strend);
6781                 goto utf8_posix_above_latin1;
6782             }
6783
6784             /* Here is a UTF-8 variant code point below 256 and the target is
6785              * UTF-8 */
6786             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6787                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6788                                             *(locinput + 1))))))
6789             {
6790                 sayNO;
6791             }
6792
6793             goto increment_locinput;
6794
6795         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6796             to_complement = 1;
6797             /* FALLTHROUGH */
6798
6799         case POSIXD:    /* \w or [:punct:] etc. under /d */
6800             if (utf8_target) {
6801                 goto utf8_posix;
6802             }
6803             goto posixa;
6804
6805         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6806
6807             if (NEXTCHR_IS_EOS) {
6808                 sayNO;
6809             }
6810
6811             /* All UTF-8 variants match */
6812             if (! UTF8_IS_INVARIANT(nextchr)) {
6813                 goto increment_locinput;
6814             }
6815
6816             to_complement = 1;
6817             goto join_nposixa;
6818
6819         case POSIXA:    /* \w or [:punct:] etc. under /a */
6820
6821           posixa:
6822             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6823              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6824              * character is a single byte */
6825
6826             if (NEXTCHR_IS_EOS) {
6827                 sayNO;
6828             }
6829
6830           join_nposixa:
6831
6832             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6833                                                                 FLAGS(scan)))))
6834             {
6835                 sayNO;
6836             }
6837
6838             /* Here we are either not in utf8, or we matched a utf8-invariant,
6839              * so the next char is the next byte */
6840             locinput++;
6841             break;
6842
6843         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6844             to_complement = 1;
6845             /* FALLTHROUGH */
6846
6847         case POSIXU:    /* \w or [:punct:] etc. under /u */
6848           utf8_posix:
6849             if (NEXTCHR_IS_EOS) {
6850                 sayNO;
6851             }
6852
6853             /* Use _generic_isCC() for characters within Latin1.  (Note that
6854              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6855              * wouldn't be invariant) */
6856             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6857                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6858                                                            FLAGS(scan)))))
6859                 {
6860                     sayNO;
6861                 }
6862                 locinput++;
6863             }
6864             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6865                 if (! (to_complement
6866                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6867                                                                *(locinput + 1)),
6868                                              FLAGS(scan)))))
6869                 {
6870                     sayNO;
6871                 }
6872                 locinput += 2;
6873             }
6874             else {  /* Handle above Latin-1 code points */
6875               utf8_posix_above_latin1:
6876                 classnum = (_char_class_number) FLAGS(scan);
6877                 switch (classnum) {
6878                     default:
6879                         if (! (to_complement
6880                            ^ cBOOL(_invlist_contains_cp(
6881                                       PL_XPosix_ptrs[classnum],
6882                                       utf8_to_uvchr_buf((U8 *) locinput,
6883                                                         (U8 *) reginfo->strend,
6884                                                         NULL)))))
6885                         {
6886                             sayNO;
6887                         }
6888                         break;
6889                     case _CC_ENUM_SPACE:
6890                         if (! (to_complement
6891                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6892                         {
6893                             sayNO;
6894                         }
6895                         break;
6896                     case _CC_ENUM_BLANK:
6897                         if (! (to_complement
6898                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6899                         {
6900                             sayNO;
6901                         }
6902                         break;
6903                     case _CC_ENUM_XDIGIT:
6904                         if (! (to_complement
6905                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6906                         {
6907                             sayNO;
6908                         }
6909                         break;
6910                     case _CC_ENUM_VERTSPACE:
6911                         if (! (to_complement
6912                                         ^ cBOOL(is_VERTWS_high(locinput))))
6913                         {
6914                             sayNO;
6915                         }
6916                         break;
6917                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6918                     case _CC_ENUM_ASCII:
6919                         if (! to_complement) {
6920                             sayNO;
6921                         }
6922                         break;
6923                 }
6924                 locinput += UTF8SKIP(locinput);
6925             }
6926             break;
6927
6928         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6929                        a Unicode extended Grapheme Cluster */
6930             if (NEXTCHR_IS_EOS)
6931                 sayNO;
6932             if  (! utf8_target) {
6933
6934                 /* Match either CR LF  or '.', as all the other possibilities
6935                  * require utf8 */
6936                 locinput++;         /* Match the . or CR */
6937                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6938                                        match the LF */
6939                     && locinput < reginfo->strend
6940                     && UCHARAT(locinput) == '\n')
6941                 {
6942                     locinput++;
6943                 }
6944             }
6945             else {
6946
6947                 /* Get the gcb type for the current character */
6948                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6949                                                        (U8*) reginfo->strend);
6950
6951                 /* Then scan through the input until we get to the first
6952                  * character whose type is supposed to be a gcb with the
6953                  * current character.  (There is always a break at the
6954                  * end-of-input) */
6955                 locinput += UTF8SKIP(locinput);
6956                 while (locinput < reginfo->strend) {
6957                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6958                                                          (U8*) reginfo->strend);
6959                     if (isGCB(prev_gcb, cur_gcb,
6960                               (U8*) reginfo->strbeg, (U8*) locinput,
6961                               utf8_target))
6962                     {
6963                         break;
6964                     }
6965
6966                     prev_gcb = cur_gcb;
6967                     locinput += UTF8SKIP(locinput);
6968                 }
6969
6970
6971             }
6972             break;
6973
6974         case NREFFL:  /*  /\g{name}/il  */
6975         {   /* The capture buffer cases.  The ones beginning with N for the
6976                named buffers just convert to the equivalent numbered and
6977                pretend they were called as the corresponding numbered buffer
6978                op.  */
6979             /* don't initialize these in the declaration, it makes C++
6980                unhappy */
6981             const char *s;
6982             char type;
6983             re_fold_t folder;
6984             const U8 *fold_array;
6985             UV utf8_fold_flags;
6986
6987             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6988             folder = foldEQ_locale;
6989             fold_array = PL_fold_locale;
6990             type = REFFL;
6991             utf8_fold_flags = FOLDEQ_LOCALE;
6992             goto do_nref;
6993
6994         case NREFFA:  /*  /\g{name}/iaa  */
6995             folder = foldEQ_latin1;
6996             fold_array = PL_fold_latin1;
6997             type = REFFA;
6998             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6999             goto do_nref;
7000
7001         case NREFFU:  /*  /\g{name}/iu  */
7002             folder = foldEQ_latin1;
7003             fold_array = PL_fold_latin1;
7004             type = REFFU;
7005             utf8_fold_flags = 0;
7006             goto do_nref;
7007
7008         case NREFF:  /*  /\g{name}/i  */
7009             folder = foldEQ;
7010             fold_array = PL_fold;
7011             type = REFF;
7012             utf8_fold_flags = 0;
7013             goto do_nref;
7014
7015         case NREF:  /*  /\g{name}/   */
7016             type = REF;
7017             folder = NULL;
7018             fold_array = NULL;
7019             utf8_fold_flags = 0;
7020           do_nref:
7021
7022             /* For the named back references, find the corresponding buffer
7023              * number */
7024             n = reg_check_named_buff_matched(rex,scan);
7025
7026             if ( ! n ) {
7027                 sayNO;
7028             }
7029             goto do_nref_ref_common;
7030
7031         case REFFL:  /*  /\1/il  */
7032             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7033             folder = foldEQ_locale;
7034             fold_array = PL_fold_locale;
7035             utf8_fold_flags = FOLDEQ_LOCALE;
7036             goto do_ref;
7037
7038         case REFFA:  /*  /\1/iaa  */
7039             folder = foldEQ_latin1;
7040             fold_array = PL_fold_latin1;
7041             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7042             goto do_ref;
7043
7044         case REFFU:  /*  /\1/iu  */
7045             folder = foldEQ_latin1;
7046             fold_array = PL_fold_latin1;
7047             utf8_fold_flags = 0;
7048             goto do_ref;
7049
7050         case REFF:  /*  /\1/i  */
7051             folder = foldEQ;
7052             fold_array = PL_fold;
7053             utf8_fold_flags = 0;
7054             goto do_ref;
7055
7056         case REF:  /*  /\1/    */
7057             folder = NULL;
7058             fold_array = NULL;
7059             utf8_fold_flags = 0;
7060
7061           do_ref:
7062             type = OP(scan);
7063             n = ARG(scan);  /* which paren pair */
7064
7065           do_nref_ref_common:
7066             ln = rex->offs[n].start;
7067             endref = rex->offs[n].end;
7068             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7069             if (rex->lastparen < n || ln == -1 || endref == -1)
7070                 sayNO;                  /* Do not match unless seen CLOSEn. */
7071             if (ln == endref)
7072                 break;
7073
7074             s = reginfo->strbeg + ln;
7075             if (type != REF     /* REF can do byte comparison */
7076                 && (utf8_target || type == REFFU || type == REFFL))
7077             {
7078                 char * limit = reginfo->strend;
7079
7080                 /* This call case insensitively compares the entire buffer
7081                     * at s, with the current input starting at locinput, but
7082                     * not going off the end given by reginfo->strend, and
7083                     * returns in <limit> upon success, how much of the
7084                     * current input was matched */
7085                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7086                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7087                 {
7088                     sayNO;
7089                 }
7090                 locinput = limit;
7091                 break;
7092             }
7093
7094             /* Not utf8:  Inline the first character, for speed. */
7095             if (!NEXTCHR_IS_EOS &&
7096                 UCHARAT(s) != nextchr &&
7097                 (type == REF ||
7098                  UCHARAT(s) != fold_array[nextchr]))
7099                 sayNO;
7100             ln = endref - ln;
7101             if (locinput + ln > reginfo->strend)
7102                 sayNO;
7103             if (ln > 1 && (type == REF
7104                            ? memNE(s, locinput, ln)
7105                            : ! folder(s, locinput, ln)))
7106                 sayNO;
7107             locinput += ln;
7108             break;
7109         }
7110
7111         case NOTHING: /* null op; e.g. the 'nothing' following
7112                        * the '*' in m{(a+|b)*}' */
7113             break;
7114         case TAIL: /* placeholder while compiling (A|B|C) */
7115             break;
7116
7117 #undef  ST
7118 #define ST st->u.eval
7119 #define CUR_EVAL cur_eval->u.eval
7120
7121         {
7122             SV *ret;
7123             REGEXP *re_sv;
7124             regexp *re;
7125             regexp_internal *rei;
7126             regnode *startpoint;
7127             U32 arg;
7128
7129         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7130             arg= (U32)ARG(scan);
7131             if (cur_eval && cur_eval->locinput == locinput) {
7132                 if ( ++nochange_depth > max_nochange_depth )
7133                     Perl_croak(aTHX_
7134                         "Pattern subroutine nesting without pos change"
7135                         " exceeded limit in regex");
7136             } else {
7137                 nochange_depth = 0;
7138             }
7139             re_sv = rex_sv;
7140             re = rex;
7141             rei = rexi;
7142             startpoint = scan + ARG2L(scan);
7143             EVAL_CLOSE_PAREN_SET( st, arg );
7144             /* Detect infinite recursion
7145              *
7146              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7147              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7148              * So we track the position in the string we are at each time
7149              * we recurse and if we try to enter the same routine twice from
7150              * the same position we throw an error.
7151              */
7152             if ( rex->recurse_locinput[arg] == locinput ) {
7153                 /* FIXME: we should show the regop that is failing as part
7154                  * of the error message. */
7155                 Perl_croak(aTHX_ "Infinite recursion in regex");
7156             } else {
7157                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7158                 rex->recurse_locinput[arg]= locinput;
7159
7160                 DEBUG_r({
7161                     GET_RE_DEBUG_FLAGS_DECL;
7162                     DEBUG_STACK_r({
7163                         Perl_re_exec_indentf( aTHX_
7164                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7165                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7166                         );
7167                     });
7168                 });
7169             }
7170
7171             /* Save all the positions seen so far. */
7172             ST.cp = regcppush(rex, 0, maxopenparen);
7173             REGCP_SET(ST.lastcp);
7174
7175             /* and then jump to the code we share with EVAL */
7176             goto eval_recurse_doit;
7177             /* NOTREACHED */
7178
7179         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7180             if (cur_eval && cur_eval->locinput==locinput) {
7181                 if ( ++nochange_depth > max_nochange_depth )
7182                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7183             } else {
7184                 nochange_depth = 0;
7185             }
7186             {
7187                 /* execute the code in the {...} */
7188
7189                 dSP;
7190                 IV before;
7191                 OP * const oop = PL_op;
7192                 COP * const ocurcop = PL_curcop;
7193                 OP *nop;
7194                 CV *newcv;
7195
7196                 /* save *all* paren positions */
7197                 regcppush(rex, 0, maxopenparen);
7198                 REGCP_SET(ST.lastcp);
7199
7200                 if (!caller_cv)
7201                     caller_cv = find_runcv(NULL);
7202
7203                 n = ARG(scan);
7204
7205                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7206                     newcv = (ReANY(
7207                                     (REGEXP*)(rexi->data->data[n])
7208                             ))->qr_anoncv;
7209                     nop = (OP*)rexi->data->data[n+1];
7210                 }
7211                 else if (rexi->data->what[n] == 'l') { /* literal code */
7212                     newcv = caller_cv;
7213                     nop = (OP*)rexi->data->data[n];
7214                     assert(CvDEPTH(newcv));
7215                 }
7216                 else {
7217                     /* literal with own CV */
7218                     assert(rexi->data->what[n] == 'L');
7219                     newcv = rex->qr_anoncv;
7220                     nop = (OP*)rexi->data->data[n];
7221                 }
7222
7223                 /* Some notes about MULTICALL and the context and save stacks.
7224                  *
7225                  * In something like
7226                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7227                  * since codeblocks don't introduce a new scope (so that
7228                  * local() etc accumulate), at the end of a successful
7229                  * match there will be a SAVEt_CLEARSV on the savestack
7230                  * for each of $x, $y, $z. If the three code blocks above
7231                  * happen to have come from different CVs (e.g. via
7232                  * embedded qr//s), then we must ensure that during any
7233                  * savestack unwinding, PL_comppad always points to the
7234                  * right pad at each moment. We achieve this by
7235                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7236                  * there is a change of pad.
7237                  * In theory whenever we call a code block, we should
7238                  * push a CXt_SUB context, then pop it on return from
7239                  * that code block. This causes a bit of an issue in that
7240                  * normally popping a context also clears the savestack
7241                  * back to cx->blk_oldsaveix, but here we specifically
7242                  * don't want to clear the save stack on exit from the
7243                  * code block.
7244                  * Also for efficiency we don't want to keep pushing and
7245                  * popping the single SUB context as we backtrack etc.
7246                  * So instead, we push a single context the first time
7247                  * we need, it, then hang onto it until the end of this
7248                  * function. Whenever we encounter a new code block, we
7249                  * update the CV etc if that's changed. During the times
7250                  * in this function where we're not executing a code
7251                  * block, having the SUB context still there is a bit
7252                  * naughty - but we hope that no-one notices.
7253                  * When the SUB context is initially pushed, we fake up
7254                  * cx->blk_oldsaveix to be as if we'd pushed this context
7255                  * on first entry to S_regmatch rather than at some random
7256                  * point during the regexe execution. That way if we
7257                  * croak, popping the context stack will ensure that
7258                  * *everything* SAVEd by this function is undone and then
7259                  * the context popped, rather than e.g., popping the
7260                  * context (and restoring the original PL_comppad) then
7261                  * popping more of the savestack and restoring a bad
7262                  * PL_comppad.
7263                  */
7264
7265                 /* If this is the first EVAL, push a MULTICALL. On
7266                  * subsequent calls, if we're executing a different CV, or
7267                  * if PL_comppad has got messed up from backtracking
7268                  * through SAVECOMPPADs, then refresh the context.
7269                  */
7270                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7271                 {
7272                     U8 flags = (CXp_SUB_RE |
7273                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7274                     SAVECOMPPAD();
7275                     if (last_pushed_cv) {
7276                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7277                     }
7278                     else {
7279                         PUSH_MULTICALL_FLAGS(newcv, flags);
7280                     }
7281                     /* see notes above */
7282                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7283
7284                     last_pushed_cv = newcv;
7285                 }
7286                 else {
7287                     /* these assignments are just to silence compiler
7288                      * warnings */
7289                     multicall_cop = NULL;
7290                 }
7291                 last_pad = PL_comppad;
7292
7293                 /* the initial nextstate you would normally execute
7294                  * at the start of an eval (which would cause error
7295                  * messages to come from the eval), may be optimised
7296                  * away from the execution path in the regex code blocks;
7297                  * so manually set PL_curcop to it initially */
7298                 {
7299                     OP *o = cUNOPx(nop)->op_first;
7300                     assert(o->op_type == OP_NULL);
7301                     if (o->op_targ == OP_SCOPE) {
7302                         o = cUNOPo->op_first;
7303                     }
7304                     else {
7305                         assert(o->op_targ == OP_LEAVE);
7306                         o = cUNOPo->op_first;
7307                         assert(o->op_type == OP_ENTER);
7308                         o = OpSIBLING(o);
7309                     }
7310
7311                     if (o->op_type != OP_STUB) {
7312                         assert(    o->op_type == OP_NEXTSTATE
7313                                 || o->op_type == OP_DBSTATE
7314                                 || (o->op_type == OP_NULL
7315                                     &&  (  o->op_targ == OP_NEXTSTATE
7316                                         || o->op_targ == OP_DBSTATE
7317                                         )
7318                                     )
7319                         );
7320                         PL_curcop = (COP*)o;
7321                     }
7322                 }
7323                 nop = nop->op_next;
7324
7325                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7326                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7327
7328                 rex->offs[0].end = locinput - reginfo->strbeg;
7329                 if (reginfo->info_aux_eval->pos_magic)
7330                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7331                                   reginfo->sv, reginfo->strbeg,
7332                                   locinput - reginfo->strbeg);
7333
7334                 if (sv_yes_mark) {
7335                     SV *sv_mrk = get_sv("REGMARK", 1);
7336                     sv_setsv(sv_mrk, sv_yes_mark);
7337                 }
7338
7339                 /* we don't use MULTICALL here as we want to call the
7340                  * first op of the block of interest, rather than the
7341                  * first op of the sub. Also, we don't want to free
7342                  * the savestack frame */
7343                 before = (IV)(SP-PL_stack_base);
7344                 PL_op = nop;
7345                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7346                 SPAGAIN;
7347                 if ((IV)(SP-PL_stack_base) == before)
7348                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7349                 else {
7350                     ret = POPs;
7351                     PUTBACK;
7352                 }
7353
7354                 /* before restoring everything, evaluate the returned
7355                  * value, so that 'uninit' warnings don't use the wrong
7356                  * PL_op or pad. Also need to process any magic vars
7357                  * (e.g. $1) *before* parentheses are restored */
7358
7359                 PL_op = NULL;
7360
7361                 re_sv = NULL;
7362                 if (logical == 0)        /*   (?{})/   */
7363                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7364                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7365                     sw = cBOOL(SvTRUE_NN(ret));
7366                     logical = 0;
7367                 }
7368                 else {                   /*  /(??{})  */
7369                     /*  if its overloaded, let the regex compiler handle
7370                      *  it; otherwise extract regex, or stringify  */
7371                     if (SvGMAGICAL(ret))
7372                         ret = sv_mortalcopy(ret);
7373                     if (!SvAMAGIC(ret)) {
7374                         SV *sv = ret;
7375                         if (SvROK(sv))
7376                             sv = SvRV(sv);
7377                         if (SvTYPE(sv) == SVt_REGEXP)
7378                             re_sv = (REGEXP*) sv;
7379                         else if (SvSMAGICAL(ret)) {
7380                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7381                             if (mg)
7382                                 re_sv = (REGEXP *) mg->mg_obj;
7383                         }
7384
7385                         /* force any undef warnings here */
7386                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7387                             ret = sv_mortalcopy(ret);
7388                             (void) SvPV_force_nolen(ret);
7389                         }
7390                     }
7391
7392                 }
7393
7394                 /* *** Note that at this point we don't restore
7395                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7396                  * be used again soon. This is safe as long as nothing
7397                  * in the regexp code uses the pad ! */
7398                 PL_op = oop;
7399                 PL_curcop = ocurcop;
7400                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7401                 PL_curpm_under = PL_curpm;
7402                 PL_curpm = PL_reg_curpm;
7403
7404                 if (logical != 2) {
7405                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7406                     /* NOTREACHED */
7407                 }
7408             }
7409
7410                 /* only /(??{})/  from now on */
7411                 logical = 0;
7412                 {
7413                     /* extract RE object from returned value; compiling if
7414                      * necessary */
7415
7416                     if (re_sv) {
7417                         re_sv = reg_temp_copy(NULL, re_sv);
7418                     }
7419                     else {
7420                         U32 pm_flags = 0;
7421
7422                         if (SvUTF8(ret) && IN_BYTES) {
7423                             /* In use 'bytes': make a copy of the octet
7424                              * sequence, but without the flag on */
7425                             STRLEN len;
7426                             const char *const p = SvPV(ret, len);
7427                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7428                         }
7429                         if (rex->intflags & PREGf_USE_RE_EVAL)
7430                             pm_flags |= PMf_USE_RE_EVAL;
7431
7432                         /* if we got here, it should be an engine which
7433                          * supports compiling code blocks and stuff */
7434                         assert(rex->engine && rex->engine->op_comp);
7435                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7436                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7437                                     rex->engine, NULL, NULL,
7438                                     /* copy /msixn etc to inner pattern */
7439                                     ARG2L(scan),
7440                                     pm_flags);
7441
7442                         if (!(SvFLAGS(ret)
7443                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7444                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7445                             /* This isn't a first class regexp. Instead, it's
7446                                caching a regexp onto an existing, Perl visible
7447                                scalar.  */
7448                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7449                         }
7450                     }
7451                     SAVEFREESV(re_sv);
7452                     re = ReANY(re_sv);
7453                 }
7454                 RXp_MATCH_COPIED_off(re);
7455                 re->subbeg = rex->subbeg;
7456                 re->sublen = rex->sublen;
7457                 re->suboffset = rex->suboffset;
7458                 re->subcoffset = rex->subcoffset;
7459                 re->lastparen = 0;
7460                 re->lastcloseparen = 0;
7461                 rei = RXi_GET(re);
7462                 DEBUG_EXECUTE_r(
7463                     debug_start_match(re_sv, utf8_target, locinput,
7464                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7465                 );
7466                 startpoint = rei->program + 1;
7467                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7468                                              * close_paren only for GOSUB */
7469                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7470                 /* Save all the seen positions so far. */
7471                 ST.cp = regcppush(rex, 0, maxopenparen);
7472                 REGCP_SET(ST.lastcp);
7473                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7474                 maxopenparen = 0;
7475                 /* run the pattern returned from (??{...}) */
7476
7477               eval_recurse_doit: /* Share code with GOSUB below this line
7478                             * At this point we expect the stack context to be
7479                             * set up correctly */
7480
7481                 /* invalidate the S-L poscache. We're now executing a
7482                  * different set of WHILEM ops (and their associated
7483                  * indexes) against the same string, so the bits in the
7484                  * cache are meaningless. Setting maxiter to zero forces
7485                  * the cache to be invalidated and zeroed before reuse.
7486                  * XXX This is too dramatic a measure. Ideally we should
7487                  * save the old cache and restore when running the outer
7488                  * pattern again */
7489                 reginfo->poscache_maxiter = 0;
7490
7491                 /* the new regexp might have a different is_utf8_pat than we do */
7492                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7493
7494                 ST.prev_rex = rex_sv;
7495                 ST.prev_curlyx = cur_curlyx;
7496                 rex_sv = re_sv;
7497                 SET_reg_curpm(rex_sv);
7498                 rex = re;
7499                 rexi = rei;
7500                 cur_curlyx = NULL;
7501                 ST.B = next;
7502                 ST.prev_eval = cur_eval;
7503                 cur_eval = st;
7504                 /* now continue from first node in postoned RE */
7505                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7506                 NOT_REACHED; /* NOTREACHED */
7507         }
7508
7509         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7510             /* note: this is called twice; first after popping B, then A */
7511             DEBUG_STACK_r({
7512                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7513                     depth, cur_eval, ST.prev_eval);
7514             });
7515
7516 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7517             if ( cur_eval && CUR_EVAL.close_paren ) {\
7518                 DEBUG_STACK_r({ \
7519                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7520                         depth,    \
7521                         CUR_EVAL.close_paren - 1,\
7522                         cur_eval, \
7523                         VAL);     \
7524                 });               \
7525                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7526             }
7527
7528             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7529
7530             rex_sv = ST.prev_rex;
7531             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7532             SET_reg_curpm(rex_sv);
7533             rex = ReANY(rex_sv);
7534             rexi = RXi_GET(rex);
7535             {
7536                 /* preserve $^R across LEAVE's. See Bug 121070. */
7537                 SV *save_sv= GvSV(PL_replgv);
7538                 SvREFCNT_inc(save_sv);
7539                 regcpblow(ST.cp); /* LEAVE in disguise */
7540                 sv_setsv(GvSV(PL_replgv), save_sv);
7541                 SvREFCNT_dec(save_sv);
7542             }
7543             cur_eval = ST.prev_eval;
7544             cur_curlyx = ST.prev_curlyx;
7545
7546             /* Invalidate cache. See "invalidate" comment above. */
7547             reginfo->poscache_maxiter = 0;
7548             if ( nochange_depth )
7549                 nochange_depth--;
7550
7551             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7552             sayYES;
7553
7554
7555         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7556             REGCP_UNWIND(ST.lastcp);
7557             sayNO;
7558
7559         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7560             /* note: this is called twice; first after popping B, then A */
7561             DEBUG_STACK_r({
7562                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7563                     depth, cur_eval, ST.prev_eval);
7564             });
7565
7566             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7567
7568             rex_sv = ST.prev_rex;
7569             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7570             SET_reg_curpm(rex_sv);
7571             rex = ReANY(rex_sv);
7572             rexi = RXi_GET(rex);
7573
7574             REGCP_UNWIND(ST.lastcp);
7575             regcppop(rex, &maxopenparen);
7576             cur_eval = ST.prev_eval;
7577             cur_curlyx = ST.prev_curlyx;
7578
7579             /* Invalidate cache. See "invalidate" comment above. */
7580             reginfo->poscache_maxiter = 0;
7581             if ( nochange_depth )
7582                 nochange_depth--;
7583
7584             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7585             sayNO_SILENT;
7586 #undef ST
7587
7588         case OPEN: /*  (  */
7589             n = ARG(scan);  /* which paren pair */
7590             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7591             if (n > maxopenparen)
7592                 maxopenparen = n;
7593             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7594                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7595                 depth,
7596                 PTR2UV(rex),
7597                 PTR2UV(rex->offs),
7598                 (UV)n,
7599                 (IV)rex->offs[n].start_tmp,
7600                 (UV)maxopenparen
7601             ));
7602             lastopen = n;
7603             break;
7604
7605         case SROPEN: /*  (*SCRIPT_RUN:  */
7606             script_run_begin = (U8 *) locinput;
7607             break;
7608
7609
7610         case CLOSE:  /*  )  */
7611             n = ARG(scan);  /* which paren pair */
7612             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7613                              locinput - reginfo->strbeg);
7614             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7615                 goto fake_end;
7616
7617             break;
7618
7619         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7620
7621             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7622             {
7623                 sayNO;
7624             }
7625
7626             break;
7627
7628
7629         case ACCEPT:  /*  (*ACCEPT)  */
7630             if (scan->flags)
7631                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7632             if (ARG2L(scan)){
7633                 regnode *cursor;
7634                 for (cursor=scan;
7635                      cursor && OP(cursor)!=END;
7636                      cursor=regnext(cursor))
7637                 {
7638                     if ( OP(cursor)==CLOSE ){
7639                         n = ARG(cursor);
7640                         if ( n <= lastopen ) {
7641                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7642                                              locinput - reginfo->strbeg);
7643                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7644                                 break;
7645                         }
7646                     }
7647                 }
7648             }
7649             goto fake_end;
7650             /* NOTREACHED */
7651
7652         case GROUPP:  /*  (?(1))  */
7653             n = ARG(scan);  /* which paren pair */
7654             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7655             break;
7656
7657         case NGROUPP:  /*  (?(<name>))  */
7658             /* reg_check_named_buff_matched returns 0 for no match */
7659             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7660             break;
7661
7662         case INSUBP:   /*  (?(R))  */
7663             n = ARG(scan);
7664             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7665              * of SCAN is already set up as matches a eval.close_paren */
7666             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7667             break;
7668
7669         case DEFINEP:  /*  (?(DEFINE))  */
7670             sw = 0;
7671             break;
7672
7673         case IFTHEN:   /*  (?(cond)A|B)  */
7674             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7675             if (sw)
7676                 next = NEXTOPER(NEXTOPER(scan));
7677             else {
7678                 next = scan + ARG(scan);
7679                 if (OP(next) == IFTHEN) /* Fake one. */
7680                     next = NEXTOPER(NEXTOPER(next));
7681             }
7682             break;
7683
7684         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7685             logical = scan->flags;
7686             break;
7687
7688 /*******************************************************************
7689
7690 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7691 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7692 STAR/PLUS/CURLY/CURLYN are used instead.)
7693
7694 A*B is compiled as <CURLYX><A><WHILEM><B>
7695
7696 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7697 state, which contains the current count, initialised to -1. It also sets
7698 cur_curlyx to point to this state, with any previous value saved in the
7699 state block.
7700
7701 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7702 since the pattern may possibly match zero times (i.e. it's a while {} loop
7703 rather than a do {} while loop).
7704
7705 Each entry to WHILEM represents a successful match of A. The count in the
7706 CURLYX block is incremented, another WHILEM state is pushed, and execution
7707 passes to A or B depending on greediness and the current count.
7708
7709 For example, if matching against the string a1a2a3b (where the aN are
7710 substrings that match /A/), then the match progresses as follows: (the
7711 pushed states are interspersed with the bits of strings matched so far):
7712
7713     <CURLYX cnt=-1>
7714     <CURLYX cnt=0><WHILEM>
7715     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7716     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7717     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7718     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7719
7720 (Contrast this with something like CURLYM, which maintains only a single
7721 backtrack state:
7722
7723     <CURLYM cnt=0> a1
7724     a1 <CURLYM cnt=1> a2
7725     a1 a2 <CURLYM cnt=2> a3
7726     a1 a2 a3 <CURLYM cnt=3> b
7727 )
7728
7729 Each WHILEM state block marks a point to backtrack to upon partial failure
7730 of A or B, and also contains some minor state data related to that
7731 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7732 overall state, such as the count, and pointers to the A and B ops.
7733
7734 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7735 must always point to the *current* CURLYX block, the rules are:
7736
7737 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7738 and set cur_curlyx to point the new block.
7739
7740 When popping the CURLYX block after a successful or unsuccessful match,
7741 restore the previous cur_curlyx.
7742
7743 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7744 to the outer one saved in the CURLYX block.
7745
7746 When popping the WHILEM block after a successful or unsuccessful B match,
7747 restore the previous cur_curlyx.
7748
7749 Here's an example for the pattern (AI* BI)*BO
7750 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7751
7752 cur_
7753 curlyx backtrack stack
7754 ------ ---------------
7755 NULL
7756 CO     <CO prev=NULL> <WO>
7757 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7758 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7759 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7760
7761 At this point the pattern succeeds, and we work back down the stack to
7762 clean up, restoring as we go:
7763
7764 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7765 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7766 CO     <CO prev=NULL> <WO>
7767 NULL
7768
7769 *******************************************************************/
7770
7771 #define ST st->u.curlyx
7772
7773         case CURLYX:    /* start of /A*B/  (for complex A) */
7774         {
7775             /* No need to save/restore up to this paren */
7776             I32 parenfloor = scan->flags;
7777
7778             assert(next); /* keep Coverity happy */
7779             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7780                 next += ARG(next);
7781
7782             /* XXXX Probably it is better to teach regpush to support
7783                parenfloor > maxopenparen ... */
7784             if (parenfloor > (I32)rex->lastparen)
7785                 parenfloor = rex->lastparen; /* Pessimization... */
7786
7787             ST.prev_curlyx= cur_curlyx;
7788             cur_curlyx = st;
7789             ST.cp = PL_savestack_ix;
7790
7791             /* these fields contain the state of the current curly.
7792              * they are accessed by subsequent WHILEMs */
7793             ST.parenfloor = parenfloor;
7794             ST.me = scan;
7795             ST.B = next;
7796             ST.minmod = minmod;
7797             minmod = 0;
7798             ST.count = -1;      /* this will be updated by WHILEM */
7799             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7800
7801             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7802             NOT_REACHED; /* NOTREACHED */
7803         }
7804
7805         case CURLYX_end: /* just finished matching all of A*B */
7806             cur_curlyx = ST.prev_curlyx;
7807             sayYES;
7808             NOT_REACHED; /* NOTREACHED */
7809
7810         case CURLYX_end_fail: /* just failed to match all of A*B */
7811             regcpblow(ST.cp);
7812             cur_curlyx = ST.prev_curlyx;
7813             sayNO;
7814             NOT_REACHED; /* NOTREACHED */
7815
7816
7817 #undef ST
7818 #define ST st->u.whilem
7819
7820         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7821         {
7822             /* see the discussion above about CURLYX/WHILEM */
7823             I32 n;
7824             int min, max;
7825             regnode *A;
7826
7827             assert(cur_curlyx); /* keep Coverity happy */
7828
7829             min = ARG1(cur_curlyx->u.curlyx.me);
7830             max = ARG2(cur_curlyx->u.curlyx.me);
7831             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7832             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7833             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7834             ST.cache_offset = 0;
7835             ST.cache_mask = 0;
7836
7837
7838             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7839                   depth, (long)n, min, max)
7840             );
7841
7842             /* First just match a string of min A's. */
7843
7844             if (n < min) {
7845                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7846                 cur_curlyx->u.curlyx.lastloc = locinput;
7847                 REGCP_SET(ST.lastcp);
7848
7849                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7850                 NOT_REACHED; /* NOTREACHED */
7851             }
7852
7853             /* If degenerate A matches "", assume A done. */
7854
7855             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7856                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7857                    depth)
7858                 );
7859                 goto do_whilem_B_max;
7860             }
7861
7862             /* super-linear cache processing.
7863              *
7864              * The idea here is that for certain types of CURLYX/WHILEM -
7865              * principally those whose upper bound is infinity (and
7866              * excluding regexes that have things like \1 and other very
7867              * non-regular expresssiony things), then if a pattern like
7868              * /....A*.../ fails and we backtrack to the WHILEM, then we
7869              * make a note that this particular WHILEM op was at string
7870              * position 47 (say) when the rest of pattern failed. Then, if
7871              * we ever find ourselves back at that WHILEM, and at string
7872              * position 47 again, we can just fail immediately rather than
7873              * running the rest of the pattern again.
7874              *
7875              * This is very handy when patterns start to go
7876              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7877              * with a combinatorial explosion of backtracking.
7878              *
7879              * The cache is implemented as a bit array, with one bit per
7880              * string byte position per WHILEM op (up to 16) - so its
7881              * between 0.25 and 2x the string size.
7882              *
7883              * To avoid allocating a poscache buffer every time, we do an
7884              * initially countdown; only after we have  executed a WHILEM
7885              * op (string-length x #WHILEMs) times do we allocate the
7886              * cache.
7887              *
7888              * The top 4 bits of scan->flags byte say how many different
7889              * relevant CURLLYX/WHILEM op pairs there are, while the
7890              * bottom 4-bits is the identifying index number of this
7891              * WHILEM.
7892              */
7893
7894             if (scan->flags) {
7895
7896                 if (!reginfo->poscache_maxiter) {
7897                     /* start the countdown: Postpone detection until we
7898                      * know the match is not *that* much linear. */
7899                     reginfo->poscache_maxiter
7900                         =    (reginfo->strend - reginfo->strbeg + 1)
7901                            * (scan->flags>>4);
7902                     /* possible overflow for long strings and many CURLYX's */
7903                     if (reginfo->poscache_maxiter < 0)
7904                         reginfo->poscache_maxiter = I32_MAX;
7905                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7906                 }
7907
7908                 if (reginfo->poscache_iter-- == 0) {
7909                     /* initialise cache */
7910                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7911                     regmatch_info_aux *const aux = reginfo->info_aux;
7912                     if (aux->poscache) {
7913                         if ((SSize_t)reginfo->poscache_size < size) {
7914                             Renew(aux->poscache, size, char);
7915                             reginfo->poscache_size = size;
7916                         }
7917                         Zero(aux->poscache, size, char);
7918                     }
7919                     else {
7920                         reginfo->poscache_size = size;
7921                         Newxz(aux->poscache, size, char);
7922                     }
7923                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7924       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7925                               PL_colors[4], PL_colors[5])
7926                     );
7927                 }
7928
7929                 if (reginfo->poscache_iter < 0) {
7930                     /* have we already failed at this position? */
7931                     SSize_t offset, mask;
7932
7933                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7934                     offset  = (scan->flags & 0xf) - 1
7935                                 +   (locinput - reginfo->strbeg)
7936                                   * (scan->flags>>4);
7937                     mask    = 1 << (offset % 8);
7938                     offset /= 8;
7939                     if (reginfo->info_aux->poscache[offset] & mask) {
7940                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
7941                             depth)
7942                         );
7943                         cur_curlyx->u.curlyx.count--;
7944                         sayNO; /* cache records failure */
7945                     }
7946                     ST.cache_offset = offset;
7947                     ST.cache_mask   = mask;
7948                 }
7949             }
7950
7951             /* Prefer B over A for minimal matching. */
7952
7953             if (cur_curlyx->u.curlyx.minmod) {
7954                 ST.save_curlyx = cur_curlyx;
7955                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7956                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7957                                     locinput);
7958                 NOT_REACHED; /* NOTREACHED */
7959             }
7960
7961             /* Prefer A over B for maximal matching. */
7962
7963             if (n < max) { /* More greed allowed? */
7964                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7965                             maxopenparen);
7966                 cur_curlyx->u.curlyx.lastloc = locinput;
7967                 REGCP_SET(ST.lastcp);
7968                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7969                 NOT_REACHED; /* NOTREACHED */
7970             }
7971             goto do_whilem_B_max;
7972         }
7973         NOT_REACHED; /* NOTREACHED */
7974
7975         case WHILEM_B_min: /* just matched B in a minimal match */
7976         case WHILEM_B_max: /* just matched B in a maximal match */
7977             cur_curlyx = ST.save_curlyx;
7978             sayYES;
7979             NOT_REACHED; /* NOTREACHED */
7980
7981         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7982             cur_curlyx = ST.save_curlyx;
7983             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7984             cur_curlyx->u.curlyx.count--;
7985             CACHEsayNO;
7986             NOT_REACHED; /* NOTREACHED */
7987
7988         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7989             /* FALLTHROUGH */
7990         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7991             REGCP_UNWIND(ST.lastcp);
7992             regcppop(rex, &maxopenparen);
7993             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7994             cur_curlyx->u.curlyx.count--;
7995             CACHEsayNO;
7996             NOT_REACHED; /* NOTREACHED */
7997
7998         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7999             REGCP_UNWIND(ST.lastcp);
8000             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8001             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8002                 depth)
8003             );
8004           do_whilem_B_max:
8005             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8006                 && ckWARN(WARN_REGEXP)
8007                 && !reginfo->warned)
8008             {
8009                 reginfo->warned = TRUE;
8010                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8011                      "Complex regular subexpression recursion limit (%d) "
8012                      "exceeded",
8013                      REG_INFTY - 1);
8014             }
8015
8016             /* now try B */
8017             ST.save_curlyx = cur_curlyx;
8018             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8019             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8020                                 locinput);
8021             NOT_REACHED; /* NOTREACHED */
8022
8023         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8024             cur_curlyx = ST.save_curlyx;
8025
8026             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8027                 /* Maximum greed exceeded */
8028                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8029                     && ckWARN(WARN_REGEXP)
8030                     && !reginfo->warned)
8031                 {
8032                     reginfo->warned     = TRUE;
8033                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8034                         "Complex regular subexpression recursion "
8035                         "limit (%d) exceeded",
8036                         REG_INFTY - 1);
8037                 }
8038                 cur_curlyx->u.curlyx.count--;
8039                 CACHEsayNO;
8040             }
8041
8042             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8043             );
8044             /* Try grabbing another A and see if it helps. */
8045             cur_curlyx->u.curlyx.lastloc = locinput;
8046             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8047                             maxopenparen);
8048             REGCP_SET(ST.lastcp);
8049             PUSH_STATE_GOTO(WHILEM_A_min,
8050                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8051                 locinput);
8052             NOT_REACHED; /* NOTREACHED */
8053
8054 #undef  ST
8055 #define ST st->u.branch
8056
8057         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8058             next = scan + ARG(scan);
8059             if (next == scan)
8060                 next = NULL;
8061             scan = NEXTOPER(scan);
8062             /* FALLTHROUGH */
8063
8064         case BRANCH:        /*  /(...|A|...)/ */
8065             scan = NEXTOPER(scan); /* scan now points to inner node */
8066             ST.lastparen = rex->lastparen;
8067             ST.lastcloseparen = rex->lastcloseparen;
8068             ST.next_branch = next;
8069             REGCP_SET(ST.cp);
8070
8071             /* Now go into the branch */
8072             if (has_cutgroup) {
8073                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8074             } else {
8075                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8076             }
8077             NOT_REACHED; /* NOTREACHED */
8078
8079         case CUTGROUP:  /*  /(*THEN)/  */
8080             sv_yes_mark = st->u.mark.mark_name = scan->flags
8081                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8082                 : NULL;
8083             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8084             NOT_REACHED; /* NOTREACHED */
8085
8086         case CUTGROUP_next_fail:
8087             do_cutgroup = 1;
8088             no_final = 1;
8089             if (st->u.mark.mark_name)
8090                 sv_commit = st->u.mark.mark_name;
8091             sayNO;
8092             NOT_REACHED; /* NOTREACHED */
8093
8094         case BRANCH_next:
8095             sayYES;
8096             NOT_REACHED; /* NOTREACHED */
8097
8098         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8099             if (do_cutgroup) {
8100                 do_cutgroup = 0;
8101                 no_final = 0;
8102             }
8103             REGCP_UNWIND(ST.cp);
8104             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8105             scan = ST.next_branch;
8106             /* no more branches? */
8107             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8108                 DEBUG_EXECUTE_r({
8109                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8110                         depth,
8111                         PL_colors[4],
8112                         PL_colors[5] );
8113                 });
8114                 sayNO_SILENT;
8115             }
8116             continue; /* execute next BRANCH[J] op */
8117             /* NOTREACHED */
8118
8119         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8120             minmod = 1;
8121             break;
8122
8123 #undef  ST
8124 #define ST st->u.curlym
8125
8126         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8127
8128             /* This is an optimisation of CURLYX that enables us to push
8129              * only a single backtracking state, no matter how many matches
8130              * there are in {m,n}. It relies on the pattern being constant
8131              * length, with no parens to influence future backrefs
8132              */
8133
8134             ST.me = scan;
8135             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8136
8137             ST.lastparen      = rex->lastparen;
8138             ST.lastcloseparen = rex->lastcloseparen;
8139
8140             /* if paren positive, emulate an OPEN/CLOSE around A */
8141             if (ST.me->flags) {
8142                 U32 paren = ST.me->flags;
8143                 if (paren > maxopenparen)
8144                     maxopenparen = paren;
8145                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8146             }
8147             ST.A = scan;
8148             ST.B = next;
8149             ST.alen = 0;
8150             ST.count = 0;
8151             ST.minmod = minmod;
8152             minmod = 0;
8153             ST.c1 = CHRTEST_UNINIT;
8154             REGCP_SET(ST.cp);
8155
8156             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8157                 goto curlym_do_B;
8158
8159           curlym_do_A: /* execute the A in /A{m,n}B/  */
8160             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8161             NOT_REACHED; /* NOTREACHED */
8162
8163         case CURLYM_A: /* we've just matched an A */
8164             ST.count++;
8165             /* after first match, determine A's length: u.curlym.alen */
8166             if (ST.count == 1) {
8167                 if (reginfo->is_utf8_target) {
8168                     char *s = st->locinput;
8169                     while (s < locinput) {
8170                         ST.alen++;
8171                         s += UTF8SKIP(s);
8172                     }
8173                 }
8174                 else {
8175                     ST.alen = locinput - st->locinput;
8176                 }
8177                 if (ST.alen == 0)
8178                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8179             }
8180             DEBUG_EXECUTE_r(
8181                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8182                           depth, (IV) ST.count, (IV)ST.alen)
8183             );
8184
8185             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8186                 goto fake_end;
8187
8188             {
8189                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8190                 if ( max == REG_INFTY || ST.count < max )
8191                     goto curlym_do_A; /* try to match another A */
8192             }
8193             goto curlym_do_B; /* try to match B */
8194
8195         case CURLYM_A_fail: /* just failed to match an A */
8196             REGCP_UNWIND(ST.cp);
8197
8198
8199             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8200                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8201                 sayNO;
8202
8203           curlym_do_B: /* execute the B in /A{m,n}B/  */
8204             if (ST.c1 == CHRTEST_UNINIT) {
8205                 /* calculate c1 and c2 for possible match of 1st char
8206                  * following curly */
8207                 ST.c1 = ST.c2 = CHRTEST_VOID;
8208                 assert(ST.B);
8209                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8210                     regnode *text_node = ST.B;
8211                     if (! HAS_TEXT(text_node))
8212                         FIND_NEXT_IMPT(text_node);
8213                     /* this used to be
8214
8215                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8216
8217                         But the former is redundant in light of the latter.
8218
8219                         if this changes back then the macro for
8220                         IS_TEXT and friends need to change.
8221                      */
8222                     if (PL_regkind[OP(text_node)] == EXACT) {
8223                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8224                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8225                            reginfo))
8226                         {
8227                             sayNO;
8228                         }
8229                     }
8230                 }
8231             }
8232
8233             DEBUG_EXECUTE_r(
8234                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8235                     depth, (IV)ST.count)
8236                 );
8237             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8238                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8239                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8240                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8241                     {
8242                         /* simulate B failing */
8243                         DEBUG_OPTIMISE_r(
8244                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8245                                 depth,
8246                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8247                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8248                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8249                         );
8250                         state_num = CURLYM_B_fail;
8251                         goto reenter_switch;
8252                     }
8253                 }
8254                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8255                     /* simulate B failing */
8256                     DEBUG_OPTIMISE_r(
8257                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8258                             depth,
8259                             (int) nextchr, ST.c1, ST.c2)
8260                     );
8261                     state_num = CURLYM_B_fail;
8262                     goto reenter_switch;
8263                 }
8264             }
8265
8266             if (ST.me->flags) {
8267                 /* emulate CLOSE: mark current A as captured */
8268                 U32 paren = (U32)ST.me->flags;
8269                 if (ST.count) {
8270                     CLOSE_CAPTURE(paren,
8271                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8272                         locinput - reginfo->strbeg);
8273                 }
8274                 else
8275                     rex->offs[paren].end = -1;
8276
8277                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8278                 {
8279                     if (ST.count)
8280                         goto fake_end;
8281                     else
8282                         sayNO;
8283                 }
8284             }
8285
8286             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8287             NOT_REACHED; /* NOTREACHED */
8288
8289         case CURLYM_B_fail: /* just failed to match a B */
8290             REGCP_UNWIND(ST.cp);
8291             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8292             if (ST.minmod) {
8293                 I32 max = ARG2(ST.me);
8294                 if (max != REG_INFTY && ST.count == max)
8295                     sayNO;
8296                 goto curlym_do_A; /* try to match a further A */
8297             }
8298             /* backtrack one A */
8299             if (ST.count == ARG1(ST.me) /* min */)
8300                 sayNO;
8301             ST.count--;
8302             SET_locinput(HOPc(locinput, -ST.alen));
8303             goto curlym_do_B; /* try to match B */
8304
8305 #undef ST
8306 #define ST st->u.curly
8307
8308 #define CURLY_SETPAREN(paren, success) \
8309     if (paren) { \
8310         if (success) { \
8311             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8312                                  locinput - reginfo->strbeg); \
8313         } \
8314         else { \
8315             rex->offs[paren].end = -1; \
8316             rex->lastparen      = ST.lastparen; \
8317             rex->lastcloseparen = ST.lastcloseparen; \
8318         } \
8319     }
8320
8321         case STAR:              /*  /A*B/ where A is width 1 char */
8322             ST.paren = 0;
8323             ST.min = 0;
8324             ST.max = REG_INFTY;
8325             scan = NEXTOPER(scan);
8326             goto repeat;
8327
8328         case PLUS:              /*  /A+B/ where A is width 1 char */
8329             ST.paren = 0;
8330             ST.min = 1;
8331             ST.max = REG_INFTY;
8332             scan = NEXTOPER(scan);
8333             goto repeat;
8334
8335         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8336             ST.paren = scan->flags;     /* Which paren to set */
8337             ST.lastparen      = rex->lastparen;
8338             ST.lastcloseparen = rex->lastcloseparen;
8339             if (ST.paren > maxopenparen)
8340                 maxopenparen = ST.paren;
8341             ST.min = ARG1(scan);  /* min to match */
8342             ST.max = ARG2(scan);  /* max to match */
8343             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8344
8345             /* handle the single-char capture called as a GOSUB etc */
8346             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8347             {
8348                 char *li = locinput;
8349                 if (!regrepeat(rex, &li, scan, reginfo, 1))
8350                     sayNO;
8351                 SET_locinput(li);
8352                 goto fake_end;
8353             }
8354
8355             goto repeat;
8356
8357         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8358             ST.paren = 0;
8359             ST.min = ARG1(scan);  /* min to match */
8360             ST.max = ARG2(scan);  /* max to match */
8361             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8362           repeat:
8363             /*
8364             * Lookahead to avoid useless match attempts
8365             * when we know what character comes next.
8366             *
8367             * Used to only do .*x and .*?x, but now it allows
8368             * for )'s, ('s and (?{ ... })'s to be in the way
8369             * of the quantifier and the EXACT-like node.  -- japhy
8370             */
8371
8372             assert(ST.min <= ST.max);
8373             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8374                 ST.c1 = ST.c2 = CHRTEST_VOID;
8375             }
8376             else {
8377                 regnode *text_node = next;
8378
8379                 if (! HAS_TEXT(text_node))
8380                     FIND_NEXT_IMPT(text_node);
8381
8382                 if (! HAS_TEXT(text_node))
8383                     ST.c1 = ST.c2 = CHRTEST_VOID;
8384                 else {
8385                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8386                         ST.c1 = ST.c2 = CHRTEST_VOID;
8387                     }
8388                     else {
8389
8390                     /*  Currently we only get here when
8391
8392                         PL_rekind[OP(text_node)] == EXACT
8393
8394                         if this changes back then the macro for IS_TEXT and
8395                         friends need to change. */
8396                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8397                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8398                            reginfo))
8399                         {
8400                             sayNO;
8401                         }
8402                     }
8403                 }
8404             }
8405
8406             ST.A = scan;
8407             ST.B = next;
8408             if (minmod) {
8409                 char *li = locinput;
8410                 minmod = 0;
8411                 if (ST.min &&
8412                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8413                             < ST.min)
8414                     sayNO;
8415                 SET_locinput(li);
8416                 ST.count = ST.min;
8417                 REGCP_SET(ST.cp);
8418                 if (ST.c1 == CHRTEST_VOID)
8419                     goto curly_try_B_min;
8420
8421                 ST.oldloc = locinput;
8422
8423                 /* set ST.maxpos to the furthest point along the
8424                  * string that could possibly match */
8425                 if  (ST.max == REG_INFTY) {
8426                     ST.maxpos = reginfo->strend - 1;
8427                     if (utf8_target)
8428                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8429                             ST.maxpos--;
8430                 }
8431                 else if (utf8_target) {
8432                     int m = ST.max - ST.min;
8433                     for (ST.maxpos = locinput;
8434                          m >0 && ST.maxpos < reginfo->strend; m--)
8435                         ST.maxpos += UTF8SKIP(ST.maxpos);
8436                 }
8437                 else {
8438                     ST.maxpos = locinput + ST.max - ST.min;
8439                     if (ST.maxpos >= reginfo->strend)
8440                         ST.maxpos = reginfo->strend - 1;
8441                 }
8442                 goto curly_try_B_min_known;
8443
8444             }
8445             else {
8446                 /* avoid taking address of locinput, so it can remain
8447                  * a register var */
8448                 char *li = locinput;
8449                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8450                 if (ST.count < ST.min)
8451                     sayNO;
8452                 SET_locinput(li);
8453                 if ((ST.count > ST.min)
8454                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8455                 {
8456                     /* A{m,n} must come at the end of the string, there's
8457                      * no point in backing off ... */
8458                     ST.min = ST.count;
8459                     /* ...except that $ and \Z can match before *and* after
8460                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8461                        We may back off by one in this case. */
8462                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8463                         ST.min--;
8464                 }
8465                 REGCP_SET(ST.cp);
8466                 goto curly_try_B_max;
8467             }
8468             NOT_REACHED; /* NOTREACHED */
8469
8470         case CURLY_B_min_fail:
8471             /* failed to find B in a non-greedy match.
8472              * Handles both cases where c1,c2 valid or not */
8473
8474             REGCP_UNWIND(ST.cp);
8475             if (ST.paren) {
8476                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8477             }
8478
8479             if (ST.c1 == CHRTEST_VOID) {
8480                 /* failed -- move forward one */
8481                 char *li = locinput;
8482                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8483                     sayNO;
8484                 }
8485                 locinput = li;
8486                 ST.count++;
8487                 if (!(   ST.count <= ST.max
8488                         /* count overflow ? */
8489                      || (ST.max == REG_INFTY && ST.count > 0))
8490                 )
8491                     sayNO;
8492             }
8493             else {
8494                 int n;
8495                 /* Couldn't or didn't -- move forward. */
8496                 ST.oldloc = locinput;
8497                 if (utf8_target)
8498                     locinput += UTF8SKIP(locinput);
8499                 else
8500                     locinput++;
8501                 ST.count++;
8502
8503               curly_try_B_min_known:
8504                 /* find the next place where 'B' could work, then call B */
8505                 if (utf8_target) {
8506                     n = (ST.oldloc == locinput) ? 0 : 1;
8507                     if (ST.c1 == ST.c2) {
8508                         /* set n to utf8_distance(oldloc, locinput) */
8509                         while (locinput <= ST.maxpos
8510                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8511                         {
8512                             locinput += UTF8SKIP(locinput);
8513                             n++;
8514                         }
8515                     }
8516                     else {
8517                         /* set n to utf8_distance(oldloc, locinput) */
8518                         while (locinput <= ST.maxpos
8519                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8520                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8521                         {
8522                             locinput += UTF8SKIP(locinput);
8523                             n++;
8524                         }
8525                     }
8526                 }
8527                 else {  /* Not utf8_target */
8528                     if (ST.c1 == ST.c2) {
8529                         locinput = (char *) memchr(locinput,
8530                                                    ST.c1,
8531                                                    ST.maxpos + 1 - locinput);
8532                         if (! locinput) {
8533                             locinput = ST.maxpos + 1;
8534                         }
8535                     }
8536                     else {
8537                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8538
8539                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8540                             while (   locinput <= ST.maxpos
8541                                    && UCHARAT(locinput) != ST.c1
8542                                    && UCHARAT(locinput) != ST.c2)
8543                             {
8544                                 locinput++;
8545                             }
8546                         }
8547                         else {
8548                             /* If c1 and c2 only differ by a single bit, we can
8549                              * avoid a conditional each time through the loop,
8550                              * at the expense of a little preliminary setup and
8551                              * an extra mask each iteration.  By masking out
8552                              * that bit, we match exactly two characters, c1
8553                              * and c2, and so we don't have to test for both.
8554                              * On both ASCII and EBCDIC platforms, most of the
8555                              * ASCII-range and Latin1-range folded equivalents
8556                              * differ only in a single bit, so this is actually
8557                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8558                              * 0x61). */
8559                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8560                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8561                             while (   locinput <= ST.maxpos
8562                                    && (UCHARAT(locinput) & c1_c2_mask)
8563                                                                 != c1_masked)
8564                             {
8565                                 locinput++;
8566                             }
8567                         }
8568                     }
8569                     n = locinput - ST.oldloc;
8570                 }
8571                 if (locinput > ST.maxpos)
8572                     sayNO;
8573                 if (n) {
8574                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8575                      * at b; check that everything between oldloc and
8576                      * locinput matches */
8577                     char *li = ST.oldloc;
8578                     ST.count += n;
8579                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8580                         sayNO;
8581                     assert(n == REG_INFTY || locinput == li);
8582                 }
8583             }
8584
8585           curly_try_B_min:
8586             CURLY_SETPAREN(ST.paren, ST.count);
8587             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8588             NOT_REACHED; /* NOTREACHED */
8589
8590
8591           curly_try_B_max:
8592             /* a successful greedy match: now try to match B */
8593             {
8594                 bool could_match = locinput < reginfo->strend;
8595
8596                 /* If it could work, try it. */
8597                 if (ST.c1 != CHRTEST_VOID && could_match) {
8598                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8599                     {
8600                         could_match = memEQ(locinput,
8601                                             ST.c1_utf8,
8602                                             UTF8SKIP(locinput))
8603                                     || memEQ(locinput,
8604                                              ST.c2_utf8,
8605                                              UTF8SKIP(locinput));
8606                     }
8607                     else {
8608                         could_match = UCHARAT(locinput) == ST.c1
8609                                       || UCHARAT(locinput) == ST.c2;
8610                     }
8611                 }
8612                 if (ST.c1 == CHRTEST_VOID || could_match) {
8613                     CURLY_SETPAREN(ST.paren, ST.count);
8614                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8615                     NOT_REACHED; /* NOTREACHED */
8616                 }
8617             }
8618             /* FALLTHROUGH */
8619
8620         case CURLY_B_max_fail:
8621             /* failed to find B in a greedy match */
8622
8623             REGCP_UNWIND(ST.cp);
8624             if (ST.paren) {
8625                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8626             }
8627             /*  back up. */
8628             if (--ST.count < ST.min)
8629                 sayNO;
8630             locinput = HOPc(locinput, -1);
8631             goto curly_try_B_max;
8632
8633 #undef ST
8634
8635         case END: /*  last op of main pattern  */
8636           fake_end:
8637             if (cur_eval) {
8638                 /* we've just finished A in /(??{A})B/; now continue with B */
8639                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8640                 st->u.eval.prev_rex = rex_sv;           /* inner */
8641
8642                 /* Save *all* the positions. */
8643                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8644                 rex_sv = CUR_EVAL.prev_rex;
8645                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8646                 SET_reg_curpm(rex_sv);
8647                 rex = ReANY(rex_sv);
8648                 rexi = RXi_GET(rex);
8649
8650                 st->u.eval.prev_curlyx = cur_curlyx;
8651                 cur_curlyx = CUR_EVAL.prev_curlyx;
8652
8653                 REGCP_SET(st->u.eval.lastcp);
8654
8655                 /* Restore parens of the outer rex without popping the
8656                  * savestack */
8657                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8658
8659                 st->u.eval.prev_eval = cur_eval;
8660                 cur_eval = CUR_EVAL.prev_eval;
8661                 DEBUG_EXECUTE_r(
8662                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8663                                       depth, cur_eval););
8664                 if ( nochange_depth )
8665                     nochange_depth--;
8666
8667                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8668
8669                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8670                                     locinput); /* match B */
8671             }
8672
8673             if (locinput < reginfo->till) {
8674                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8675                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8676                                       PL_colors[4],
8677                                       (long)(locinput - startpos),
8678                                       (long)(reginfo->till - startpos),
8679                                       PL_colors[5]));
8680
8681                 sayNO_SILENT;           /* Cannot match: too short. */
8682             }
8683             sayYES;                     /* Success! */
8684
8685         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8686             DEBUG_EXECUTE_r(
8687             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8688                 depth, PL_colors[4], PL_colors[5]));
8689             sayYES;                     /* Success! */
8690
8691 #undef  ST
8692 #define ST st->u.ifmatch
8693
8694         {
8695             char *newstart;
8696
8697         case SUSPEND:   /* (?>A) */
8698             ST.wanted = 1;
8699             newstart = locinput;
8700             goto do_ifmatch;
8701
8702         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8703             ST.wanted = 0;
8704             goto ifmatch_trivial_fail_test;
8705
8706         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8707             ST.wanted = 1;
8708           ifmatch_trivial_fail_test:
8709             if (scan->flags) {
8710                 char * const s = HOPBACKc(locinput, scan->flags);
8711                 if (!s) {
8712                     /* trivial fail */
8713                     if (logical) {
8714                         logical = 0;
8715                         sw = 1 - cBOOL(ST.wanted);
8716                     }
8717                     else if (ST.wanted)
8718                         sayNO;
8719                     next = scan + ARG(scan);
8720                     if (next == scan)
8721                         next = NULL;
8722                     break;
8723                 }
8724                 newstart = s;
8725             }
8726             else
8727                 newstart = locinput;
8728
8729           do_ifmatch:
8730             ST.me = scan;
8731             ST.logical = logical;
8732             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8733
8734             /* execute body of (?...A) */
8735             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8736             NOT_REACHED; /* NOTREACHED */
8737         }
8738
8739         case IFMATCH_A_fail: /* body of (?...A) failed */
8740             ST.wanted = !ST.wanted;
8741             /* FALLTHROUGH */
8742
8743         case IFMATCH_A: /* body of (?...A) succeeded */
8744             if (ST.logical) {
8745                 sw = cBOOL(ST.wanted);
8746             }
8747             else if (!ST.wanted)
8748                 sayNO;
8749
8750             if (OP(ST.me) != SUSPEND) {
8751                 /* restore old position except for (?>...) */
8752                 locinput = st->locinput;
8753             }
8754             scan = ST.me + ARG(ST.me);
8755             if (scan == ST.me)
8756                 scan = NULL;
8757             continue; /* execute B */
8758
8759 #undef ST
8760
8761         case LONGJMP: /*  alternative with many branches compiles to
8762                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8763             next = scan + ARG(scan);
8764             if (next == scan)
8765                 next = NULL;
8766             break;
8767
8768         case COMMIT:  /*  (*COMMIT)  */
8769             reginfo->cutpoint = reginfo->strend;
8770             /* FALLTHROUGH */
8771
8772         case PRUNE:   /*  (*PRUNE)   */
8773             if (scan->flags)
8774                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8775             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8776             NOT_REACHED; /* NOTREACHED */
8777
8778         case COMMIT_next_fail:
8779             no_final = 1;
8780             /* FALLTHROUGH */
8781             sayNO;
8782             NOT_REACHED; /* NOTREACHED */
8783
8784         case OPFAIL:   /* (*FAIL)  */
8785             if (scan->flags)
8786                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8787             if (logical) {
8788                 /* deal with (?(?!)X|Y) properly,
8789                  * make sure we trigger the no branch
8790                  * of the trailing IFTHEN structure*/
8791                 sw= 0;
8792                 break;
8793             } else {
8794                 sayNO;
8795             }
8796             NOT_REACHED; /* NOTREACHED */
8797
8798 #define ST st->u.mark
8799         case MARKPOINT: /*  (*MARK:foo)  */
8800             ST.prev_mark = mark_state;
8801             ST.mark_name = sv_commit = sv_yes_mark
8802                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8803             mark_state = st;
8804             ST.mark_loc = locinput;
8805             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8806             NOT_REACHED; /* NOTREACHED */
8807
8808         case MARKPOINT_next:
8809             mark_state = ST.prev_mark;
8810             sayYES;
8811             NOT_REACHED; /* NOTREACHED */
8812
8813         case MARKPOINT_next_fail:
8814             if (popmark && sv_eq(ST.mark_name,popmark))
8815             {
8816                 if (ST.mark_loc > startpoint)
8817                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8818                 popmark = NULL; /* we found our mark */
8819                 sv_commit = ST.mark_name;
8820
8821                 DEBUG_EXECUTE_r({
8822                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8823                             depth,
8824                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8825                 });
8826             }
8827             mark_state = ST.prev_mark;
8828             sv_yes_mark = mark_state ?
8829                 mark_state->u.mark.mark_name : NULL;
8830             sayNO;
8831             NOT_REACHED; /* NOTREACHED */
8832
8833         case SKIP:  /*  (*SKIP)  */
8834             if (!scan->flags) {
8835                 /* (*SKIP) : if we fail we cut here*/
8836                 ST.mark_name = NULL;
8837                 ST.mark_loc = locinput;
8838                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8839             } else {
8840                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8841                    otherwise do nothing.  Meaning we need to scan
8842                  */
8843                 regmatch_state *cur = mark_state;
8844                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8845
8846                 while (cur) {
8847                     if ( sv_eq( cur->u.mark.mark_name,
8848                                 find ) )
8849                     {
8850                         ST.mark_name = find;
8851                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8852                     }
8853                     cur = cur->u.mark.prev_mark;
8854                 }
8855             }
8856             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8857             break;
8858
8859         case SKIP_next_fail:
8860             if (ST.mark_name) {
8861                 /* (*CUT:NAME) - Set up to search for the name as we
8862                    collapse the stack*/
8863                 popmark = ST.mark_name;
8864             } else {
8865                 /* (*CUT) - No name, we cut here.*/
8866                 if (ST.mark_loc > startpoint)
8867                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8868                 /* but we set sv_commit to latest mark_name if there
8869                    is one so they can test to see how things lead to this
8870                    cut */
8871                 if (mark_state)
8872                     sv_commit=mark_state->u.mark.mark_name;
8873             }
8874             no_final = 1;
8875             sayNO;
8876             NOT_REACHED; /* NOTREACHED */
8877 #undef ST
8878
8879         case LNBREAK: /* \R */
8880             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8881                 locinput += n;
8882             } else
8883                 sayNO;
8884             break;
8885
8886         default:
8887             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8888                           PTR2UV(scan), OP(scan));
8889             Perl_croak(aTHX_ "regexp memory corruption");
8890
8891         /* this is a point to jump to in order to increment
8892          * locinput by one character */
8893           increment_locinput:
8894             assert(!NEXTCHR_IS_EOS);
8895             if (utf8_target) {
8896                 locinput += PL_utf8skip[nextchr];
8897                 /* locinput is allowed to go 1 char off the end (signifying
8898                  * EOS), but not 2+ */
8899                 if (locinput > reginfo->strend)
8900                     sayNO;
8901             }
8902             else
8903                 locinput++;
8904             break;
8905
8906         } /* end switch */
8907
8908         /* switch break jumps here */
8909         scan = next; /* prepare to execute the next op and ... */
8910         continue;    /* ... jump back to the top, reusing st */
8911         /* NOTREACHED */
8912
8913       push_yes_state:
8914         /* push a state that backtracks on success */
8915         st->u.yes.prev_yes_state = yes_state;
8916         yes_state = st;
8917         /* FALLTHROUGH */
8918       push_state:
8919         /* push a new regex state, then continue at scan  */
8920         {
8921             regmatch_state *newst;
8922
8923             DEBUG_STACK_r({
8924                 regmatch_state *cur = st;
8925                 regmatch_state *curyes = yes_state;
8926                 U32 i;
8927                 regmatch_slab *slab = PL_regmatch_slab;
8928                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8929                     if (cur < SLAB_FIRST(slab)) {
8930                         slab = slab->prev;
8931                         cur = SLAB_LAST(slab);
8932                     }
8933                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8934                         depth,
8935                         i ? "    " : "push",
8936                         depth - i, PL_reg_name[cur->resume_state],
8937                         (curyes == cur) ? "yes" : ""
8938                     );
8939                     if (curyes == cur)
8940                         curyes = cur->u.yes.prev_yes_state;
8941                 }
8942             } else
8943                 DEBUG_STATE_pp("push")
8944             );
8945             depth++;
8946             st->locinput = locinput;
8947             newst = st+1;
8948             if (newst >  SLAB_LAST(PL_regmatch_slab))
8949                 newst = S_push_slab(aTHX);
8950             PL_regmatch_state = newst;
8951
8952             locinput = pushinput;
8953             st = newst;
8954             continue;
8955             /* NOTREACHED */
8956         }
8957     }
8958 #ifdef SOLARIS_BAD_OPTIMIZER
8959 #  undef PL_charclass
8960 #endif
8961
8962     /*
8963     * We get here only if there's trouble -- normally "case END" is
8964     * the terminating point.
8965     */
8966     Perl_croak(aTHX_ "corrupted regexp pointers");
8967     NOT_REACHED; /* NOTREACHED */
8968
8969   yes:
8970     if (yes_state) {
8971         /* we have successfully completed a subexpression, but we must now
8972          * pop to the state marked by yes_state and continue from there */
8973         assert(st != yes_state);
8974 #ifdef DEBUGGING
8975         while (st != yes_state) {
8976             st--;
8977             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8978                 PL_regmatch_slab = PL_regmatch_slab->prev;
8979                 st = SLAB_LAST(PL_regmatch_slab);
8980             }
8981             DEBUG_STATE_r({
8982                 if (no_final) {
8983                     DEBUG_STATE_pp("pop (no final)");
8984                 } else {
8985                     DEBUG_STATE_pp("pop (yes)");
8986                 }
8987             });
8988             depth--;
8989         }
8990 #else
8991         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8992             || yes_state > SLAB_LAST(PL_regmatch_slab))
8993         {
8994             /* not in this slab, pop slab */
8995             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8996             PL_regmatch_slab = PL_regmatch_slab->prev;
8997             st = SLAB_LAST(PL_regmatch_slab);
8998         }
8999         depth -= (st - yes_state);
9000 #endif
9001         st = yes_state;
9002         yes_state = st->u.yes.prev_yes_state;
9003         PL_regmatch_state = st;
9004
9005         if (no_final)
9006             locinput= st->locinput;
9007         state_num = st->resume_state + no_final;
9008         goto reenter_switch;
9009     }
9010
9011     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9012                           PL_colors[4], PL_colors[5]));
9013
9014     if (reginfo->info_aux_eval) {
9015         /* each successfully executed (?{...}) block does the equivalent of
9016          *   local $^R = do {...}
9017          * When popping the save stack, all these locals would be undone;
9018          * bypass this by setting the outermost saved $^R to the latest
9019          * value */
9020         /* I dont know if this is needed or works properly now.
9021          * see code related to PL_replgv elsewhere in this file.
9022          * Yves
9023          */
9024         if (oreplsv != GvSV(PL_replgv))
9025             sv_setsv(oreplsv, GvSV(PL_replgv));
9026     }
9027     result = 1;
9028     goto final_exit;
9029
9030   no:
9031     DEBUG_EXECUTE_r(
9032         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9033             depth,
9034             PL_colors[4], PL_colors[5])
9035         );
9036
9037   no_silent:
9038     if (no_final) {
9039         if (yes_state) {
9040             goto yes;
9041         } else {
9042             goto final_exit;
9043         }
9044     }
9045     if (depth) {
9046         /* there's a previous state to backtrack to */
9047         st--;
9048         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9049             PL_regmatch_slab = PL_regmatch_slab->prev;
9050             st = SLAB_LAST(PL_regmatch_slab);
9051         }
9052         PL_regmatch_state = st;
9053         locinput= st->locinput;
9054
9055         DEBUG_STATE_pp("pop");
9056         depth--;
9057         if (yes_state == st)
9058             yes_state = st->u.yes.prev_yes_state;
9059
9060         state_num = st->resume_state + 1; /* failure = success + 1 */
9061         PERL_ASYNC_CHECK();
9062         goto reenter_switch;
9063     }
9064     result = 0;
9065
9066   final_exit:
9067     if (rex->intflags & PREGf_VERBARG_SEEN) {
9068         SV *sv_err = get_sv("REGERROR", 1);
9069         SV *sv_mrk = get_sv("REGMARK", 1);
9070         if (result) {
9071             sv_commit = &PL_sv_no;
9072             if (!sv_yes_mark)
9073                 sv_yes_mark = &PL_sv_yes;
9074         } else {
9075             if (!sv_commit)
9076                 sv_commit = &PL_sv_yes;
9077             sv_yes_mark = &PL_sv_no;
9078         }
9079         assert(sv_err);
9080         assert(sv_mrk);
9081         sv_setsv(sv_err, sv_commit);
9082         sv_setsv(sv_mrk, sv_yes_mark);
9083     }
9084
9085
9086     if (last_pushed_cv) {
9087         dSP;
9088         /* see "Some notes about MULTICALL" above */
9089         POP_MULTICALL;
9090         PERL_UNUSED_VAR(SP);
9091     }
9092     else
9093         LEAVE_SCOPE(orig_savestack_ix);
9094
9095     assert(!result ||  locinput - reginfo->strbeg >= 0);
9096     return result ?  locinput - reginfo->strbeg : -1;
9097 }
9098
9099 /*
9100  - regrepeat - repeatedly match something simple, report how many
9101  *
9102  * What 'simple' means is a node which can be the operand of a quantifier like
9103  * '+', or {1,3}
9104  *
9105  * startposp - pointer a pointer to the start position.  This is updated
9106  *             to point to the byte following the highest successful
9107  *             match.
9108  * p         - the regnode to be repeatedly matched against.
9109  * reginfo   - struct holding match state, such as strend
9110  * max       - maximum number of things to match.
9111  * depth     - (for debugging) backtracking depth.
9112  */
9113 STATIC I32
9114 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9115             regmatch_info *const reginfo, I32 max _pDEPTH)
9116 {
9117     char *scan;     /* Pointer to current position in target string */
9118     I32 c;
9119     char *loceol = reginfo->strend;   /* local version */
9120     I32 hardcount = 0;  /* How many matches so far */
9121     bool utf8_target = reginfo->is_utf8_target;
9122     unsigned int to_complement = 0;  /* Invert the result? */
9123     UV utf8_flags;
9124     _char_class_number classnum;
9125
9126     PERL_ARGS_ASSERT_REGREPEAT;
9127
9128     scan = *startposp;
9129     if (max == REG_INFTY)
9130         max = I32_MAX;
9131     else if (! utf8_target && loceol - scan > max)
9132         loceol = scan + max;
9133
9134     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9135      * to the maximum of how far we should go in it (leaving it set to the real
9136      * end, if the maximum permissible would take us beyond that).  This allows
9137      * us to make the loop exit condition that we haven't gone past <loceol> to
9138      * also mean that we haven't exceeded the max permissible count, saving a
9139      * test each time through the loop.  But it assumes that the OP matches a
9140      * single byte, which is true for most of the OPs below when applied to a
9141      * non-UTF-8 target.  Those relatively few OPs that don't have this
9142      * characteristic will have to compensate.
9143      *
9144      * There is no adjustment for UTF-8 targets, as the number of bytes per
9145      * character varies.  OPs will have to test both that the count is less
9146      * than the max permissible (using <hardcount> to keep track), and that we
9147      * are still within the bounds of the string (using <loceol>.  A few OPs
9148      * match a single byte no matter what the encoding.  They can omit the max
9149      * test if, for the UTF-8 case, they do the adjustment that was skipped
9150      * above.
9151      *
9152      * Thus, the code above sets things up for the common case; and exceptional
9153      * cases need extra work; the common case is to make sure <scan> doesn't
9154      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9155      * count doesn't exceed the maximum permissible */
9156
9157     switch (OP(p)) {
9158     case REG_ANY:
9159         if (utf8_target) {
9160             while (scan < loceol && hardcount < max && *scan != '\n') {
9161                 scan += UTF8SKIP(scan);
9162                 hardcount++;
9163             }
9164         } else {
9165             scan = (char *) memchr(scan, '\n', loceol - scan);
9166             if (! scan) {
9167                 scan = loceol;
9168             }
9169         }
9170         break;
9171     case SANY:
9172         if (utf8_target) {
9173             while (scan < loceol && hardcount < max) {
9174                 scan += UTF8SKIP(scan);
9175                 hardcount++;
9176             }
9177         }
9178         else
9179             scan = loceol;
9180         break;
9181     case EXACTL:
9182         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9183         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9184             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9185         }
9186         /* FALLTHROUGH */
9187     case EXACT:
9188         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9189
9190         c = (U8)*STRING(p);
9191
9192         /* Can use a simple find if the pattern char to match on is invariant
9193          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9194          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9195          * true iff it doesn't matter if the argument is in UTF-8 or not */
9196         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9197             if (utf8_target && loceol - scan > max) {
9198                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9199                  * since here, to match at all, 1 char == 1 byte */
9200                 loceol = scan + max;
9201             }
9202             scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9203         }
9204         else if (reginfo->is_utf8_pat) {
9205             if (utf8_target) {
9206                 STRLEN scan_char_len;
9207
9208                 /* When both target and pattern are UTF-8, we have to do
9209                  * string EQ */
9210                 while (hardcount < max
9211                        && scan < loceol
9212                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9213                        && memEQ(scan, STRING(p), scan_char_len))
9214                 {
9215                     scan += scan_char_len;
9216                     hardcount++;
9217                 }
9218             }
9219             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9220
9221                 /* Target isn't utf8; convert the character in the UTF-8
9222                  * pattern to non-UTF8, and do a simple find */
9223                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9224                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9225             } /* else pattern char is above Latin1, can't possibly match the
9226                  non-UTF-8 target */
9227         }
9228         else {
9229
9230             /* Here, the string must be utf8; pattern isn't, and <c> is
9231              * different in utf8 than not, so can't compare them directly.
9232              * Outside the loop, find the two utf8 bytes that represent c, and
9233              * then look for those in sequence in the utf8 string */
9234             U8 high = UTF8_TWO_BYTE_HI(c);
9235             U8 low = UTF8_TWO_BYTE_LO(c);
9236
9237             while (hardcount < max
9238                     && scan + 1 < loceol
9239                     && UCHARAT(scan) == high
9240                     && UCHARAT(scan + 1) == low)
9241             {
9242                 scan += 2;
9243                 hardcount++;
9244             }
9245         }
9246         break;
9247
9248     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9249         assert(! reginfo->is_utf8_pat);
9250         /* FALLTHROUGH */
9251     case EXACTFAA:
9252         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9253         goto do_exactf;
9254
9255     case EXACTFL:
9256         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9257         utf8_flags = FOLDEQ_LOCALE;
9258         goto do_exactf;
9259
9260     case EXACTF:   /* This node only generated for non-utf8 patterns */
9261         assert(! reginfo->is_utf8_pat);
9262         utf8_flags = 0;
9263         goto do_exactf;
9264
9265     case EXACTFLU8:
9266         if (! utf8_target) {
9267             break;
9268         }
9269         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9270                                     | FOLDEQ_S2_FOLDS_SANE;
9271         goto do_exactf;
9272
9273     case EXACTFU_SS:
9274     case EXACTFU:
9275         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9276
9277       do_exactf: {
9278         int c1, c2;
9279         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9280
9281         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9282
9283         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9284                                         reginfo))
9285         {
9286             if (c1 == CHRTEST_VOID) {
9287                 /* Use full Unicode fold matching */
9288                 char *tmpeol = reginfo->strend;
9289                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9290                 while (hardcount < max
9291                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9292                                              STRING(p), NULL, pat_len,
9293                                              reginfo->is_utf8_pat, utf8_flags))
9294                 {
9295                     scan = tmpeol;
9296                     tmpeol = reginfo->strend;
9297                     hardcount++;
9298                 }
9299             }
9300             else if (utf8_target) {
9301                 if (c1 == c2) {
9302                     while (scan < loceol
9303                            && hardcount < max
9304                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9305                     {
9306                         scan += UTF8SKIP(scan);
9307                         hardcount++;
9308                     }
9309                 }
9310                 else {
9311                     while (scan < loceol
9312                            && hardcount < max
9313                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9314                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9315                     {
9316                         scan += UTF8SKIP(scan);
9317                         hardcount++;
9318                     }
9319                 }
9320             }
9321             else if (c1 == c2) {
9322                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9323             }
9324             else {
9325                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9326                  * a conditional each time through the loop if the characters
9327                  * differ only in a single bit, as is the usual situation */
9328                 U8 c1_c2_bits_differing = c1 ^ c2;
9329
9330                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9331                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9332
9333                     scan = (char *) find_span_end_mask((U8 *) scan,
9334                                                        (U8 *) loceol,
9335                                                        c1 & c1_c2_mask,
9336                                                        c1_c2_mask);
9337                 }
9338                 else {
9339                     while (    scan < loceol
9340                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9341                     {
9342                         scan++;
9343                     }
9344                 }
9345             }
9346         }
9347         break;
9348     }
9349     case ANYOFPOSIXL:
9350     case ANYOFL:
9351         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9352
9353         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9354             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9355         }
9356         /* FALLTHROUGH */
9357     case ANYOFD:
9358     case ANYOF:
9359         if (utf8_target) {
9360             while (hardcount < max
9361                    && scan < loceol
9362                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9363             {
9364                 scan += UTF8SKIP(scan);
9365                 hardcount++;
9366             }
9367         }
9368         else if (ANYOF_FLAGS(p)) {
9369             while (scan < loceol
9370                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9371                 scan++;
9372         }
9373         else {
9374             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9375                 scan++;
9376         }
9377         break;
9378
9379     case ANYOFM:
9380         if (utf8_target && loceol - scan > max) {
9381
9382             /* We didn't adjust <loceol> at the beginning of this routine
9383              * because is UTF-8, but it is actually ok to do so, since here, to
9384              * match, 1 char == 1 byte. */
9385             loceol = scan + max;
9386         }
9387
9388         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9389         break;
9390
9391     case NANYOFM:
9392         if (utf8_target) {
9393             while (     hardcount < max
9394                    &&   scan < loceol
9395                    &&  (*scan & FLAGS(p)) != ARG(p))
9396             {
9397                 scan += UTF8SKIP(scan);
9398                 hardcount++;
9399             }
9400         }
9401         else {
9402             scan = (char *) find_next_masked((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9403         }
9404         break;
9405
9406     case ASCII:
9407         if (utf8_target && loceol - scan > max) {
9408             loceol = scan + max;
9409         }
9410
9411         scan = find_next_non_ascii(scan, loceol, utf8_target);
9412         break;
9413
9414     case NASCII:
9415         if (utf8_target) {
9416             while (     hardcount < max
9417                    &&   scan < loceol
9418                    && ! isASCII_utf8_safe(scan, loceol))
9419             {
9420                 scan += UTF8SKIP(scan);
9421                 hardcount++;
9422             }
9423         }
9424         else {
9425             scan = find_next_ascii(scan, loceol, utf8_target);
9426         }
9427         break;
9428
9429     /* The argument (FLAGS) to all the POSIX node types is the class number */
9430
9431     case NPOSIXL:
9432         to_complement = 1;
9433         /* FALLTHROUGH */
9434
9435     case POSIXL:
9436         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9437         if (! utf8_target) {
9438             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9439                                                                    *scan)))
9440             {
9441                 scan++;
9442             }
9443         } else {
9444             while (hardcount < max && scan < loceol
9445                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9446                                                                   (U8 *) scan,
9447                                                                   (U8 *) loceol)))
9448             {
9449                 scan += UTF8SKIP(scan);
9450                 hardcount++;
9451             }
9452         }
9453         break;
9454
9455     case POSIXD:
9456         if (utf8_target) {
9457             goto utf8_posix;
9458         }
9459         /* FALLTHROUGH */
9460
9461     case POSIXA:
9462         if (utf8_target && loceol - scan > max) {
9463
9464             /* We didn't adjust <loceol> at the beginning of this routine
9465              * because is UTF-8, but it is actually ok to do so, since here, to
9466              * match, 1 char == 1 byte. */
9467             loceol = scan + max;
9468         }
9469         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9470             scan++;
9471         }
9472         break;
9473
9474     case NPOSIXD:
9475         if (utf8_target) {
9476             to_complement = 1;
9477             goto utf8_posix;
9478         }
9479         /* FALLTHROUGH */
9480
9481     case NPOSIXA:
9482         if (! utf8_target) {
9483             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9484                 scan++;
9485             }
9486         }
9487         else {
9488
9489             /* The complement of something that matches only ASCII matches all
9490              * non-ASCII, plus everything in ASCII that isn't in the class. */
9491             while (hardcount < max && scan < loceol
9492                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9493                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9494             {
9495                 scan += UTF8SKIP(scan);
9496                 hardcount++;
9497             }
9498         }
9499         break;
9500
9501     case NPOSIXU:
9502         to_complement = 1;
9503         /* FALLTHROUGH */
9504
9505     case POSIXU:
9506         if (! utf8_target) {
9507             while (scan < loceol && to_complement
9508                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9509             {
9510                 scan++;
9511             }
9512         }
9513         else {
9514           utf8_posix:
9515             classnum = (_char_class_number) FLAGS(p);
9516             switch (classnum) {
9517                 default:
9518                     while (   hardcount < max && scan < loceol
9519                            && to_complement ^ cBOOL(_invlist_contains_cp(
9520                                               PL_XPosix_ptrs[classnum],
9521                                               utf8_to_uvchr_buf((U8 *) scan,
9522                                                                 (U8 *) loceol,
9523                                                                 NULL))))
9524                     {
9525                         scan += UTF8SKIP(scan);
9526                         hardcount++;
9527                     }
9528                     break;
9529
9530                     /* For the classes below, the knowledge of how to handle
9531                      * every code point is compiled in to Perl via a macro.
9532                      * This code is written for making the loops as tight as
9533                      * possible.  It could be refactored to save space instead.
9534                      * */
9535
9536                 case _CC_ENUM_SPACE:
9537                     while (hardcount < max
9538                            && scan < loceol
9539                            && (to_complement
9540                                ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9541                     {
9542                         scan += UTF8SKIP(scan);
9543                         hardcount++;
9544                     }
9545                     break;
9546                 case _CC_ENUM_BLANK:
9547                     while (hardcount < max
9548                            && scan < loceol
9549                            && (to_complement
9550                                 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9551                     {
9552                         scan += UTF8SKIP(scan);
9553                         hardcount++;
9554                     }
9555                     break;
9556                 case _CC_ENUM_XDIGIT:
9557                     while (hardcount < max
9558                            && scan < loceol
9559                            && (to_complement
9560                                ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9561                     {
9562                         scan += UTF8SKIP(scan);
9563                         hardcount++;
9564                     }
9565                     break;
9566                 case _CC_ENUM_VERTSPACE:
9567                     while (hardcount < max
9568                            && scan < loceol
9569                            && (to_complement
9570                                ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9571                     {
9572                         scan += UTF8SKIP(scan);
9573                         hardcount++;
9574                     }
9575                     break;
9576                 case _CC_ENUM_CNTRL:
9577                     while (hardcount < max
9578                            && scan < loceol
9579                            && (to_complement
9580                                ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9581                     {
9582                         scan += UTF8SKIP(scan);
9583                         hardcount++;
9584                     }
9585                     break;
9586             }
9587         }
9588         break;
9589
9590     case LNBREAK:
9591         if (utf8_target) {
9592             while (hardcount < max && scan < loceol &&
9593                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9594                 scan += c;
9595                 hardcount++;
9596             }
9597         } else {
9598             /* LNBREAK can match one or two latin chars, which is ok, but we
9599              * have to use hardcount in this situation, and throw away the
9600              * adjustment to <loceol> done before the switch statement */
9601             loceol = reginfo->strend;
9602             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9603                 scan+=c;
9604                 hardcount++;
9605             }
9606         }
9607         break;
9608
9609     case BOUNDL:
9610     case NBOUNDL:
9611         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9612         /* FALLTHROUGH */
9613     case BOUND:
9614     case BOUNDA:
9615     case BOUNDU:
9616     case EOS:
9617     case GPOS:
9618     case KEEPS:
9619     case NBOUND:
9620     case NBOUNDA:
9621     case NBOUNDU:
9622     case OPFAIL:
9623     case SBOL:
9624     case SEOL:
9625         /* These are all 0 width, so match right here or not at all. */
9626         break;
9627
9628     default:
9629         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9630         NOT_REACHED; /* NOTREACHED */
9631
9632     }
9633
9634     if (hardcount)
9635         c = hardcount;
9636     else
9637         c = scan - *startposp;
9638     *startposp = scan;
9639
9640     DEBUG_r({
9641         GET_RE_DEBUG_FLAGS_DECL;
9642         DEBUG_EXECUTE_r({
9643             SV * const prop = sv_newmortal();
9644             regprop(prog, prop, p, reginfo, NULL);
9645             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9646                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9647         });
9648     });
9649
9650     return(c);
9651 }
9652
9653
9654 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9655 /*
9656 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9657 create a copy so that changes the caller makes won't change the shared one.
9658 If <altsvp> is non-null, will return NULL in it, for back-compat.
9659  */
9660 SV *
9661 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9662 {
9663     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9664
9665     if (altsvp) {
9666         *altsvp = NULL;
9667     }
9668
9669     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9670 }
9671
9672 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9673
9674 /*
9675  - reginclass - determine if a character falls into a character class
9676
9677   n is the ANYOF-type regnode
9678   p is the target string
9679   p_end points to one byte beyond the end of the target string
9680   utf8_target tells whether p is in UTF-8.
9681
9682   Returns true if matched; false otherwise.
9683
9684   Note that this can be a synthetic start class, a combination of various
9685   nodes, so things you think might be mutually exclusive, such as locale,
9686   aren't.  It can match both locale and non-locale
9687
9688  */
9689
9690 STATIC bool
9691 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9692 {
9693     dVAR;
9694     const char flags = ANYOF_FLAGS(n);
9695     bool match = FALSE;
9696     UV c = *p;
9697
9698     PERL_ARGS_ASSERT_REGINCLASS;
9699
9700     /* If c is not already the code point, get it.  Note that
9701      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9702     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9703         STRLEN c_len = 0;
9704         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9705         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9706         if (c_len == (STRLEN)-1) {
9707             _force_out_malformed_utf8_message(p, p_end,
9708                                               utf8n_flags,
9709                                               1 /* 1 means die */ );
9710             NOT_REACHED; /* NOTREACHED */
9711         }
9712         if (     c > 255
9713             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9714             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9715         {
9716             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9717         }
9718     }
9719
9720     /* If this character is potentially in the bitmap, check it */
9721     if (c < NUM_ANYOF_CODE_POINTS) {
9722         if (ANYOF_BITMAP_TEST(n, c))
9723             match = TRUE;
9724         else if ((flags
9725                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9726                   && OP(n) == ANYOFD
9727                   && ! utf8_target
9728                   && ! isASCII(c))
9729         {
9730             match = TRUE;
9731         }
9732         else if (flags & ANYOF_LOCALE_FLAGS) {
9733             if ((flags & ANYOFL_FOLD)
9734                 && c < 256
9735                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9736             {
9737                 match = TRUE;
9738             }
9739             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9740                      && c < 256
9741             ) {
9742
9743                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9744                  * if the class includes the Posix character class given by
9745                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9746                  * complemented Posix class given by int(bit/2).  So we loop
9747                  * through the bits, each time changing whether we complement
9748                  * the result or not.  Suppose for the sake of illustration
9749                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9750                  * is set, it means there is a match for this ANYOF node if the
9751                  * character is in the class given by the expression (0 / 2 = 0
9752                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9753                  * and since 'to_complement' is 0, the result will stay TRUE,
9754                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9755                  * bit 1 is 1.  That means there is a match if the character
9756                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9757                  * but will on bit 1.  On the second iteration 'to_complement'
9758                  * will be 1, so the exclusive or will reverse things, so we
9759                  * are testing for \W.  On the third iteration, 'to_complement'
9760                  * will be 0, and we would be testing for \s; the fourth
9761                  * iteration would test for \S, etc.
9762                  *
9763                  * Note that this code assumes that all the classes are closed
9764                  * under folding.  For example, if a character matches \w, then
9765                  * its fold does too; and vice versa.  This should be true for
9766                  * any well-behaved locale for all the currently defined Posix
9767                  * classes, except for :lower: and :upper:, which are handled
9768                  * by the pseudo-class :cased: which matches if either of the
9769                  * other two does.  To get rid of this assumption, an outer
9770                  * loop could be used below to iterate over both the source
9771                  * character, and its fold (if different) */
9772
9773                 int count = 0;
9774                 int to_complement = 0;
9775
9776                 while (count < ANYOF_MAX) {
9777                     if (ANYOF_POSIXL_TEST(n, count)
9778                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9779                     {
9780                         match = TRUE;
9781                         break;
9782                     }
9783                     count++;
9784                     to_complement ^= 1;
9785                 }
9786             }
9787         }
9788     }
9789
9790
9791     /* If the bitmap didn't (or couldn't) match, and something outside the
9792      * bitmap could match, try that. */
9793     if (!match) {
9794         if (c >= NUM_ANYOF_CODE_POINTS
9795             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9796         {
9797             match = TRUE;       /* Everything above the bitmap matches */
9798         }
9799             /* Here doesn't match everything above the bitmap.  If there is
9800              * some information available beyond the bitmap, we may find a
9801              * match in it.  If so, this is most likely because the code point
9802              * is outside the bitmap range.  But rarely, it could be because of
9803              * some other reason.  If so, various flags are set to indicate
9804              * this possibility.  On ANYOFD nodes, there may be matches that
9805              * happen only when the target string is UTF-8; or for other node
9806              * types, because runtime lookup is needed, regardless of the
9807              * UTF-8ness of the target string.  Finally, under /il, there may
9808              * be some matches only possible if the locale is a UTF-8 one. */
9809         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9810                  && (   c >= NUM_ANYOF_CODE_POINTS
9811                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9812                          && (   UNLIKELY(OP(n) != ANYOFD)
9813                              || (utf8_target && ! isASCII_uni(c)
9814 #                               if NUM_ANYOF_CODE_POINTS > 256
9815                                                                  && c < 256
9816 #                               endif
9817                                 )))
9818                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9819                          && IN_UTF8_CTYPE_LOCALE)))
9820         {
9821             SV* only_utf8_locale = NULL;
9822             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9823                                                        &only_utf8_locale, NULL);
9824             if (sw) {
9825                 U8 utf8_buffer[2];
9826                 U8 * utf8_p;
9827                 if (utf8_target) {
9828                     utf8_p = (U8 *) p;
9829                 } else { /* Convert to utf8 */
9830                     utf8_p = utf8_buffer;
9831                     append_utf8_from_native_byte(*p, &utf8_p);
9832                     utf8_p = utf8_buffer;
9833                 }
9834
9835                 if (swash_fetch(sw, utf8_p, TRUE)) {
9836                     match = TRUE;
9837                 }
9838             }
9839             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9840                 match = _invlist_contains_cp(only_utf8_locale, c);
9841             }
9842         }
9843
9844         if (UNICODE_IS_SUPER(c)
9845             && (flags
9846                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9847             && OP(n) != ANYOFD
9848             && ckWARN_d(WARN_NON_UNICODE))
9849         {
9850             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9851                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9852         }
9853     }
9854
9855 #if ANYOF_INVERT != 1
9856     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9857      * use it, don't */
9858 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9859 #endif
9860
9861     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9862     return (flags & ANYOF_INVERT) ^ match;
9863 }
9864
9865 STATIC U8 *
9866 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9867 {
9868     /* return the position 'off' UTF-8 characters away from 's', forward if
9869      * 'off' >= 0, backwards if negative.  But don't go outside of position
9870      * 'lim', which better be < s  if off < 0 */
9871
9872     PERL_ARGS_ASSERT_REGHOP3;
9873
9874     if (off >= 0) {
9875         while (off-- && s < lim) {
9876             /* XXX could check well-formedness here */
9877             U8 *new_s = s + UTF8SKIP(s);
9878             if (new_s > lim) /* lim may be in the middle of a long character */
9879                 return s;
9880             s = new_s;
9881         }
9882     }
9883     else {
9884         while (off++ && s > lim) {
9885             s--;
9886             if (UTF8_IS_CONTINUED(*s)) {
9887                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9888                     s--;
9889                 if (! UTF8_IS_START(*s)) {
9890                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9891                 }
9892             }
9893             /* XXX could check well-formedness here */
9894         }
9895     }
9896     return s;
9897 }
9898
9899 STATIC U8 *
9900 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9901 {
9902     PERL_ARGS_ASSERT_REGHOP4;
9903
9904     if (off >= 0) {
9905         while (off-- && s < rlim) {
9906             /* XXX could check well-formedness here */
9907             s += UTF8SKIP(s);
9908         }
9909     }
9910     else {
9911         while (off++ && s > llim) {
9912             s--;
9913             if (UTF8_IS_CONTINUED(*s)) {
9914                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9915                     s--;
9916                 if (! UTF8_IS_START(*s)) {
9917                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9918                 }
9919             }
9920             /* XXX could check well-formedness here */
9921         }
9922     }
9923     return s;
9924 }
9925
9926 /* like reghop3, but returns NULL on overrun, rather than returning last
9927  * char pos */
9928
9929 STATIC U8 *
9930 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9931 {
9932     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9933
9934     if (off >= 0) {
9935         while (off-- && s < lim) {
9936             /* XXX could check well-formedness here */
9937             s += UTF8SKIP(s);
9938         }
9939         if (off >= 0)
9940             return NULL;
9941     }
9942     else {
9943         while (off++ && s > lim) {
9944             s--;
9945             if (UTF8_IS_CONTINUED(*s)) {
9946                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9947                     s--;
9948                 if (! UTF8_IS_START(*s)) {
9949                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9950                 }
9951             }
9952             /* XXX could check well-formedness here */
9953         }
9954         if (off <= 0)
9955             return NULL;
9956     }
9957     return s;
9958 }
9959
9960
9961 /* when executing a regex that may have (?{}), extra stuff needs setting
9962    up that will be visible to the called code, even before the current
9963    match has finished. In particular:
9964
9965    * $_ is localised to the SV currently being matched;
9966    * pos($_) is created if necessary, ready to be updated on each call-out
9967      to code;
9968    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9969      isn't set until the current pattern is successfully finished), so that
9970      $1 etc of the match-so-far can be seen;
9971    * save the old values of subbeg etc of the current regex, and  set then
9972      to the current string (again, this is normally only done at the end
9973      of execution)
9974 */
9975
9976 static void
9977 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9978 {
9979     MAGIC *mg;
9980     regexp *const rex = ReANY(reginfo->prog);
9981     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9982
9983     eval_state->rex = rex;
9984
9985     if (reginfo->sv) {
9986         /* Make $_ available to executed code. */
9987         if (reginfo->sv != DEFSV) {
9988             SAVE_DEFSV;
9989             DEFSV_set(reginfo->sv);
9990         }
9991
9992         if (!(mg = mg_find_mglob(reginfo->sv))) {
9993             /* prepare for quick setting of pos */
9994             mg = sv_magicext_mglob(reginfo->sv);
9995             mg->mg_len = -1;
9996         }
9997         eval_state->pos_magic = mg;
9998         eval_state->pos       = mg->mg_len;
9999         eval_state->pos_flags = mg->mg_flags;
10000     }
10001     else
10002         eval_state->pos_magic = NULL;
10003
10004     if (!PL_reg_curpm) {
10005         /* PL_reg_curpm is a fake PMOP that we can attach the current
10006          * regex to and point PL_curpm at, so that $1 et al are visible
10007          * within a /(?{})/. It's just allocated once per interpreter the
10008          * first time its needed */
10009         Newxz(PL_reg_curpm, 1, PMOP);
10010 #ifdef USE_ITHREADS
10011         {
10012             SV* const repointer = &PL_sv_undef;
10013             /* this regexp is also owned by the new PL_reg_curpm, which
10014                will try to free it.  */
10015             av_push(PL_regex_padav, repointer);
10016             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10017             PL_regex_pad = AvARRAY(PL_regex_padav);
10018         }
10019 #endif
10020     }
10021     SET_reg_curpm(reginfo->prog);
10022     eval_state->curpm = PL_curpm;
10023     PL_curpm_under = PL_curpm;
10024     PL_curpm = PL_reg_curpm;
10025     if (RXp_MATCH_COPIED(rex)) {
10026         /*  Here is a serious problem: we cannot rewrite subbeg,
10027             since it may be needed if this match fails.  Thus
10028             $` inside (?{}) could fail... */
10029         eval_state->subbeg     = rex->subbeg;
10030         eval_state->sublen     = rex->sublen;
10031         eval_state->suboffset  = rex->suboffset;
10032         eval_state->subcoffset = rex->subcoffset;
10033 #ifdef PERL_ANY_COW
10034         eval_state->saved_copy = rex->saved_copy;
10035 #endif
10036         RXp_MATCH_COPIED_off(rex);
10037     }
10038     else
10039         eval_state->subbeg = NULL;
10040     rex->subbeg = (char *)reginfo->strbeg;
10041     rex->suboffset = 0;
10042     rex->subcoffset = 0;
10043     rex->sublen = reginfo->strend - reginfo->strbeg;
10044 }
10045
10046
10047 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10048
10049 static void
10050 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10051 {
10052     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10053     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10054     regmatch_slab *s;
10055
10056     Safefree(aux->poscache);
10057
10058     if (eval_state) {
10059
10060         /* undo the effects of S_setup_eval_state() */
10061
10062         if (eval_state->subbeg) {
10063             regexp * const rex = eval_state->rex;
10064             rex->subbeg     = eval_state->subbeg;
10065             rex->sublen     = eval_state->sublen;
10066             rex->suboffset  = eval_state->suboffset;
10067             rex->subcoffset = eval_state->subcoffset;
10068 #ifdef PERL_ANY_COW
10069             rex->saved_copy = eval_state->saved_copy;
10070 #endif
10071             RXp_MATCH_COPIED_on(rex);
10072         }
10073         if (eval_state->pos_magic)
10074         {
10075             eval_state->pos_magic->mg_len = eval_state->pos;
10076             eval_state->pos_magic->mg_flags =
10077                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10078                | (eval_state->pos_flags & MGf_BYTES);
10079         }
10080
10081         PL_curpm = eval_state->curpm;
10082     }
10083
10084     PL_regmatch_state = aux->old_regmatch_state;
10085     PL_regmatch_slab  = aux->old_regmatch_slab;
10086
10087     /* free all slabs above current one - this must be the last action
10088      * of this function, as aux and eval_state are allocated within
10089      * slabs and may be freed here */
10090
10091     s = PL_regmatch_slab->next;
10092     if (s) {
10093         PL_regmatch_slab->next = NULL;
10094         while (s) {
10095             regmatch_slab * const osl = s;
10096             s = s->next;
10097             Safefree(osl);
10098         }
10099     }
10100 }
10101
10102
10103 STATIC void
10104 S_to_utf8_substr(pTHX_ regexp *prog)
10105 {
10106     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10107      * on the converted value */
10108
10109     int i = 1;
10110
10111     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10112
10113     do {
10114         if (prog->substrs->data[i].substr
10115             && !prog->substrs->data[i].utf8_substr) {
10116             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10117             prog->substrs->data[i].utf8_substr = sv;
10118             sv_utf8_upgrade(sv);
10119             if (SvVALID(prog->substrs->data[i].substr)) {
10120                 if (SvTAIL(prog->substrs->data[i].substr)) {
10121                     /* Trim the trailing \n that fbm_compile added last
10122                        time.  */
10123                     SvCUR_set(sv, SvCUR(sv) - 1);
10124                     /* Whilst this makes the SV technically "invalid" (as its
10125                        buffer is no longer followed by "\0") when fbm_compile()
10126                        adds the "\n" back, a "\0" is restored.  */
10127                     fbm_compile(sv, FBMcf_TAIL);
10128                 } else
10129                     fbm_compile(sv, 0);
10130             }
10131             if (prog->substrs->data[i].substr == prog->check_substr)
10132                 prog->check_utf8 = sv;
10133         }
10134     } while (i--);
10135 }
10136
10137 STATIC bool
10138 S_to_byte_substr(pTHX_ regexp *prog)
10139 {
10140     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10141      * on the converted value; returns FALSE if can't be converted. */
10142
10143     int i = 1;
10144
10145     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10146
10147     do {
10148         if (prog->substrs->data[i].utf8_substr
10149             && !prog->substrs->data[i].substr) {
10150             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10151             if (! sv_utf8_downgrade(sv, TRUE)) {
10152                 return FALSE;
10153             }
10154             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10155                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10156                     /* Trim the trailing \n that fbm_compile added last
10157                         time.  */
10158                     SvCUR_set(sv, SvCUR(sv) - 1);
10159                     fbm_compile(sv, FBMcf_TAIL);
10160                 } else
10161                     fbm_compile(sv, 0);
10162             }
10163             prog->substrs->data[i].substr = sv;
10164             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10165                 prog->check_substr = sv;
10166         }
10167     } while (i--);
10168
10169     return TRUE;
10170 }
10171
10172 #ifndef PERL_IN_XSUB_RE
10173
10174 bool
10175 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10176 {
10177     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10178      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10179      * the larger string bounded by 'strbeg' and 'strend'.
10180      *
10181      * 'cp' needs to be assigned (if not a future version of the Unicode
10182      * Standard could make it something that combines with adjacent characters,
10183      * so code using it would then break), and there has to be a GCB break
10184      * before and after the character. */
10185
10186     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10187     const U8 * prev_cp_start;
10188
10189     PERL_ARGS_ASSERT__IS_GRAPHEME;
10190
10191     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10192         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10193     {
10194         /* These are considered graphemes */
10195         return TRUE;
10196     }
10197
10198     /* Otherwise, unassigned code points are forbidden */
10199     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10200                                     _invlist_search(PL_Assigned_invlist, cp))))
10201     {
10202         return FALSE;
10203     }
10204
10205     cp_gcb_val = getGCB_VAL_CP(cp);
10206
10207     /* Find the GCB value of the previous code point in the input */
10208     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10209     if (UNLIKELY(prev_cp_start == s)) {
10210         prev_cp_gcb_val = GCB_EDGE;
10211     }
10212     else {
10213         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10214     }
10215
10216     /* And check that is a grapheme boundary */
10217     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10218                 TRUE /* is UTF-8 encoded */ ))
10219     {
10220         return FALSE;
10221     }
10222
10223     /* Similarly verify there is a break between the current character and the
10224      * following one */
10225     s += UTF8SKIP(s);
10226     if (s >= strend) {
10227         next_cp_gcb_val = GCB_EDGE;
10228     }
10229     else {
10230         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10231     }
10232
10233     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10234 }
10235
10236 /*
10237 =head1 Unicode Support
10238
10239 =for apidoc isSCRIPT_RUN
10240
10241 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10242 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10243 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10244 two degenerate cases given below, this function returns TRUE iff all code
10245 points in it come from any combination of three "scripts" given by the Unicode
10246 "Script Extensions" property: Common, Inherited, and possibly one other.
10247 Additionally all decimal digits must come from the same consecutive sequence of
10248 10.
10249
10250 For example, if all the characters in the sequence are Greek, or Common, or
10251 Inherited, this function will return TRUE, provided any decimal digits in it
10252 are the ASCII digits "0".."9".  For scripts (unlike Greek) that have their own
10253 digits defined this will accept either digits from that set or from 0..9, but
10254 not a combination of the two.  Some scripts, such as Arabic, have more than one
10255 set of digits.  All digits must come from the same set for this function to
10256 return TRUE.
10257
10258 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10259 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10260 C<SCX_INVALID> if the function returns FALSE.
10261
10262 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10263 will be C<SCX_INVALID>.
10264
10265 If the sequence contains a single code point which is unassigned to a character
10266 in the version of Unicode being used, the function will return TRUE, and the
10267 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10268 in the input sequence will result in the function treating the input as not
10269 being a script run.
10270
10271 The returned script will be C<SCX_Inherited> iff all the code points in it are
10272 from the Inherited script.
10273
10274 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10275 it are from the Inherited or Common scripts.
10276
10277 =cut
10278
10279 */
10280
10281 bool
10282 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10283 {
10284     /* Basically, it looks at each character in the sequence to see if the
10285      * above conditions are met; if not it fails.  It uses an inversion map to
10286      * find the enum corresponding to the script of each character.  But this
10287      * is complicated by the fact that a few code points can be in any of
10288      * several scripts.  The data has been constructed so that there are
10289      * additional enum values (all negative) for these situations.  The
10290      * absolute value of those is an index into another table which contains
10291      * pointers to auxiliary tables for each such situation.  Each aux array
10292      * lists all the scripts for the given situation.  There is another,
10293      * parallel, table that gives the number of entries in each aux table.
10294      * These are all defined in charclass_invlists.h */
10295
10296     /* XXX Here are the additional things UTS 39 says could be done:
10297      *
10298      * Forbid sequences of the same nonspacing mark
10299      *
10300      * Check to see that all the characters are in the sets of exemplar
10301      * characters for at least one language in the Unicode Common Locale Data
10302      * Repository [CLDR]. */
10303
10304
10305     /* Things that match /\d/u */
10306     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10307     UV * decimals_array = invlist_array(decimals_invlist);
10308
10309     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10310      * not currently known) */
10311     UV zero_of_run = 0;
10312
10313     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10314     SCX_enum script_of_char = SCX_INVALID;
10315
10316     /* If the script remains not fully determined from iteration to iteration,
10317      * this is the current intersection of the possiblities.  */
10318     SCX_enum * intersection = NULL;
10319     PERL_UINT_FAST8_T intersection_len = 0;
10320
10321     bool retval = TRUE;
10322     SCX_enum * ret_script = NULL;
10323
10324     assert(send >= s);
10325
10326     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10327
10328     /* All code points in 0..255 are either Common or Latin, so must be a
10329      * script run.  We can return immediately unless we need to know which
10330      * script it is. */
10331     if (! utf8_target && LIKELY(send > s)) {
10332         if (ret_script == NULL) {
10333             return TRUE;
10334         }
10335
10336         /* If any character is Latin, the run is Latin */
10337         while (s < send) {
10338             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10339                 *ret_script = SCX_Latin;
10340                 return TRUE;
10341             }
10342         }
10343
10344         /* Here, all are Common */
10345         *ret_script = SCX_Common;
10346         return TRUE;
10347     }
10348
10349     /* Look at each character in the sequence */
10350     while (s < send) {
10351         /* If the current character being examined is a digit, this is the code
10352          * point of the zero for its sequence of 10 */
10353         UV zero_of_char;
10354
10355         UV cp;
10356
10357         /* The code allows all scripts to use the ASCII digits.  This is
10358          * because they are used in commerce even in scripts that have their
10359          * own set.  Hence any ASCII ones found are ok, unless and until a
10360          * digit from another set has already been encountered.  (The other
10361          * digit ranges in Common are not similarly blessed) */
10362         if (UNLIKELY(isDIGIT(*s))) {
10363             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10364                 retval = FALSE;
10365                 break;
10366             }
10367             if (zero_of_run) {
10368                 if (zero_of_run != '0') {
10369                     retval = FALSE;
10370                     break;
10371                 }
10372             }
10373             else {
10374                 zero_of_run = '0';
10375             }
10376             s++;
10377             continue;
10378         }
10379
10380         /* Here, isn't an ASCII digit.  Find the code point of the character */
10381         if (! UTF8_IS_INVARIANT(*s)) {
10382             Size_t len;
10383             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10384             s += len;
10385         }
10386         else {
10387             cp = *(s++);
10388         }
10389
10390         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10391          * digit in that script.  We can skip the rest of this code for this
10392          * character. */
10393         if (UNLIKELY(   zero_of_run
10394                      && cp >= zero_of_run
10395                      && cp - zero_of_run <= 9))
10396         {
10397             continue;
10398         }
10399
10400         /* Find the character's script.  The correct values are hard-coded here
10401          * for small-enough code points. */
10402         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10403                                unlikely to change */
10404             if (       cp > 255
10405                 || (   isALPHA_L1(cp)
10406                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10407             {
10408                 script_of_char = SCX_Latin;
10409             }
10410             else {
10411                 script_of_char = SCX_Common;
10412             }
10413         }
10414         else {
10415             script_of_char = _Perl_SCX_invmap[
10416                                        _invlist_search(PL_SCX_invlist, cp)];
10417         }
10418
10419         /* We arbitrarily accept a single unassigned character, but not in
10420          * combination with anything else, and not a run of them. */
10421         if (   UNLIKELY(script_of_run == SCX_Unknown)
10422             || UNLIKELY(   script_of_run != SCX_INVALID
10423                         && script_of_char == SCX_Unknown))
10424         {
10425             retval = FALSE;
10426             break;
10427         }
10428
10429         /* For the first character, or the run is inherited, the run's script
10430          * is set to the char's */
10431         if (   UNLIKELY(script_of_run == SCX_INVALID)
10432             || UNLIKELY(script_of_run == SCX_Inherited))
10433         {
10434             script_of_run = script_of_char;
10435         }
10436
10437         /* For the character's script to be Unknown, it must be the first
10438          * character in the sequence (for otherwise a test above would have
10439          * prevented us from reaching here), and we have set the run's script
10440          * to it.  Nothing further to be done for this character */
10441         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10442             continue;
10443         }
10444
10445         /* We accept 'inherited' script characters currently even at the
10446          * beginning.  (We know that no characters in Inherited are digits, or
10447          * we'd have to check for that) */
10448         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10449             continue;
10450         }
10451
10452         /* If the run so far is Common, and the new character isn't, change the
10453          * run's script to that of this character */
10454         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10455
10456             /* But Common contains several sets of digits.  Only the '0' set
10457              * can be part of another script. */
10458             if (zero_of_run && zero_of_run != '0') {
10459                 retval = FALSE;
10460                 break;
10461             }
10462
10463             script_of_run = script_of_char;
10464         }
10465
10466         /* Now we can see if the script of the character is the same as that of
10467          * the run */
10468         if (LIKELY(script_of_char == script_of_run)) {
10469             /* By far the most common case */
10470             goto scripts_match;
10471         }
10472
10473         /* Here, the script of the run isn't Common.  But characters in Common
10474          * match any script */
10475         if (script_of_char == SCX_Common) {
10476             goto scripts_match;
10477         }
10478
10479 #ifndef HAS_SCX_AUX_TABLES
10480
10481         /* Too early a Unicode version to have a code point belonging to more
10482          * than one script, so, if the scripts don't exactly match, fail */
10483         PERL_UNUSED_VAR(intersection_len);
10484         retval = FALSE;
10485         break;
10486
10487 #else
10488
10489         /* Here there is no exact match between the character's script and the
10490          * run's.  And we've handled the special cases of scripts Unknown,
10491          * Inherited, and Common.
10492          *
10493          * Negative script numbers signify that the value may be any of several
10494          * scripts, and we need to look at auxiliary information to make our
10495          * deterimination.  But if both are non-negative, we can fail now */
10496         if (LIKELY(script_of_char >= 0)) {
10497             const SCX_enum * search_in;
10498             PERL_UINT_FAST8_T search_in_len;
10499             PERL_UINT_FAST8_T i;
10500
10501             if (LIKELY(script_of_run >= 0)) {
10502                 retval = FALSE;
10503                 break;
10504             }
10505
10506             /* Use the previously constructed set of possible scripts, if any.
10507              * */
10508             if (intersection) {
10509                 search_in = intersection;
10510                 search_in_len = intersection_len;
10511             }
10512             else {
10513                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10514                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10515             }
10516
10517             for (i = 0; i < search_in_len; i++) {
10518                 if (search_in[i] == script_of_char) {
10519                     script_of_run = script_of_char;
10520                     goto scripts_match;
10521                 }
10522             }
10523
10524             retval = FALSE;
10525             break;
10526         }
10527         else if (LIKELY(script_of_run >= 0)) {
10528             /* script of character could be one of several, but run is a single
10529              * script */
10530             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10531             const PERL_UINT_FAST8_T search_in_len
10532                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10533             PERL_UINT_FAST8_T i;
10534
10535             for (i = 0; i < search_in_len; i++) {
10536                 if (search_in[i] == script_of_run) {
10537                     script_of_char = script_of_run;
10538                     goto scripts_match;
10539                 }
10540             }
10541
10542             retval = FALSE;
10543             break;
10544         }
10545         else {
10546             /* Both run and char could be in one of several scripts.  If the
10547              * intersection is empty, then this character isn't in this script
10548              * run.  Otherwise, we need to calculate the intersection to use
10549              * for future iterations of the loop, unless we are already at the
10550              * final character */
10551             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10552             const PERL_UINT_FAST8_T char_len
10553                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10554             const SCX_enum * search_run;
10555             PERL_UINT_FAST8_T run_len;
10556
10557             SCX_enum * new_overlap = NULL;
10558             PERL_UINT_FAST8_T i, j;
10559
10560             if (intersection) {
10561                 search_run = intersection;
10562                 run_len = intersection_len;
10563             }
10564             else {
10565                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10566                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10567             }
10568
10569             intersection_len = 0;
10570
10571             for (i = 0; i < run_len; i++) {
10572                 for (j = 0; j < char_len; j++) {
10573                     if (search_run[i] == search_char[j]) {
10574
10575                         /* Here, the script at i,j matches.  That means this
10576                          * character is in the run.  But continue on to find
10577                          * the complete intersection, for the next loop
10578                          * iteration, and for the digit check after it.
10579                          *
10580                          * On the first found common script, we malloc space
10581                          * for the intersection list for the worst case of the
10582                          * intersection, which is the minimum of the number of
10583                          * scripts remaining in each set. */
10584                         if (intersection_len == 0) {
10585                             Newx(new_overlap,
10586                                  MIN(run_len - i, char_len - j),
10587                                  SCX_enum);
10588                         }
10589                         new_overlap[intersection_len++] = search_run[i];
10590                     }
10591                 }
10592             }
10593
10594             /* Here we've looked through everything.  If they have no scripts
10595              * in common, not a run */
10596             if (intersection_len == 0) {
10597                 retval = FALSE;
10598                 break;
10599             }
10600
10601             /* If there is only a single script in common, set to that.
10602              * Otherwise, use the intersection going forward */
10603             Safefree(intersection);
10604             intersection = NULL;
10605             if (intersection_len == 1) {
10606                 script_of_run = script_of_char = new_overlap[0];
10607                 Safefree(new_overlap);
10608                 new_overlap = NULL;
10609             }
10610             else {
10611                 intersection = new_overlap;
10612             }
10613         }
10614
10615 #endif
10616
10617   scripts_match:
10618
10619         /* Here, the script of the character is compatible with that of the
10620          * run.  That means that in most cases, it continues the script run.
10621          * Either it and the run match exactly, or one or both can be in any of
10622          * several scripts, and the intersection is not empty.  However, if the
10623          * character is a decimal digit, it could still mean failure if it is
10624          * from the wrong sequence of 10.  So, we need to look at if it's a
10625          * digit.  We've already handled the 10 decimal digits, and the next
10626          * lowest one is this one: */
10627         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10628             continue;   /* Not a digit; this character is part of the run */
10629         }
10630
10631         /* If we have a definitive '0' for the script of this character, we
10632          * know that for this to be a digit, it must be in the range of +0..+9
10633          * of that zero. */
10634         if (   script_of_char >= 0
10635             && (zero_of_char = script_zeros[script_of_char]))
10636         {
10637             if (   cp < zero_of_char
10638                 || cp > zero_of_char + 9)
10639             {
10640                 continue;   /* Not a digit; this character is part of the run
10641                              */
10642             }
10643
10644         }
10645         else {  /* Need to look up if this character is a digit or not */
10646             SSize_t index_of_zero_of_char;
10647             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10648             if (     UNLIKELY(index_of_zero_of_char < 0)
10649                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10650             {
10651                 continue;   /* Not a digit; this character is part of the run.
10652                              */
10653             }
10654
10655             zero_of_char = decimals_array[index_of_zero_of_char];
10656         }
10657
10658         /* Here, the character is a decimal digit, and the zero of its sequence
10659          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10660          * they better be the same. */
10661         if (zero_of_run) {
10662             if (zero_of_run != zero_of_char) {
10663                 retval = FALSE;
10664                 break;
10665             }
10666         }
10667         else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
10668
10669             /* Here, the script run isn't Common, but the current digit is in
10670              * Common, and isn't '0'-'9' (those were handled earlier).   Only
10671              * '0'-'9' are acceptable in non-Common scripts. */
10672             retval = FALSE;
10673             break;
10674         }
10675         else {  /* Otherwise we now have a zero for this run */
10676             zero_of_run = zero_of_char;
10677         }
10678     } /* end of looping through CLOSESR text */
10679
10680     Safefree(intersection);
10681
10682     if (ret_script != NULL) {
10683         if (retval) {
10684             *ret_script = script_of_run;
10685         }
10686         else {
10687             *ret_script = SCX_INVALID;
10688         }
10689     }
10690
10691     return retval;
10692 }
10693
10694 #endif /* ifndef PERL_IN_XSUB_RE */
10695
10696 /*
10697  * ex: set ts=8 sts=4 sw=4 et:
10698  */