regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only if 'c', the character being looke-up, is an invariant under
 111  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 112  * everything matchable is straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 114                                 ? reginclass(prog,p,c,c+1,u)                \
 115                                 : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 123
 124 #define HOPc(pos,off) \
 125         (char *)(reginfo->is_utf8_target \
 126             ? reghop3((U8*)pos, off, \
 127                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 128             : (U8*)(pos + off))
 129
 130 #define HOPBACKc(pos, off) \
 131         (char*)(reginfo->is_utf8_target \
 132             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
 133             : (pos - off >= reginfo->strbeg)    \
 134                 ? (U8*)pos - off                \
 135                 : NULL)
 136
 137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 139
 140 /* lim must be +ve. Returns NULL on overshoot */
 141 #define HOPMAYBE3(pos,off,lim) \
 142         (reginfo->is_utf8_target                        \
 143             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 144             : ((U8*)pos + off <= lim)                   \
 145                 ? (U8*)pos + off                        \
 146                 : NULL)
 147
 148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 149  * off must be >=0; args should be vars rather than expressions */
 150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 151     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 152     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 153
 154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 155     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 156     : (U8*)(pos + off))
 157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 158
 159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 160 #define NEXTCHR_IS_EOS (nextchr < 0)
 161
 162 #define SET_nextchr \
 163     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 164
 165 #define SET_locinput(p) \
 166     locinput = (p);  \
 167     SET_nextchr
 168
 169
 170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 171         if (!swash_ptr) {                                                     \
 172             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 173             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 174                                          1, 0, invlist, &flags);              \
 175             assert(swash_ptr);                                                \
 176         }                                                                     \
 177     } STMT_END
 178
 179 /* If in debug mode, we test that a known character properly matches */
 180 #ifdef DEBUGGING
 181 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 182                                           property_name,                      \
 183                                           invlist,                            \
 184                                           utf8_char_in_property)              \
 185         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 186         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 187 #else
 188 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 189                                           property_name,                      \
 190                                           invlist,                            \
 191                                           utf8_char_in_property)              \
 192         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 193 #endif
 194
 195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 196                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 197                                         "",                                   \
 198                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 199                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 200
 201 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 203
 204 /* for use after a quantifier and before an EXACT-like node -- japhy */
 205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 206  *
 207  * NOTE that *nothing* that affects backtracking should be in here, specifically
 208  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 209  * node that is in between two EXACT like nodes when ascertaining what the required
 210  * "follow" character is. This should probably be moved to regex compile time
 211  * although it may be done at run time beause of the REF possibility - more
 212  * investigation required. -- demerphq
 213 */
 214 #define JUMPABLE(rn) (                                                             \
 215     OP(rn) == OPEN ||                                                              \
 216     (OP(rn) == CLOSE &&                                                            \
 217      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 218     OP(rn) == EVAL ||                                                              \
 219     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 220     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 221     OP(rn) == KEEPS ||                                                             \
 222     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 223 )
 224 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 225
 226 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 227
 228 #if 0
 229 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 230    we don't need this definition.  XXX These are now out-of-sync*/
 231 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 232 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 233 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 234
 235 #else
 236 /* ... so we use this as its faster. */
 237 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 238 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 239 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 240 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 241
 242 #endif
 243
 244 /*
 245   Search for mandatory following text node; for lookahead, the text must
 246   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 247 */
 248 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 249     while (JUMPABLE(rn)) { \
 250         const OPCODE type = OP(rn); \
 251         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 252             rn = NEXTOPER(NEXTOPER(rn)); \
 253         else if (type == PLUS) \
 254             rn = NEXTOPER(rn); \
 255         else if (type == IFMATCH) \
 256             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 257         else rn += NEXT_OFF(rn); \
 258     } \
 259 } STMT_END
 260
 261 #define SLAB_FIRST(s) (&(s)->states[0])
 262 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 263
 264 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 265 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 266 static regmatch_state * S_push_slab(pTHX);
 267
 268 #define REGCP_PAREN_ELEMS 3
 269 #define REGCP_OTHER_ELEMS 3
 270 #define REGCP_FRAME_ELEMS 1
 271 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 272  * are needed for the regexp context stack bookkeeping. */
 273
 274 STATIC CHECKPOINT
 275 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 276 {
 277     const int retval = PL_savestack_ix;
 278     const int paren_elems_to_push =
 279                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 280     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 281     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 282     I32 p;
 283     GET_RE_DEBUG_FLAGS_DECL;
 284
 285     PERL_ARGS_ASSERT_REGCPPUSH;
 286
 287     if (paren_elems_to_push < 0)
 288         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 289                    (int)paren_elems_to_push, (int)maxopenparen,
 290                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 291
 292     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 293         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 294                    " out of range (%lu-%ld)",
 295                    total_elems,
 296                    (unsigned long)maxopenparen,
 297                    (long)parenfloor);
 298
 299     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 300
 301     DEBUG_BUFFERS_r(
 302         if ((int)maxopenparen > (int)parenfloor)
 303             Perl_re_exec_indentf( aTHX_
 304                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 305                 depth,
 306                 PTR2UV(rex),
 307                 PTR2UV(rex->offs)
 308             );
 309     );
 310     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 311 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 312         SSPUSHIV(rex->offs[p].end);
 313         SSPUSHIV(rex->offs[p].start);
 314         SSPUSHINT(rex->offs[p].start_tmp);
 315         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 316             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 317             depth,
 318             (UV)p,
 319             (IV)rex->offs[p].start,
 320             (IV)rex->offs[p].start_tmp,
 321             (IV)rex->offs[p].end
 322         ));
 323     }
 324 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 325     SSPUSHINT(maxopenparen);
 326     SSPUSHINT(rex->lastparen);
 327     SSPUSHINT(rex->lastcloseparen);
 328     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 329
 330     return retval;
 331 }
 332
 333 /* These are needed since we do not localize EVAL nodes: */
 334 #define REGCP_SET(cp)                                           \
 335     DEBUG_STATE_r(                                              \
 336         Perl_re_exec_indentf( aTHX_                                         \
 337             "Setting an EVAL scope, savestack=%"IVdf",\n",      \
 338             depth, (IV)PL_savestack_ix                          \
 339         )                                                       \
 340     );                                                          \
 341     cp = PL_savestack_ix
 342
 343 #define REGCP_UNWIND(cp)                                        \
 344     DEBUG_STATE_r(                                              \
 345         if (cp != PL_savestack_ix)                              \
 346             Perl_re_exec_indentf( aTHX_                                     \
 347                 "Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n",\
 348                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 349             )                                                   \
 350     );                                                          \
 351     regcpblow(cp)
 352
 353 #define UNWIND_PAREN(lp, lcp)               \
 354     for (n = rex->lastparen; n > lp; n--)   \
 355         rex->offs[n].end = -1;              \
 356     rex->lastparen = n;                     \
 357     rex->lastcloseparen = lcp;
 358
 359
 360 STATIC void
 361 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 362 {
 363     UV i;
 364     U32 paren;
 365     GET_RE_DEBUG_FLAGS_DECL;
 366
 367     PERL_ARGS_ASSERT_REGCPPOP;
 368
 369     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 370     i = SSPOPUV;
 371     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 372     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 373     rex->lastcloseparen = SSPOPINT;
 374     rex->lastparen = SSPOPINT;
 375     *maxopenparen_p = SSPOPINT;
 376
 377     i -= REGCP_OTHER_ELEMS;
 378     /* Now restore the parentheses context. */
 379     DEBUG_BUFFERS_r(
 380         if (i || rex->lastparen + 1 <= rex->nparens)
 381             Perl_re_exec_indentf( aTHX_
 382                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 383                 depth,
 384                 PTR2UV(rex),
 385                 PTR2UV(rex->offs)
 386             );
 387     );
 388     paren = *maxopenparen_p;
 389     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 390         SSize_t tmps;
 391         rex->offs[paren].start_tmp = SSPOPINT;
 392         rex->offs[paren].start = SSPOPIV;
 393         tmps = SSPOPIV;
 394         if (paren <= rex->lastparen)
 395             rex->offs[paren].end = tmps;
 396         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 397             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 398             depth,
 399             (UV)paren,
 400             (IV)rex->offs[paren].start,
 401             (IV)rex->offs[paren].start_tmp,
 402             (IV)rex->offs[paren].end,
 403             (paren > rex->lastparen ? "(skipped)" : ""));
 404         );
 405         paren--;
 406     }
 407 #if 1
 408     /* It would seem that the similar code in regtry()
 409      * already takes care of this, and in fact it is in
 410      * a better location to since this code can #if 0-ed out
 411      * but the code in regtry() is needed or otherwise tests
 412      * requiring null fields (pat.t#187 and split.t#{13,14}
 413      * (as of patchlevel 7877)  will fail.  Then again,
 414      * this code seems to be necessary or otherwise
 415      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 416      * --jhi updated by dapm */
 417     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 418         if (i > *maxopenparen_p)
 419             rex->offs[i].start = -1;
 420         rex->offs[i].end = -1;
 421         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 422             "    \\%"UVuf": %s   ..-1 undeffing\n",
 423             depth,
 424             (UV)i,
 425             (i > *maxopenparen_p) ? "-1" : "  "
 426         ));
 427     }
 428 #endif
 429 }
 430
 431 /* restore the parens and associated vars at savestack position ix,
 432  * but without popping the stack */
 433
 434 STATIC void
 435 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 436 {
 437     I32 tmpix = PL_savestack_ix;
 438     PERL_ARGS_ASSERT_REGCP_RESTORE;
 439
 440     PL_savestack_ix = ix;
 441     regcppop(rex, maxopenparen_p);
 442     PL_savestack_ix = tmpix;
 443 }
 444
 445 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 446
 447 STATIC bool
 448 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 449 {
 450     /* Returns a boolean as to whether or not 'character' is a member of the
 451      * Posix character class given by 'classnum' that should be equivalent to a
 452      * value in the typedef '_char_class_number'.
 453      *
 454      * Ideally this could be replaced by a just an array of function pointers
 455      * to the C library functions that implement the macros this calls.
 456      * However, to compile, the precise function signatures are required, and
 457      * these may vary from platform to to platform.  To avoid having to figure
 458      * out what those all are on each platform, I (khw) am using this method,
 459      * which adds an extra layer of function call overhead (unless the C
 460      * optimizer strips it away).  But we don't particularly care about
 461      * performance with locales anyway. */
 462
 463     switch ((_char_class_number) classnum) {
 464         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 465         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 466         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 467         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 468         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 469                                         || isUPPER_LC(character);
 470         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 471         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 472         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 473         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 474         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 475         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 476         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 477         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 478         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 479         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 480         default:    /* VERTSPACE should never occur in locales */
 481             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 482     }
 483
 484     NOT_REACHED; /* NOTREACHED */
 485     return FALSE;
 486 }
 487
 488 STATIC bool
 489 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 490 {
 491     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 492      * 'character' is a member of the Posix character class given by 'classnum'
 493      * that should be equivalent to a value in the typedef
 494      * '_char_class_number'.
 495      *
 496      * This just calls isFOO_lc on the code point for the character if it is in
 497      * the range 0-255.  Outside that range, all characters use Unicode
 498      * rules, ignoring any locale.  So use the Unicode function if this class
 499      * requires a swash, and use the Unicode macro otherwise. */
 500
 501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 502
 503     if (UTF8_IS_INVARIANT(*character)) {
 504         return isFOO_lc(classnum, *character);
 505     }
 506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 507         return isFOO_lc(classnum,
 508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 509     }
 510
 511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 512
 513     if (classnum < _FIRST_NON_SWASH_CC) {
 514
 515         /* Initialize the swash unless done already */
 516         if (! PL_utf8_swash_ptrs[classnum]) {
 517             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 518             PL_utf8_swash_ptrs[classnum] =
 519                     _core_swash_init("utf8",
 520                                      "",
 521                                      &PL_sv_undef, 1, 0,
 522                                      PL_XPosix_ptrs[classnum], &flags);
 523         }
 524
 525         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 526                                  character,
 527                                  TRUE /* is UTF */ ));
 528     }
 529
 530     switch ((_char_class_number) classnum) {
 531         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 532         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 533         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 534         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 535         default:                 break;
 536     }
 537
 538     return FALSE; /* Things like CNTRL are always below 256 */
 539 }
 540
 541 /*
 542  * pregexec and friends
 543  */
 544
 545 #ifndef PERL_IN_XSUB_RE
 546 /*
 547  - pregexec - match a regexp against a string
 548  */
 549 I32
 550 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 551          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 552 /* stringarg: the point in the string at which to begin matching */
 553 /* strend:    pointer to null at end of string */
 554 /* strbeg:    real beginning of string */
 555 /* minend:    end of match must be >= minend bytes after stringarg. */
 556 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 557  *            itself is accessed via the pointers above */
 558 /* nosave:    For optimizations. */
 559 {
 560     PERL_ARGS_ASSERT_PREGEXEC;
 561
 562     return
 563         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 564                       nosave ? 0 : REXEC_COPY_STR);
 565 }
 566 #endif
 567
 568
 569
 570 /* re_intuit_start():
 571  *
 572  * Based on some optimiser hints, try to find the earliest position in the
 573  * string where the regex could match.
 574  *
 575  *   rx:     the regex to match against
 576  *   sv:     the SV being matched: only used for utf8 flag; the string
 577  *           itself is accessed via the pointers below. Note that on
 578  *           something like an overloaded SV, SvPOK(sv) may be false
 579  *           and the string pointers may point to something unrelated to
 580  *           the SV itself.
 581  *   strbeg: real beginning of string
 582  *   strpos: the point in the string at which to begin matching
 583  *   strend: pointer to the byte following the last char of the string
 584  *   flags   currently unused; set to 0
 585  *   data:   currently unused; set to NULL
 586  *
 587  * The basic idea of re_intuit_start() is to use some known information
 588  * about the pattern, namely:
 589  *
 590  *   a) the longest known anchored substring (i.e. one that's at a
 591  *      constant offset from the beginning of the pattern; but not
 592  *      necessarily at a fixed offset from the beginning of the
 593  *      string);
 594  *   b) the longest floating substring (i.e. one that's not at a constant
 595  *      offset from the beginning of the pattern);
 596  *   c) Whether the pattern is anchored to the string; either
 597  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 598  *      or anchored to pos(): /\G/;
 599  *   d) A start class: a real or synthetic character class which
 600  *      represents which characters are legal at the start of the pattern;
 601  *
 602  * to either quickly reject the match, or to find the earliest position
 603  * within the string at which the pattern might match, thus avoiding
 604  * running the full NFA engine at those earlier locations, only to
 605  * eventually fail and retry further along.
 606  *
 607  * Returns NULL if the pattern can't match, or returns the address within
 608  * the string which is the earliest place the match could occur.
 609  *
 610  * The longest of the anchored and floating substrings is called 'check'
 611  * and is checked first. The other is called 'other' and is checked
 612  * second. The 'other' substring may not be present.  For example,
 613  *
 614  *    /(abc|xyz)ABC\d{0,3}DEFG/
 615  *
 616  * will have
 617  *
 618  *   check substr (float)    = "DEFG", offset 6..9 chars
 619  *   other substr (anchored) = "ABC",  offset 3..3 chars
 620  *   stclass = [ax]
 621  *
 622  * Be aware that during the course of this function, sometimes 'anchored'
 623  * refers to a substring being anchored relative to the start of the
 624  * pattern, and sometimes to the pattern itself being anchored relative to
 625  * the string. For example:
 626  *
 627  *   /\dabc/:   "abc" is anchored to the pattern;
 628  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 629  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 630  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 631  *                    but the pattern is anchored to the string.
 632  */
 633
 634 char *
 635 Perl_re_intuit_start(pTHX_
 636                     REGEXP * const rx,
 637                     SV *sv,
 638                     const char * const strbeg,
 639                     char *strpos,
 640                     char *strend,
 641                     const U32 flags,
 642                     re_scream_pos_data *data)
 643 {
 644     struct regexp *const prog = ReANY(rx);
 645     SSize_t start_shift = prog->check_offset_min;
 646     /* Should be nonnegative! */
 647     SSize_t end_shift   = 0;
 648     /* current lowest pos in string where the regex can start matching */
 649     char *rx_origin = strpos;
 650     SV *check;
 651     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 652     U8   other_ix = 1 - prog->substrs->check_ix;
 653     bool ml_anch = 0;
 654     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 655     char *check_at = NULL;              /* check substr found at this pos */
 656     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 657     RXi_GET_DECL(prog,progi);
 658     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 659     regmatch_info *const reginfo = &reginfo_buf;
 660     GET_RE_DEBUG_FLAGS_DECL;
 661
 662     PERL_ARGS_ASSERT_RE_INTUIT_START;
 663     PERL_UNUSED_ARG(flags);
 664     PERL_UNUSED_ARG(data);
 665
 666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 667                 "Intuit: trying to determine minimum start position...\n"));
 668
 669     /* for now, assume that all substr offsets are positive. If at some point
 670      * in the future someone wants to do clever things with lookbehind and
 671      * -ve offsets, they'll need to fix up any code in this function
 672      * which uses these offsets. See the thread beginning
 673      * <20140113145929.GF27210@iabyn.com>
 674      */
 675     assert(prog->substrs->data[0].min_offset >= 0);
 676     assert(prog->substrs->data[0].max_offset >= 0);
 677     assert(prog->substrs->data[1].min_offset >= 0);
 678     assert(prog->substrs->data[1].max_offset >= 0);
 679     assert(prog->substrs->data[2].min_offset >= 0);
 680     assert(prog->substrs->data[2].max_offset >= 0);
 681
 682     /* for now, assume that if both present, that the floating substring
 683      * doesn't start before the anchored substring.
 684      * If you break this assumption (e.g. doing better optimisations
 685      * with lookahead/behind), then you'll need to audit the code in this
 686      * function carefully first
 687      */
 688     assert(
 689             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 690               && (prog->float_utf8    || prog->float_substr))
 691            || (prog->float_min_offset >= prog->anchored_offset));
 692
 693     /* byte rather than char calculation for efficiency. It fails
 694      * to quickly reject some cases that can't match, but will reject
 695      * them later after doing full char arithmetic */
 696     if (prog->minlen > strend - strpos) {
 697         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 698                               "  String too short...\n"));
 699         goto fail;
 700     }
 701
 702     RX_MATCH_UTF8_set(rx,utf8_target);
 703     reginfo->is_utf8_target = cBOOL(utf8_target);
 704     reginfo->info_aux = NULL;
 705     reginfo->strbeg = strbeg;
 706     reginfo->strend = strend;
 707     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 708     reginfo->intuit = 1;
 709     /* not actually used within intuit, but zero for safety anyway */
 710     reginfo->poscache_maxiter = 0;
 711
 712     if (utf8_target) {
 713         if ((!prog->anchored_utf8 && prog->anchored_substr)
 714                 || (!prog->float_utf8 && prog->float_substr))
 715             to_utf8_substr(prog);
 716         check = prog->check_utf8;
 717     } else {
 718         if (!prog->check_substr && prog->check_utf8) {
 719             if (! to_byte_substr(prog)) {
 720                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 721             }
 722         }
 723         check = prog->check_substr;
 724     }
 725
 726     /* dump the various substring data */
 727     DEBUG_OPTIMISE_MORE_r({
 728         int i;
 729         for (i=0; i<=2; i++) {
 730             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 731                                   : prog->substrs->data[i].substr);
 732             if (!sv)
 733                 continue;
 734
 735             Perl_re_printf( aTHX_
 736                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 737                 " useful=%"IVdf" utf8=%d [%s]\n",
 738                 i,
 739                 (IV)prog->substrs->data[i].min_offset,
 740                 (IV)prog->substrs->data[i].max_offset,
 741                 (IV)prog->substrs->data[i].end_shift,
 742                 BmUSEFUL(sv),
 743                 utf8_target ? 1 : 0,
 744                 SvPEEK(sv));
 745         }
 746     });
 747
 748     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 749
 750         /* ml_anch: check after \n?
 751          *
 752          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 753          * with /.*.../, these flags will have been added by the
 754          * compiler:
 755          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 756          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 757          */
 758         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 759                    && !(prog->intflags & PREGf_IMPLICIT);
 760
 761         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 762             /* we are only allowed to match at BOS or \G */
 763
 764             /* trivially reject if there's a BOS anchor and we're not at BOS.
 765              *
 766              * Note that we don't try to do a similar quick reject for
 767              * \G, since generally the caller will have calculated strpos
 768              * based on pos() and gofs, so the string is already correctly
 769              * anchored by definition; and handling the exceptions would
 770              * be too fiddly (e.g. REXEC_IGNOREPOS).
 771              */
 772             if (   strpos != strbeg
 773                 && (prog->intflags & PREGf_ANCH_SBOL))
 774             {
 775                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 776                                 "  Not at start...\n"));
 777                 goto fail;
 778             }
 779
 780             /* in the presence of an anchor, the anchored (relative to the
 781              * start of the regex) substr must also be anchored relative
 782              * to strpos. So quickly reject if substr isn't found there.
 783              * This works for \G too, because the caller will already have
 784              * subtracted gofs from pos, and gofs is the offset from the
 785              * \G to the start of the regex. For example, in /.abc\Gdef/,
 786              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 787              * caller will have set strpos=pos()-4; we look for the substr
 788              * at position pos()-4+1, which lines up with the "a" */
 789
 790             if (prog->check_offset_min == prog->check_offset_max) {
 791                 /* Substring at constant offset from beg-of-str... */
 792                 SSize_t slen = SvCUR(check);
 793                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 794
 795                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 796                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 797                     (IV)prog->check_offset_min));
 798
 799                 if (SvTAIL(check)) {
 800                     /* In this case, the regex is anchored at the end too.
 801                      * Unless it's a multiline match, the lengths must match
 802                      * exactly, give or take a \n.  NB: slen >= 1 since
 803                      * the last char of check is \n */
 804                     if (!multiline
 805                         && (   strend - s > slen
 806                             || strend - s < slen - 1
 807                             || (strend - s == slen && strend[-1] != '\n')))
 808                     {
 809                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 810                                             "  String too long...\n"));
 811                         goto fail_finish;
 812                     }
 813                     /* Now should match s[0..slen-2] */
 814                     slen--;
 815                 }
 816                 if (slen && (*SvPVX_const(check) != *s
 817                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 818                 {
 819                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 820                                     "  String not equal...\n"));
 821                     goto fail_finish;
 822                 }
 823
 824                 check_at = s;
 825                 goto success_at_start;
 826             }
 827         }
 828     }
 829
 830     end_shift = prog->check_end_shift;
 831
 832 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 833     if (end_shift < 0)
 834         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 835                    (IV)end_shift, RX_PRECOMP(prog));
 836 #endif
 837
 838   restart:
 839
 840     /* This is the (re)entry point of the main loop in this function.
 841      * The goal of this loop is to:
 842      * 1) find the "check" substring in the region rx_origin..strend
 843      *    (adjusted by start_shift / end_shift). If not found, reject
 844      *    immediately.
 845      * 2) If it exists, look for the "other" substr too if defined; for
 846      *    example, if the check substr maps to the anchored substr, then
 847      *    check the floating substr, and vice-versa. If not found, go
 848      *    back to (1) with rx_origin suitably incremented.
 849      * 3) If we find an rx_origin position that doesn't contradict
 850      *    either of the substrings, then check the possible additional
 851      *    constraints on rx_origin of /^.../m or a known start class.
 852      *    If these fail, then depending on which constraints fail, jump
 853      *    back to here, or to various other re-entry points further along
 854      *    that skip some of the first steps.
 855      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 856      *    substring. If the start position was determined to be at the
 857      *    beginning of the string  - so, not rejected, but not optimised,
 858      *    since we have to run regmatch from position 0 - decrement the
 859      *    BmUSEFUL() count. Otherwise increment it.
 860      */
 861
 862
 863     /* first, look for the 'check' substring */
 864
 865     {
 866         U8* start_point;
 867         U8* end_point;
 868
 869         DEBUG_OPTIMISE_MORE_r({
 870             Perl_re_printf( aTHX_
 871                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 872                 " Start shift: %"IVdf" End shift %"IVdf
 873                 " Real end Shift: %"IVdf"\n",
 874                 (IV)(rx_origin - strbeg),
 875                 (IV)prog->check_offset_min,
 876                 (IV)start_shift,
 877                 (IV)end_shift,
 878                 (IV)prog->check_end_shift);
 879         });
 880
 881         end_point = HOP3(strend, -end_shift, strbeg);
 882         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 883         if (!start_point)
 884             goto fail_finish;
 885
 886
 887         /* If the regex is absolutely anchored to either the start of the
 888          * string (SBOL) or to pos() (ANCH_GPOS), then
 889          * check_offset_max represents an upper bound on the string where
 890          * the substr could start. For the ANCH_GPOS case, we assume that
 891          * the caller of intuit will have already set strpos to
 892          * pos()-gofs, so in this case strpos + offset_max will still be
 893          * an upper bound on the substr.
 894          */
 895         if (!ml_anch
 896             && prog->intflags & PREGf_ANCH
 897             && prog->check_offset_max != SSize_t_MAX)
 898         {
 899             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 900             const char * const anchor =
 901                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 902
 903             /* do a bytes rather than chars comparison. It's conservative;
 904              * so it skips doing the HOP if the result can't possibly end
 905              * up earlier than the old value of end_point.
 906              */
 907             if ((char*)end_point - anchor > prog->check_offset_max) {
 908                 end_point = HOP3lim((U8*)anchor,
 909                                 prog->check_offset_max,
 910                                 end_point -len)
 911                             + len;
 912             }
 913         }
 914
 915         check_at = fbm_instr( start_point, end_point,
 916                       check, multiline ? FBMrf_MULTILINE : 0);
 917
 918         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 919             "  doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
 920             (IV)((char*)start_point - strbeg),
 921             (IV)((char*)end_point   - strbeg),
 922             (IV)(check_at ? check_at - strbeg : -1)
 923         ));
 924
 925         /* Update the count-of-usability, remove useless subpatterns,
 926             unshift s.  */
 927
 928         DEBUG_EXECUTE_r({
 929             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 930                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 931             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
 932                               (check_at ? "Found" : "Did not find"),
 933                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 934                     ? "anchored" : "floating"),
 935                 quoted,
 936                 RE_SV_TAIL(check),
 937                 (check_at ? " at offset " : "...\n") );
 938         });
 939
 940         if (!check_at)
 941             goto fail_finish;
 942         /* set rx_origin to the minimum position where the regex could start
 943          * matching, given the constraint of the just-matched check substring.
 944          * But don't set it lower than previously.
 945          */
 946
 947         if (check_at - rx_origin > prog->check_offset_max)
 948             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 949         /* Finish the diagnostic message */
 950         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 951             "%ld (rx_origin now %"IVdf")...\n",
 952             (long)(check_at - strbeg),
 953             (IV)(rx_origin - strbeg)
 954         ));
 955     }
 956
 957
 958     /* now look for the 'other' substring if defined */
 959
 960     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 961                     : prog->substrs->data[other_ix].substr)
 962     {
 963         /* Take into account the "other" substring. */
 964         char *last, *last1;
 965         char *s;
 966         SV* must;
 967         struct reg_substr_datum *other;
 968
 969       do_other_substr:
 970         other = &prog->substrs->data[other_ix];
 971
 972         /* if "other" is anchored:
 973          * we've previously found a floating substr starting at check_at.
 974          * This means that the regex origin must lie somewhere
 975          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 976          * and max:                 HOP3(check_at, -check_offset_min)
 977          * (except that min will be >= strpos)
 978          * So the fixed  substr must lie somewhere between
 979          *  HOP3(min, anchored_offset)
 980          *  HOP3(max, anchored_offset) + SvCUR(substr)
 981          */
 982
 983         /* if "other" is floating
 984          * Calculate last1, the absolute latest point where the
 985          * floating substr could start in the string, ignoring any
 986          * constraints from the earlier fixed match. It is calculated
 987          * as follows:
 988          *
 989          * strend - prog->minlen (in chars) is the absolute latest
 990          * position within the string where the origin of the regex
 991          * could appear. The latest start point for the floating
 992          * substr is float_min_offset(*) on from the start of the
 993          * regex.  last1 simply combines thee two offsets.
 994          *
 995          * (*) You might think the latest start point should be
 996          * float_max_offset from the regex origin, and technically
 997          * you'd be correct. However, consider
 998          *    /a\d{2,4}bcd\w/
 999          * Here, float min, max are 3,5 and minlen is 7.
1000          * This can match either
1001          *    /a\d\dbcd\w/
1002          *    /a\d\d\dbcd\w/
1003          *    /a\d\d\d\dbcd\w/
1004          * In the first case, the regex matches minlen chars; in the
1005          * second, minlen+1, in the third, minlen+2.
1006          * In the first case, the floating offset is 3 (which equals
1007          * float_min), in the second, 4, and in the third, 5 (which
1008          * equals float_max). In all cases, the floating string bcd
1009          * can never start more than 4 chars from the end of the
1010          * string, which equals minlen - float_min. As the substring
1011          * starts to match more than float_min from the start of the
1012          * regex, it makes the regex match more than minlen chars,
1013          * and the two cancel each other out. So we can always use
1014          * float_min - minlen, rather than float_max - minlen for the
1015          * latest position in the string.
1016          *
1017          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1018          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1019          */
1020
1021         assert(prog->minlen >= other->min_offset);
1022         last1 = HOP3c(strend,
1023                         other->min_offset - prog->minlen, strbeg);
1024
1025         if (other_ix) {/* i.e. if (other-is-float) */
1026             /* last is the latest point where the floating substr could
1027              * start, *given* any constraints from the earlier fixed
1028              * match. This constraint is that the floating string starts
1029              * <= float_max_offset chars from the regex origin (rx_origin).
1030              * If this value is less than last1, use it instead.
1031              */
1032             assert(rx_origin <= last1);
1033             last =
1034                 /* this condition handles the offset==infinity case, and
1035                  * is a short-cut otherwise. Although it's comparing a
1036                  * byte offset to a char length, it does so in a safe way,
1037                  * since 1 char always occupies 1 or more bytes,
1038                  * so if a string range is  (last1 - rx_origin) bytes,
1039                  * it will be less than or equal to  (last1 - rx_origin)
1040                  * chars; meaning it errs towards doing the accurate HOP3
1041                  * rather than just using last1 as a short-cut */
1042                 (last1 - rx_origin) < other->max_offset
1043                     ? last1
1044                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1045         }
1046         else {
1047             assert(strpos + start_shift <= check_at);
1048             last = HOP4c(check_at, other->min_offset - start_shift,
1049                         strbeg, strend);
1050         }
1051
1052         s = HOP3c(rx_origin, other->min_offset, strend);
1053         if (s < other_last)     /* These positions already checked */
1054             s = other_last;
1055
1056         must = utf8_target ? other->utf8_substr : other->substr;
1057         assert(SvPOK(must));
1058         {
1059             char *from = s;
1060             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1061
1062             if (to > strend)
1063                 to = strend;
1064             if (from > to) {
1065                 s = NULL;
1066                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1067                     "  skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1068                     (IV)(from - strbeg),
1069                     (IV)(to   - strbeg)
1070                 ));
1071             }
1072             else {
1073                 s = fbm_instr(
1074                     (unsigned char*)from,
1075                     (unsigned char*)to,
1076                     must,
1077                     multiline ? FBMrf_MULTILINE : 0
1078                 );
1079                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1080                     "  doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1081                     (IV)(from - strbeg),
1082                     (IV)(to   - strbeg),
1083                     (IV)(s ? s - strbeg : -1)
1084                 ));
1085             }
1086         }
1087
1088         DEBUG_EXECUTE_r({
1089             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1090                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1091             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1092                 s ? "Found" : "Contradicts",
1093                 other_ix ? "floating" : "anchored",
1094                 quoted, RE_SV_TAIL(must));
1095         });
1096
1097
1098         if (!s) {
1099             /* last1 is latest possible substr location. If we didn't
1100              * find it before there, we never will */
1101             if (last >= last1) {
1102                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1103                                         "; giving up...\n"));
1104                 goto fail_finish;
1105             }
1106
1107             /* try to find the check substr again at a later
1108              * position. Maybe next time we'll find the "other" substr
1109              * in range too */
1110             other_last = HOP3c(last, 1, strend) /* highest failure */;
1111             rx_origin =
1112                 other_ix /* i.e. if other-is-float */
1113                     ? HOP3c(rx_origin, 1, strend)
1114                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1115             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1116                 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1117                 (other_ix ? "floating" : "anchored"),
1118                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1119                 (IV)(rx_origin - strbeg)
1120             ));
1121             goto restart;
1122         }
1123         else {
1124             if (other_ix) { /* if (other-is-float) */
1125                 /* other_last is set to s, not s+1, since its possible for
1126                  * a floating substr to fail first time, then succeed
1127                  * second time at the same floating position; e.g.:
1128                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1129                  * The first time round, anchored and float match at
1130                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1131                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1132                  */
1133                 other_last = s;
1134             }
1135             else {
1136                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1137                 other_last = HOP3c(s, 1, strend);
1138             }
1139             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1140                 " at offset %ld (rx_origin now %"IVdf")...\n",
1141                   (long)(s - strbeg),
1142                 (IV)(rx_origin - strbeg)
1143               ));
1144
1145         }
1146     }
1147     else {
1148         DEBUG_OPTIMISE_MORE_r(
1149             Perl_re_printf( aTHX_
1150                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1151                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1152                 " strend:%"IVdf"\n",
1153                 (IV)prog->check_offset_min,
1154                 (IV)prog->check_offset_max,
1155                 (IV)(check_at-strbeg),
1156                 (IV)(rx_origin-strbeg),
1157                 (IV)(rx_origin-check_at),
1158                 (IV)(strend-strbeg)
1159             )
1160         );
1161     }
1162
1163   postprocess_substr_matches:
1164
1165     /* handle the extra constraint of /^.../m if present */
1166
1167     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1168         char *s;
1169
1170         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1171                         "  looking for /^/m anchor"));
1172
1173         /* we have failed the constraint of a \n before rx_origin.
1174          * Find the next \n, if any, even if it's beyond the current
1175          * anchored and/or floating substrings. Whether we should be
1176          * scanning ahead for the next \n or the next substr is debatable.
1177          * On the one hand you'd expect rare substrings to appear less
1178          * often than \n's. On the other hand, searching for \n means
1179          * we're effectively flipping between check_substr and "\n" on each
1180          * iteration as the current "rarest" string candidate, which
1181          * means for example that we'll quickly reject the whole string if
1182          * hasn't got a \n, rather than trying every substr position
1183          * first
1184          */
1185
1186         s = HOP3c(strend, - prog->minlen, strpos);
1187         if (s <= rx_origin ||
1188             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1189         {
1190             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1191                             "  Did not find /%s^%s/m...\n",
1192                             PL_colors[0], PL_colors[1]));
1193             goto fail_finish;
1194         }
1195
1196         /* earliest possible origin is 1 char after the \n.
1197          * (since *rx_origin == '\n', it's safe to ++ here rather than
1198          * HOP(rx_origin, 1)) */
1199         rx_origin++;
1200
1201         if (prog->substrs->check_ix == 0  /* check is anchored */
1202             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1203         {
1204             /* Position contradicts check-string; either because
1205              * check was anchored (and thus has no wiggle room),
1206              * or check was float and rx_origin is above the float range */
1207             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1208                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1209                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1210             goto restart;
1211         }
1212
1213         /* if we get here, the check substr must have been float,
1214          * is in range, and we may or may not have had an anchored
1215          * "other" substr which still contradicts */
1216         assert(prog->substrs->check_ix); /* check is float */
1217
1218         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1219             /* whoops, the anchored "other" substr exists, so we still
1220              * contradict. On the other hand, the float "check" substr
1221              * didn't contradict, so just retry the anchored "other"
1222              * substr */
1223             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1224                 "  Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
1225                 PL_colors[0], PL_colors[1],
1226                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1227                 (IV)(rx_origin - strbeg)
1228             ));
1229             goto do_other_substr;
1230         }
1231
1232         /* success: we don't contradict the found floating substring
1233          * (and there's no anchored substr). */
1234         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1235             "  Found /%s^%s/m with rx_origin %ld...\n",
1236             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1237     }
1238     else {
1239         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1240             "  (multiline anchor test skipped)\n"));
1241     }
1242
1243   success_at_start:
1244
1245
1246     /* if we have a starting character class, then test that extra constraint.
1247      * (trie stclasses are too expensive to use here, we are better off to
1248      * leave it to regmatch itself) */
1249
1250     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1251         const U8* const str = (U8*)STRING(progi->regstclass);
1252
1253         /* XXX this value could be pre-computed */
1254         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1255                     ?  (reginfo->is_utf8_pat
1256                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1257                         : STR_LEN(progi->regstclass))
1258                     : 1);
1259         char * endpos;
1260         char *s;
1261         /* latest pos that a matching float substr constrains rx start to */
1262         char *rx_max_float = NULL;
1263
1264         /* if the current rx_origin is anchored, either by satisfying an
1265          * anchored substring constraint, or a /^.../m constraint, then we
1266          * can reject the current origin if the start class isn't found
1267          * at the current position. If we have a float-only match, then
1268          * rx_origin is constrained to a range; so look for the start class
1269          * in that range. if neither, then look for the start class in the
1270          * whole rest of the string */
1271
1272         /* XXX DAPM it's not clear what the minlen test is for, and why
1273          * it's not used in the floating case. Nothing in the test suite
1274          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1275          * Here are some old comments, which may or may not be correct:
1276          *
1277          *   minlen == 0 is possible if regstclass is \b or \B,
1278          *   and the fixed substr is ''$.
1279          *   Since minlen is already taken into account, rx_origin+1 is
1280          *   before strend; accidentally, minlen >= 1 guaranties no false
1281          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1282          *   0) below assumes that regstclass does not come from lookahead...
1283          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1284          *   This leaves EXACTF-ish only, which are dealt with in
1285          *   find_byclass().
1286          */
1287
1288         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1289             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1290         else if (prog->float_substr || prog->float_utf8) {
1291             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1292             endpos= HOP3c(rx_max_float, cl_l, strend);
1293         }
1294         else
1295             endpos= strend;
1296
1297         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1299             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1300               (IV)start_shift, (IV)(check_at - strbeg),
1301               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1302
1303         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1304                             reginfo);
1305         if (!s) {
1306             if (endpos == strend) {
1307                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1308                                 "  Could not match STCLASS...\n") );
1309                 goto fail;
1310             }
1311             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1312                                "  This position contradicts STCLASS...\n") );
1313             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1314                         && !(prog->intflags & PREGf_IMPLICIT))
1315                 goto fail;
1316
1317             /* Contradict one of substrings */
1318             if (prog->anchored_substr || prog->anchored_utf8) {
1319                 if (prog->substrs->check_ix == 1) { /* check is float */
1320                     /* Have both, check_string is floating */
1321                     assert(rx_origin + start_shift <= check_at);
1322                     if (rx_origin + start_shift != check_at) {
1323                         /* not at latest position float substr could match:
1324                          * Recheck anchored substring, but not floating.
1325                          * The condition above is in bytes rather than
1326                          * chars for efficiency. It's conservative, in
1327                          * that it errs on the side of doing 'goto
1328                          * do_other_substr'. In this case, at worst,
1329                          * an extra anchored search may get done, but in
1330                          * practice the extra fbm_instr() is likely to
1331                          * get skipped anyway. */
1332                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1333                             "  about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1334                             (long)(other_last - strbeg),
1335                             (IV)(rx_origin - strbeg)
1336                         ));
1337                         goto do_other_substr;
1338                     }
1339                 }
1340             }
1341             else {
1342                 /* float-only */
1343
1344                 if (ml_anch) {
1345                     /* In the presence of ml_anch, we might be able to
1346                      * find another \n without breaking the current float
1347                      * constraint. */
1348
1349                     /* strictly speaking this should be HOP3c(..., 1, ...),
1350                      * but since we goto a block of code that's going to
1351                      * search for the next \n if any, its safe here */
1352                     rx_origin++;
1353                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1354                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1355                               PL_colors[0], PL_colors[1],
1356                               (long)(rx_origin - strbeg)) );
1357                     goto postprocess_substr_matches;
1358                 }
1359
1360                 /* strictly speaking this can never be true; but might
1361                  * be if we ever allow intuit without substrings */
1362                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1363                     goto fail;
1364
1365                 rx_origin = rx_max_float;
1366             }
1367
1368             /* at this point, any matching substrings have been
1369              * contradicted. Start again... */
1370
1371             rx_origin = HOP3c(rx_origin, 1, strend);
1372
1373             /* uses bytes rather than char calculations for efficiency.
1374              * It's conservative: it errs on the side of doing 'goto restart',
1375              * where there is code that does a proper char-based test */
1376             if (rx_origin + start_shift + end_shift > strend) {
1377                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1378                                        "  Could not match STCLASS...\n") );
1379                 goto fail;
1380             }
1381             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1382                 "  about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1383                 (prog->substrs->check_ix ? "floating" : "anchored"),
1384                 (long)(rx_origin + start_shift - strbeg),
1385                 (IV)(rx_origin - strbeg)
1386             ));
1387             goto restart;
1388         }
1389
1390         /* Success !!! */
1391
1392         if (rx_origin != s) {
1393             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1394                         "  By STCLASS: moving %ld --> %ld\n",
1395                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1396                    );
1397         }
1398         else {
1399             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1400                                   "  Does not contradict STCLASS...\n");
1401                    );
1402         }
1403     }
1404
1405     /* Decide whether using the substrings helped */
1406
1407     if (rx_origin != strpos) {
1408         /* Fixed substring is found far enough so that the match
1409            cannot start at strpos. */
1410
1411         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1412         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1413     }
1414     else {
1415         /* The found rx_origin position does not prohibit matching at
1416          * strpos, so calling intuit didn't gain us anything. Decrement
1417          * the BmUSEFUL() count on the check substring, and if we reach
1418          * zero, free it.  */
1419         if (!(prog->intflags & PREGf_NAUGHTY)
1420             && (utf8_target ? (
1421                 prog->check_utf8                /* Could be deleted already */
1422                 && --BmUSEFUL(prog->check_utf8) < 0
1423                 && (prog->check_utf8 == prog->float_utf8)
1424             ) : (
1425                 prog->check_substr              /* Could be deleted already */
1426                 && --BmUSEFUL(prog->check_substr) < 0
1427                 && (prog->check_substr == prog->float_substr)
1428             )))
1429         {
1430             /* If flags & SOMETHING - do not do it many times on the same match */
1431             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1432             /* XXX Does the destruction order has to change with utf8_target? */
1433             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1434             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1435             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1436             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1437             check = NULL;                       /* abort */
1438             /* XXXX This is a remnant of the old implementation.  It
1439                     looks wasteful, since now INTUIT can use many
1440                     other heuristics. */
1441             prog->extflags &= ~RXf_USE_INTUIT;
1442         }
1443     }
1444
1445     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1446             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1447              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1448
1449     return rx_origin;
1450
1451   fail_finish:                          /* Substring not found */
1452     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1453         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1454   fail:
1455     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1456                           PL_colors[4], PL_colors[5]));
1457     return NULL;
1458 }
1459
1460
1461 #define DECL_TRIE_TYPE(scan) \
1462     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1463                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1464                  trie_utf8l, trie_flu8 }                                            \
1465                     trie_type = ((scan->flags == EXACT)                             \
1466                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1467                                  : (scan->flags == EXACTL)                          \
1468                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1469                                     : (scan->flags == EXACTFA)                      \
1470                                       ? (utf8_target                                \
1471                                          ? trie_utf8_exactfa_fold                   \
1472                                          : trie_latin_utf8_exactfa_fold)            \
1473                                       : (scan->flags == EXACTFLU8                   \
1474                                          ? trie_flu8                                \
1475                                          : (utf8_target                             \
1476                                            ? trie_utf8_fold                         \
1477                                            :   trie_latin_utf8_fold)))
1478
1479 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1480 STMT_START {                                                                        \
1481     STRLEN skiplen;                                                                 \
1482     U8 flags = FOLD_FLAGS_FULL;                                                     \
1483     switch (trie_type) {                                                            \
1484     case trie_flu8:                                                                 \
1485         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1486         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1487             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1488         }                                                                           \
1489         goto do_trie_utf8_fold;                                                     \
1490     case trie_utf8_exactfa_fold:                                                    \
1491         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1492         /* FALLTHROUGH */                                                           \
1493     case trie_utf8_fold:                                                            \
1494       do_trie_utf8_fold:                                                            \
1495         if ( foldlen>0 ) {                                                          \
1496             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1497             foldlen -= len;                                                         \
1498             uscan += len;                                                           \
1499             len=0;                                                                  \
1500         } else {                                                                    \
1501             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1502             len = UTF8SKIP(uc);                                                     \
1503             skiplen = UVCHR_SKIP( uvc );                                            \
1504             foldlen -= skiplen;                                                     \
1505             uscan = foldbuf + skiplen;                                              \
1506         }                                                                           \
1507         break;                                                                      \
1508     case trie_latin_utf8_exactfa_fold:                                              \
1509         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1510         /* FALLTHROUGH */                                                           \
1511     case trie_latin_utf8_fold:                                                      \
1512         if ( foldlen>0 ) {                                                          \
1513             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1514             foldlen -= len;                                                         \
1515             uscan += len;                                                           \
1516             len=0;                                                                  \
1517         } else {                                                                    \
1518             len = 1;                                                                \
1519             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1520             skiplen = UVCHR_SKIP( uvc );                                            \
1521             foldlen -= skiplen;                                                     \
1522             uscan = foldbuf + skiplen;                                              \
1523         }                                                                           \
1524         break;                                                                      \
1525     case trie_utf8l:                                                                \
1526         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1527         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1528             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1529         }                                                                           \
1530         /* FALLTHROUGH */                                                           \
1531     case trie_utf8:                                                                 \
1532         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1533         break;                                                                      \
1534     case trie_plain:                                                                \
1535         uvc = (UV)*uc;                                                              \
1536         len = 1;                                                                    \
1537     }                                                                               \
1538     if (uvc < 256) {                                                                \
1539         charid = trie->charmap[ uvc ];                                              \
1540     }                                                                               \
1541     else {                                                                          \
1542         charid = 0;                                                                 \
1543         if (widecharmap) {                                                          \
1544             SV** const svpp = hv_fetch(widecharmap,                                 \
1545                         (char*)&uvc, sizeof(UV), 0);                                \
1546             if (svpp)                                                               \
1547                 charid = (U16)SvIV(*svpp);                                          \
1548         }                                                                           \
1549     }                                                                               \
1550 } STMT_END
1551
1552 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1553     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1554                 startpos, doutf8, depth)
1555
1556 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1557 STMT_START {                                              \
1558     while (s <= e) {                                      \
1559         if ( (COND)                                       \
1560              && (ln == 1 || folder(s, pat_string, ln))    \
1561              && (reginfo->intuit || regtry(reginfo, &s)) )\
1562             goto got_it;                                  \
1563         s++;                                              \
1564     }                                                     \
1565 } STMT_END
1566
1567 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1568 STMT_START {                                          \
1569     while (s < strend) {                              \
1570         CODE                                          \
1571         s += UTF8SKIP(s);                             \
1572     }                                                 \
1573 } STMT_END
1574
1575 #define REXEC_FBC_SCAN(CODE)                          \
1576 STMT_START {                                          \
1577     while (s < strend) {                              \
1578         CODE                                          \
1579         s++;                                          \
1580     }                                                 \
1581 } STMT_END
1582
1583 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1584 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1585     if (COND) {                                                \
1586         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1587             goto got_it;                                       \
1588         else                                                   \
1589             tmp = doevery;                                     \
1590     }                                                          \
1591     else                                                       \
1592         tmp = 1;                                               \
1593 )
1594
1595 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1596 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1597     if (COND) {                                                \
1598         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1599             goto got_it;                                       \
1600         else                                                   \
1601             tmp = doevery;                                     \
1602     }                                                          \
1603     else                                                       \
1604         tmp = 1;                                               \
1605 )
1606
1607 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1608     if (utf8_target) {                                         \
1609         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1610     }                                                          \
1611     else {                                                     \
1612         REXEC_FBC_CLASS_SCAN(COND);                            \
1613     }
1614
1615 /* The three macros below are slightly different versions of the same logic.
1616  *
1617  * The first is for /a and /aa when the target string is UTF-8.  This can only
1618  * match ascii, but it must advance based on UTF-8.   The other two handle the
1619  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1620  * for the boundary (or non-boundary) between a word and non-word character.
1621  * The utf8 and non-utf8 cases have the same logic, but the details must be
1622  * different.  Find the "wordness" of the character just prior to this one, and
1623  * compare it with the wordness of this one.  If they differ, we have a
1624  * boundary.  At the beginning of the string, pretend that the previous
1625  * character was a new-line.
1626  *
1627  * All these macros uncleanly have side-effects with each other and outside
1628  * variables.  So far it's been too much trouble to clean-up
1629  *
1630  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1631  *               a word character or not.
1632  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1633  *               word/non-word
1634  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1635  *
1636  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1637  * are looking for a boundary or for a non-boundary.  If we are looking for a
1638  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1639  * see if this tentative match actually works, and if so, to quit the loop
1640  * here.  And vice-versa if we are looking for a non-boundary.
1641  *
1642  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1643  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1644  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1645  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1646  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1647  * complement.  But in that branch we complement tmp, meaning that at the
1648  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1649  * which means at the top of the loop in the next iteration, it is
1650  * TEST_NON_UTF8(s-1) */
1651 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1652     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1653     tmp = TEST_NON_UTF8(tmp);                                                  \
1654     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1655         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1656             tmp = !tmp;                                                        \
1657             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1658         }                                                                      \
1659         else {                                                                 \
1660             IF_FAIL;                                                           \
1661         }                                                                      \
1662     );                                                                         \
1663
1664 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1665  * TEST_UTF8 is a macro that for the same input code points returns identically
1666  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1667 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1668     if (s == reginfo->strbeg) {                                                \
1669         tmp = '\n';                                                            \
1670     }                                                                          \
1671     else { /* Back-up to the start of the previous character */                \
1672         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1673         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1674                                                        0, UTF8_ALLOW_DEFAULT); \
1675     }                                                                          \
1676     tmp = TEST_UV(tmp);                                                        \
1677     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1678     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1679         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1680             tmp = !tmp;                                                        \
1681             IF_SUCCESS;                                                        \
1682         }                                                                      \
1683         else {                                                                 \
1684             IF_FAIL;                                                           \
1685         }                                                                      \
1686     );
1687
1688 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1689  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1690  * macros below */
1691 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1692     if (utf8_target) {                                                         \
1693         UTF8_CODE                                                              \
1694     }                                                                          \
1695     else {  /* Not utf8 */                                                     \
1696         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1697         tmp = TEST_NON_UTF8(tmp);                                              \
1698         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1699             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1700                 IF_SUCCESS;                                                    \
1701                 tmp = !tmp;                                                    \
1702             }                                                                  \
1703             else {                                                             \
1704                 IF_FAIL;                                                       \
1705             }                                                                  \
1706         );                                                                     \
1707     }                                                                          \
1708     /* Here, things have been set up by the previous code so that tmp is the   \
1709      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1710      * utf8ness of the target).  We also have to check if this matches against \
1711      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1712      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1713      * string */                                                               \
1714     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1715         IF_SUCCESS;                                                            \
1716     }                                                                          \
1717     else {                                                                     \
1718         IF_FAIL;                                                               \
1719     }
1720
1721 /* This is the macro to use when we want to see if something that looks like it
1722  * could match, actually does, and if so exits the loop */
1723 #define REXEC_FBC_TRYIT                            \
1724     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1725         goto got_it
1726
1727 /* The only difference between the BOUND and NBOUND cases is that
1728  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1729  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1730  * with the other one being empty (PLACEHOLDER is defined as empty).
1731  *
1732  * The TEST_FOO parameters are for operating on different forms of input, but
1733  * all should be ones that return identically for the same underlying code
1734  * points */
1735 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1736     FBC_BOUND_COMMON(                                                          \
1737           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1738           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1739
1740 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1741     FBC_BOUND_COMMON(                                                          \
1742             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1743             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1744
1745 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1746     FBC_BOUND_COMMON(                                                          \
1747           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1748           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1749
1750 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1751     FBC_BOUND_COMMON(                                                          \
1752             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1753             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1754
1755 #ifdef DEBUGGING
1756 static IV
1757 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1758   IV cp_out = Perl__invlist_search(invlist, cp_in);
1759   assert(cp_out >= 0);
1760   return cp_out;
1761 }
1762 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1763         invmap[S_get_break_val_cp_checked(invlist, cp)]
1764 #else
1765 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1766         invmap[_invlist_search(invlist, cp)]
1767 #endif
1768
1769 /* Takes a pointer to an inversion list, a pointer to its corresponding
1770  * inversion map, and a code point, and returns the code point's value
1771  * according to the two arrays.  It assumes that all code points have a value.
1772  * This is used as the base macro for macros for particular properties */
1773 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1774         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1775
1776 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1777  * of a code point, returning the value for the first code point in the string.
1778  * And it takes the particular macro name that finds the desired value given a
1779  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1780 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1781              (__ASSERT_(pos < strend)                                          \
1782                  /* Note assumes is valid UTF-8 */                             \
1783              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1784
1785 /* Returns the GCB value for the input code point */
1786 #define getGCB_VAL_CP(cp)                                                      \
1787           _generic_GET_BREAK_VAL_CP(                                           \
1788                                     PL_GCB_invlist,                            \
1789                                     _Perl_GCB_invmap,                          \
1790                                     (cp))
1791
1792 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1793  * bounded by pos and strend */
1794 #define getGCB_VAL_UTF8(pos, strend)                                           \
1795     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1796
1797 /* Returns the LB value for the input code point */
1798 #define getLB_VAL_CP(cp)                                                       \
1799           _generic_GET_BREAK_VAL_CP(                                           \
1800                                     PL_LB_invlist,                             \
1801                                     _Perl_LB_invmap,                           \
1802                                     (cp))
1803
1804 /* Returns the LB value for the first code point in the UTF-8 encoded string
1805  * bounded by pos and strend */
1806 #define getLB_VAL_UTF8(pos, strend)                                            \
1807     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1808
1809
1810 /* Returns the SB value for the input code point */
1811 #define getSB_VAL_CP(cp)                                                       \
1812           _generic_GET_BREAK_VAL_CP(                                           \
1813                                     PL_SB_invlist,                             \
1814                                     _Perl_SB_invmap,                     \
1815                                     (cp))
1816
1817 /* Returns the SB value for the first code point in the UTF-8 encoded string
1818  * bounded by pos and strend */
1819 #define getSB_VAL_UTF8(pos, strend)                                            \
1820     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1821
1822 /* Returns the WB value for the input code point */
1823 #define getWB_VAL_CP(cp)                                                       \
1824           _generic_GET_BREAK_VAL_CP(                                           \
1825                                     PL_WB_invlist,                             \
1826                                     _Perl_WB_invmap,                         \
1827                                     (cp))
1828
1829 /* Returns the WB value for the first code point in the UTF-8 encoded string
1830  * bounded by pos and strend */
1831 #define getWB_VAL_UTF8(pos, strend)                                            \
1832     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1833
1834 /* We know what class REx starts with.  Try to find this position... */
1835 /* if reginfo->intuit, its a dryrun */
1836 /* annoyingly all the vars in this routine have different names from their counterparts
1837    in regmatch. /grrr */
1838 STATIC char *
1839 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1840     const char *strend, regmatch_info *reginfo)
1841 {
1842     dVAR;
1843     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1844     char *pat_string;   /* The pattern's exactish string */
1845     char *pat_end;          /* ptr to end char of pat_string */
1846     re_fold_t folder;   /* Function for computing non-utf8 folds */
1847     const U8 *fold_array;   /* array for folding ords < 256 */
1848     STRLEN ln;
1849     STRLEN lnc;
1850     U8 c1;
1851     U8 c2;
1852     char *e;
1853     I32 tmp = 1;        /* Scratch variable? */
1854     const bool utf8_target = reginfo->is_utf8_target;
1855     UV utf8_fold_flags = 0;
1856     const bool is_utf8_pat = reginfo->is_utf8_pat;
1857     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1858                                    with a result inverts that result, as 0^1 =
1859                                    1 and 1^1 = 0 */
1860     _char_class_number classnum;
1861
1862     RXi_GET_DECL(prog,progi);
1863
1864     PERL_ARGS_ASSERT_FIND_BYCLASS;
1865
1866     /* We know what class it must start with. */
1867     switch (OP(c)) {
1868     case ANYOFL:
1869         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1870
1871         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1872             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1873         }
1874
1875         /* FALLTHROUGH */
1876     case ANYOFD:
1877     case ANYOF:
1878         if (utf8_target) {
1879             REXEC_FBC_UTF8_CLASS_SCAN(
1880                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1881         }
1882         else {
1883             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s, 0));
1884         }
1885         break;
1886
1887     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1888         assert(! is_utf8_pat);
1889         /* FALLTHROUGH */
1890     case EXACTFA:
1891         if (is_utf8_pat || utf8_target) {
1892             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1893             goto do_exactf_utf8;
1894         }
1895         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1896         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1897         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1898
1899     case EXACTF:   /* This node only generated for non-utf8 patterns */
1900         assert(! is_utf8_pat);
1901         if (utf8_target) {
1902             utf8_fold_flags = 0;
1903             goto do_exactf_utf8;
1904         }
1905         fold_array = PL_fold;
1906         folder = foldEQ;
1907         goto do_exactf_non_utf8;
1908
1909     case EXACTFL:
1910         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1911         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1912             utf8_fold_flags = FOLDEQ_LOCALE;
1913             goto do_exactf_utf8;
1914         }
1915         fold_array = PL_fold_locale;
1916         folder = foldEQ_locale;
1917         goto do_exactf_non_utf8;
1918
1919     case EXACTFU_SS:
1920         if (is_utf8_pat) {
1921             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1922         }
1923         goto do_exactf_utf8;
1924
1925     case EXACTFLU8:
1926             if (! utf8_target) {    /* All code points in this node require
1927                                        UTF-8 to express.  */
1928                 break;
1929             }
1930             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1931                                              | FOLDEQ_S2_FOLDS_SANE;
1932             goto do_exactf_utf8;
1933
1934     case EXACTFU:
1935         if (is_utf8_pat || utf8_target) {
1936             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1937             goto do_exactf_utf8;
1938         }
1939
1940         /* Any 'ss' in the pattern should have been replaced by regcomp,
1941          * so we don't have to worry here about this single special case
1942          * in the Latin1 range */
1943         fold_array = PL_fold_latin1;
1944         folder = foldEQ_latin1;
1945
1946         /* FALLTHROUGH */
1947
1948       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1949                            are no glitches with fold-length differences
1950                            between the target string and pattern */
1951
1952         /* The idea in the non-utf8 EXACTF* cases is to first find the
1953          * first character of the EXACTF* node and then, if necessary,
1954          * case-insensitively compare the full text of the node.  c1 is the
1955          * first character.  c2 is its fold.  This logic will not work for
1956          * Unicode semantics and the german sharp ss, which hence should
1957          * not be compiled into a node that gets here. */
1958         pat_string = STRING(c);
1959         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1960
1961         /* We know that we have to match at least 'ln' bytes (which is the
1962          * same as characters, since not utf8).  If we have to match 3
1963          * characters, and there are only 2 availabe, we know without
1964          * trying that it will fail; so don't start a match past the
1965          * required minimum number from the far end */
1966         e = HOP3c(strend, -((SSize_t)ln), s);
1967
1968         if (reginfo->intuit && e < s) {
1969             e = s;                      /* Due to minlen logic of intuit() */
1970         }
1971
1972         c1 = *pat_string;
1973         c2 = fold_array[c1];
1974         if (c1 == c2) { /* If char and fold are the same */
1975             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1976         }
1977         else {
1978             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1979         }
1980         break;
1981
1982       do_exactf_utf8:
1983       {
1984         unsigned expansion;
1985
1986         /* If one of the operands is in utf8, we can't use the simpler folding
1987          * above, due to the fact that many different characters can have the
1988          * same fold, or portion of a fold, or different- length fold */
1989         pat_string = STRING(c);
1990         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1991         pat_end = pat_string + ln;
1992         lnc = is_utf8_pat       /* length to match in characters */
1993                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1994                 : ln;
1995
1996         /* We have 'lnc' characters to match in the pattern, but because of
1997          * multi-character folding, each character in the target can match
1998          * up to 3 characters (Unicode guarantees it will never exceed
1999          * this) if it is utf8-encoded; and up to 2 if not (based on the
2000          * fact that the Latin 1 folds are already determined, and the
2001          * only multi-char fold in that range is the sharp-s folding to
2002          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2003          * string character.  Adjust lnc accordingly, rounding up, so that
2004          * if we need to match at least 4+1/3 chars, that really is 5. */
2005         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2006         lnc = (lnc + expansion - 1) / expansion;
2007
2008         /* As in the non-UTF8 case, if we have to match 3 characters, and
2009          * only 2 are left, it's guaranteed to fail, so don't start a
2010          * match that would require us to go beyond the end of the string
2011          */
2012         e = HOP3c(strend, -((SSize_t)lnc), s);
2013
2014         if (reginfo->intuit && e < s) {
2015             e = s;                      /* Due to minlen logic of intuit() */
2016         }
2017
2018         /* XXX Note that we could recalculate e to stop the loop earlier,
2019          * as the worst case expansion above will rarely be met, and as we
2020          * go along we would usually find that e moves further to the left.
2021          * This would happen only after we reached the point in the loop
2022          * where if there were no expansion we should fail.  Unclear if
2023          * worth the expense */
2024
2025         while (s <= e) {
2026             char *my_strend= (char *)strend;
2027             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2028                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2029                 && (reginfo->intuit || regtry(reginfo, &s)) )
2030             {
2031                 goto got_it;
2032             }
2033             s += (utf8_target) ? UTF8SKIP(s) : 1;
2034         }
2035         break;
2036     }
2037
2038     case BOUNDL:
2039         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2040         if (FLAGS(c) != TRADITIONAL_BOUND) {
2041             if (! IN_UTF8_CTYPE_LOCALE) {
2042                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2043                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2044             }
2045             goto do_boundu;
2046         }
2047
2048         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2049         break;
2050
2051     case NBOUNDL:
2052         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2053         if (FLAGS(c) != TRADITIONAL_BOUND) {
2054             if (! IN_UTF8_CTYPE_LOCALE) {
2055                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2056                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2057             }
2058             goto do_nboundu;
2059         }
2060
2061         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2062         break;
2063
2064     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2065                    meaning */
2066         assert(FLAGS(c) == TRADITIONAL_BOUND);
2067
2068         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2069         break;
2070
2071     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2072                    meaning */
2073         assert(FLAGS(c) == TRADITIONAL_BOUND);
2074
2075         FBC_BOUND_A(isWORDCHAR_A);
2076         break;
2077
2078     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2079                    meaning */
2080         assert(FLAGS(c) == TRADITIONAL_BOUND);
2081
2082         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2083         break;
2084
2085     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2086                    meaning */
2087         assert(FLAGS(c) == TRADITIONAL_BOUND);
2088
2089         FBC_NBOUND_A(isWORDCHAR_A);
2090         break;
2091
2092     case NBOUNDU:
2093         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2094             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2095             break;
2096         }
2097
2098       do_nboundu:
2099
2100         to_complement = 1;
2101         /* FALLTHROUGH */
2102
2103     case BOUNDU:
2104       do_boundu:
2105         switch((bound_type) FLAGS(c)) {
2106             case TRADITIONAL_BOUND:
2107                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2108                 break;
2109             case GCB_BOUND:
2110                 if (s == reginfo->strbeg) {
2111                     if (reginfo->intuit || regtry(reginfo, &s))
2112                     {
2113                         goto got_it;
2114                     }
2115
2116                     /* Didn't match.  Try at the next position (if there is one) */
2117                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2118                     if (UNLIKELY(s >= reginfo->strend)) {
2119                         break;
2120                     }
2121                 }
2122
2123                 if (utf8_target) {
2124                     GCB_enum before = getGCB_VAL_UTF8(
2125                                                reghop3((U8*)s, -1,
2126                                                        (U8*)(reginfo->strbeg)),
2127                                                (U8*) reginfo->strend);
2128                     while (s < strend) {
2129                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2130                                                         (U8*) reginfo->strend);
2131                         if (   (to_complement ^ isGCB(before,
2132                                                       after,
2133                                                       (U8*) reginfo->strbeg,
2134                                                       (U8*) s,
2135                                                       utf8_target))
2136                             && (reginfo->intuit || regtry(reginfo, &s)))
2137                         {
2138                             goto got_it;
2139                         }
2140                         before = after;
2141                         s += UTF8SKIP(s);
2142                     }
2143                 }
2144                 else {  /* Not utf8.  Everything is a GCB except between CR and
2145                            LF */
2146                     while (s < strend) {
2147                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2148                                               || UCHARAT(s) != '\n'))
2149                             && (reginfo->intuit || regtry(reginfo, &s)))
2150                         {
2151                             goto got_it;
2152                         }
2153                         s++;
2154                     }
2155                 }
2156
2157                 /* And, since this is a bound, it can match after the final
2158                  * character in the string */
2159                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2160                     goto got_it;
2161                 }
2162                 break;
2163
2164             case LB_BOUND:
2165                 if (s == reginfo->strbeg) {
2166                     if (reginfo->intuit || regtry(reginfo, &s)) {
2167                         goto got_it;
2168                     }
2169                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2170                     if (UNLIKELY(s >= reginfo->strend)) {
2171                         break;
2172                     }
2173                 }
2174
2175                 if (utf8_target) {
2176                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2177                                                                -1,
2178                                                                (U8*)(reginfo->strbeg)),
2179                                                        (U8*) reginfo->strend);
2180                     while (s < strend) {
2181                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2182                         if (to_complement ^ isLB(before,
2183                                                  after,
2184                                                  (U8*) reginfo->strbeg,
2185                                                  (U8*) s,
2186                                                  (U8*) reginfo->strend,
2187                                                  utf8_target)
2188                             && (reginfo->intuit || regtry(reginfo, &s)))
2189                         {
2190                             goto got_it;
2191                         }
2192                         before = after;
2193                         s += UTF8SKIP(s);
2194                     }
2195                 }
2196                 else {  /* Not utf8. */
2197                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2198                     while (s < strend) {
2199                         LB_enum after = getLB_VAL_CP((U8) *s);
2200                         if (to_complement ^ isLB(before,
2201                                                  after,
2202                                                  (U8*) reginfo->strbeg,
2203                                                  (U8*) s,
2204                                                  (U8*) reginfo->strend,
2205                                                  utf8_target)
2206                             && (reginfo->intuit || regtry(reginfo, &s)))
2207                         {
2208                             goto got_it;
2209                         }
2210                         before = after;
2211                         s++;
2212                     }
2213                 }
2214
2215                 if (reginfo->intuit || regtry(reginfo, &s)) {
2216                     goto got_it;
2217                 }
2218
2219                 break;
2220
2221             case SB_BOUND:
2222                 if (s == reginfo->strbeg) {
2223                     if (reginfo->intuit || regtry(reginfo, &s)) {
2224                         goto got_it;
2225                     }
2226                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2227                     if (UNLIKELY(s >= reginfo->strend)) {
2228                         break;
2229                     }
2230                 }
2231
2232                 if (utf8_target) {
2233                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2234                                                         -1,
2235                                                         (U8*)(reginfo->strbeg)),
2236                                                       (U8*) reginfo->strend);
2237                     while (s < strend) {
2238                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2239                                                          (U8*) reginfo->strend);
2240                         if ((to_complement ^ isSB(before,
2241                                                   after,
2242                                                   (U8*) reginfo->strbeg,
2243                                                   (U8*) s,
2244                                                   (U8*) reginfo->strend,
2245                                                   utf8_target))
2246                             && (reginfo->intuit || regtry(reginfo, &s)))
2247                         {
2248                             goto got_it;
2249                         }
2250                         before = after;
2251                         s += UTF8SKIP(s);
2252                     }
2253                 }
2254                 else {  /* Not utf8. */
2255                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2256                     while (s < strend) {
2257                         SB_enum after = getSB_VAL_CP((U8) *s);
2258                         if ((to_complement ^ isSB(before,
2259                                                   after,
2260                                                   (U8*) reginfo->strbeg,
2261                                                   (U8*) s,
2262                                                   (U8*) reginfo->strend,
2263                                                   utf8_target))
2264                             && (reginfo->intuit || regtry(reginfo, &s)))
2265                         {
2266                             goto got_it;
2267                         }
2268                         before = after;
2269                         s++;
2270                     }
2271                 }
2272
2273                 /* Here are at the final position in the target string.  The SB
2274                  * value is always true here, so matches, depending on other
2275                  * constraints */
2276                 if (reginfo->intuit || regtry(reginfo, &s)) {
2277                     goto got_it;
2278                 }
2279
2280                 break;
2281
2282             case WB_BOUND:
2283                 if (s == reginfo->strbeg) {
2284                     if (reginfo->intuit || regtry(reginfo, &s)) {
2285                         goto got_it;
2286                     }
2287                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2288                     if (UNLIKELY(s >= reginfo->strend)) {
2289                         break;
2290                     }
2291                 }
2292
2293                 if (utf8_target) {
2294                     /* We are at a boundary between char_sub_0 and char_sub_1.
2295                      * We also keep track of the value for char_sub_-1 as we
2296                      * loop through the line.   Context may be needed to make a
2297                      * determination, and if so, this can save having to
2298                      * recalculate it */
2299                     WB_enum previous = WB_UNKNOWN;
2300                     WB_enum before = getWB_VAL_UTF8(
2301                                               reghop3((U8*)s,
2302                                                       -1,
2303                                                       (U8*)(reginfo->strbeg)),
2304                                               (U8*) reginfo->strend);
2305                     while (s < strend) {
2306                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2307                                                         (U8*) reginfo->strend);
2308                         if ((to_complement ^ isWB(previous,
2309                                                   before,
2310                                                   after,
2311                                                   (U8*) reginfo->strbeg,
2312                                                   (U8*) s,
2313                                                   (U8*) reginfo->strend,
2314                                                   utf8_target))
2315                             && (reginfo->intuit || regtry(reginfo, &s)))
2316                         {
2317                             goto got_it;
2318                         }
2319                         previous = before;
2320                         before = after;
2321                         s += UTF8SKIP(s);
2322                     }
2323                 }
2324                 else {  /* Not utf8. */
2325                     WB_enum previous = WB_UNKNOWN;
2326                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2327                     while (s < strend) {
2328                         WB_enum after = getWB_VAL_CP((U8) *s);
2329                         if ((to_complement ^ isWB(previous,
2330                                                   before,
2331                                                   after,
2332                                                   (U8*) reginfo->strbeg,
2333                                                   (U8*) s,
2334                                                   (U8*) reginfo->strend,
2335                                                   utf8_target))
2336                             && (reginfo->intuit || regtry(reginfo, &s)))
2337                         {
2338                             goto got_it;
2339                         }
2340                         previous = before;
2341                         before = after;
2342                         s++;
2343                     }
2344                 }
2345
2346                 if (reginfo->intuit || regtry(reginfo, &s)) {
2347                     goto got_it;
2348                 }
2349         }
2350         break;
2351
2352     case LNBREAK:
2353         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2354                         is_LNBREAK_latin1_safe(s, strend)
2355         );
2356         break;
2357
2358     /* The argument to all the POSIX node types is the class number to pass to
2359      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2360
2361     case NPOSIXL:
2362         to_complement = 1;
2363         /* FALLTHROUGH */
2364
2365     case POSIXL:
2366         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2367         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2368                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2369         break;
2370
2371     case NPOSIXD:
2372         to_complement = 1;
2373         /* FALLTHROUGH */
2374
2375     case POSIXD:
2376         if (utf8_target) {
2377             goto posix_utf8;
2378         }
2379         goto posixa;
2380
2381     case NPOSIXA:
2382         if (utf8_target) {
2383             /* The complement of something that matches only ASCII matches all
2384              * non-ASCII, plus everything in ASCII that isn't in the class. */
2385             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2386                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2387             break;
2388         }
2389
2390         to_complement = 1;
2391         /* FALLTHROUGH */
2392
2393     case POSIXA:
2394       posixa:
2395         /* Don't need to worry about utf8, as it can match only a single
2396          * byte invariant character. */
2397         REXEC_FBC_CLASS_SCAN(
2398                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2399         break;
2400
2401     case NPOSIXU:
2402         to_complement = 1;
2403         /* FALLTHROUGH */
2404
2405     case POSIXU:
2406         if (! utf8_target) {
2407             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2408                                                                     FLAGS(c))));
2409         }
2410         else {
2411
2412           posix_utf8:
2413             classnum = (_char_class_number) FLAGS(c);
2414             if (classnum < _FIRST_NON_SWASH_CC) {
2415                 while (s < strend) {
2416
2417                     /* We avoid loading in the swash as long as possible, but
2418                      * should we have to, we jump to a separate loop.  This
2419                      * extra 'if' statement is what keeps this code from being
2420                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2421                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2422                         goto found_above_latin1;
2423                     }
2424                     if ((UTF8_IS_INVARIANT(*s)
2425                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2426                                                                 classnum)))
2427                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2428                             && to_complement ^ cBOOL(
2429                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2430                                                                       *(s + 1)),
2431                                               classnum))))
2432                     {
2433                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2434                             goto got_it;
2435                         else {
2436                             tmp = doevery;
2437                         }
2438                     }
2439                     else {
2440                         tmp = 1;
2441                     }
2442                     s += UTF8SKIP(s);
2443                 }
2444             }
2445             else switch (classnum) {    /* These classes are implemented as
2446                                            macros */
2447                 case _CC_ENUM_SPACE:
2448                     REXEC_FBC_UTF8_CLASS_SCAN(
2449                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2450                     break;
2451
2452                 case _CC_ENUM_BLANK:
2453                     REXEC_FBC_UTF8_CLASS_SCAN(
2454                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2455                     break;
2456
2457                 case _CC_ENUM_XDIGIT:
2458                     REXEC_FBC_UTF8_CLASS_SCAN(
2459                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2460                     break;
2461
2462                 case _CC_ENUM_VERTSPACE:
2463                     REXEC_FBC_UTF8_CLASS_SCAN(
2464                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2465                     break;
2466
2467                 case _CC_ENUM_CNTRL:
2468                     REXEC_FBC_UTF8_CLASS_SCAN(
2469                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2470                     break;
2471
2472                 default:
2473                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2474                     NOT_REACHED; /* NOTREACHED */
2475             }
2476         }
2477         break;
2478
2479       found_above_latin1:   /* Here we have to load a swash to get the result
2480                                for the current code point */
2481         if (! PL_utf8_swash_ptrs[classnum]) {
2482             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2483             PL_utf8_swash_ptrs[classnum] =
2484                     _core_swash_init("utf8",
2485                                      "",
2486                                      &PL_sv_undef, 1, 0,
2487                                      PL_XPosix_ptrs[classnum], &flags);
2488         }
2489
2490         /* This is a copy of the loop above for swash classes, though using the
2491          * FBC macro instead of being expanded out.  Since we've loaded the
2492          * swash, we don't have to check for that each time through the loop */
2493         REXEC_FBC_UTF8_CLASS_SCAN(
2494                 to_complement ^ cBOOL(_generic_utf8(
2495                                       classnum,
2496                                       s,
2497                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2498                                                   (U8 *) s, TRUE))));
2499         break;
2500
2501     case AHOCORASICKC:
2502     case AHOCORASICK:
2503         {
2504             DECL_TRIE_TYPE(c);
2505             /* what trie are we using right now */
2506             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2507             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2508             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2509
2510             const char *last_start = strend - trie->minlen;
2511 #ifdef DEBUGGING
2512             const char *real_start = s;
2513 #endif
2514             STRLEN maxlen = trie->maxlen;
2515             SV *sv_points;
2516             U8 **points; /* map of where we were in the input string
2517                             when reading a given char. For ASCII this
2518                             is unnecessary overhead as the relationship
2519                             is always 1:1, but for Unicode, especially
2520                             case folded Unicode this is not true. */
2521             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2522             U8 *bitmap=NULL;
2523
2524
2525             GET_RE_DEBUG_FLAGS_DECL;
2526
2527             /* We can't just allocate points here. We need to wrap it in
2528              * an SV so it gets freed properly if there is a croak while
2529              * running the match */
2530             ENTER;
2531             SAVETMPS;
2532             sv_points=newSV(maxlen * sizeof(U8 *));
2533             SvCUR_set(sv_points,
2534                 maxlen * sizeof(U8 *));
2535             SvPOK_on(sv_points);
2536             sv_2mortal(sv_points);
2537             points=(U8**)SvPV_nolen(sv_points );
2538             if ( trie_type != trie_utf8_fold
2539                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2540             {
2541                 if (trie->bitmap)
2542                     bitmap=(U8*)trie->bitmap;
2543                 else
2544                     bitmap=(U8*)ANYOF_BITMAP(c);
2545             }
2546             /* this is the Aho-Corasick algorithm modified a touch
2547                to include special handling for long "unknown char" sequences.
2548                The basic idea being that we use AC as long as we are dealing
2549                with a possible matching char, when we encounter an unknown char
2550                (and we have not encountered an accepting state) we scan forward
2551                until we find a legal starting char.
2552                AC matching is basically that of trie matching, except that when
2553                we encounter a failing transition, we fall back to the current
2554                states "fail state", and try the current char again, a process
2555                we repeat until we reach the root state, state 1, or a legal
2556                transition. If we fail on the root state then we can either
2557                terminate if we have reached an accepting state previously, or
2558                restart the entire process from the beginning if we have not.
2559
2560              */
2561             while (s <= last_start) {
2562                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2563                 U8 *uc = (U8*)s;
2564                 U16 charid = 0;
2565                 U32 base = 1;
2566                 U32 state = 1;
2567                 UV uvc = 0;
2568                 STRLEN len = 0;
2569                 STRLEN foldlen = 0;
2570                 U8 *uscan = (U8*)NULL;
2571                 U8 *leftmost = NULL;
2572 #ifdef DEBUGGING
2573                 U32 accepted_word= 0;
2574 #endif
2575                 U32 pointpos = 0;
2576
2577                 while ( state && uc <= (U8*)strend ) {
2578                     int failed=0;
2579                     U32 word = aho->states[ state ].wordnum;
2580
2581                     if( state==1 ) {
2582                         if ( bitmap ) {
2583                             DEBUG_TRIE_EXECUTE_r(
2584                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2585                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2586                                         (char *)uc, utf8_target, 0 );
2587                                     Perl_re_printf( aTHX_
2588                                         " Scanning for legal start char...\n");
2589                                 }
2590                             );
2591                             if (utf8_target) {
2592                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2593                                     uc += UTF8SKIP(uc);
2594                                 }
2595                             } else {
2596                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2597                                     uc++;
2598                                 }
2599                             }
2600                             s= (char *)uc;
2601                         }
2602                         if (uc >(U8*)last_start) break;
2603                     }
2604
2605                     if ( word ) {
2606                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2607                         if (!leftmost || lpos < leftmost) {
2608                             DEBUG_r(accepted_word=word);
2609                             leftmost= lpos;
2610                         }
2611                         if (base==0) break;
2612
2613                     }
2614                     points[pointpos++ % maxlen]= uc;
2615                     if (foldlen || uc < (U8*)strend) {
2616                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2617                                          widecharmap, uc,
2618                                          uscan, len, uvc, charid, foldlen,
2619                                          foldbuf, uniflags);
2620                         DEBUG_TRIE_EXECUTE_r({
2621                             dump_exec_pos( (char *)uc, c, strend,
2622                                         real_start, s, utf8_target, 0);
2623                             Perl_re_printf( aTHX_
2624                                 " Charid:%3u CP:%4"UVxf" ",
2625                                  charid, uvc);
2626                         });
2627                     }
2628                     else {
2629                         len = 0;
2630                         charid = 0;
2631                     }
2632
2633
2634                     do {
2635 #ifdef DEBUGGING
2636                         word = aho->states[ state ].wordnum;
2637 #endif
2638                         base = aho->states[ state ].trans.base;
2639
2640                         DEBUG_TRIE_EXECUTE_r({
2641                             if (failed)
2642                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2643                                     s,   utf8_target, 0 );
2644                             Perl_re_printf( aTHX_
2645                                 "%sState: %4"UVxf", word=%"UVxf,
2646                                 failed ? " Fail transition to " : "",
2647                                 (UV)state, (UV)word);
2648                         });
2649                         if ( base ) {
2650                             U32 tmp;
2651                             I32 offset;
2652                             if (charid &&
2653                                  ( ((offset = base + charid
2654                                     - 1 - trie->uniquecharcount)) >= 0)
2655                                  && ((U32)offset < trie->lasttrans)
2656                                  && trie->trans[offset].check == state
2657                                  && (tmp=trie->trans[offset].next))
2658                             {
2659                                 DEBUG_TRIE_EXECUTE_r(
2660                                     Perl_re_printf( aTHX_ " - legal\n"));
2661                                 state = tmp;
2662                                 break;
2663                             }
2664                             else {
2665                                 DEBUG_TRIE_EXECUTE_r(
2666                                     Perl_re_printf( aTHX_ " - fail\n"));
2667                                 failed = 1;
2668                                 state = aho->fail[state];
2669                             }
2670                         }
2671                         else {
2672                             /* we must be accepting here */
2673                             DEBUG_TRIE_EXECUTE_r(
2674                                     Perl_re_printf( aTHX_ " - accepting\n"));
2675                             failed = 1;
2676                             break;
2677                         }
2678                     } while(state);
2679                     uc += len;
2680                     if (failed) {
2681                         if (leftmost)
2682                             break;
2683                         if (!state) state = 1;
2684                     }
2685                 }
2686                 if ( aho->states[ state ].wordnum ) {
2687                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2688                     if (!leftmost || lpos < leftmost) {
2689                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2690                         leftmost = lpos;
2691                     }
2692                 }
2693                 if (leftmost) {
2694                     s = (char*)leftmost;
2695                     DEBUG_TRIE_EXECUTE_r({
2696                         Perl_re_printf( aTHX_  "Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2697                             (UV)accepted_word, (IV)(s - real_start)
2698                         );
2699                     });
2700                     if (reginfo->intuit || regtry(reginfo, &s)) {
2701                         FREETMPS;
2702                         LEAVE;
2703                         goto got_it;
2704                     }
2705                     s = HOPc(s,1);
2706                     DEBUG_TRIE_EXECUTE_r({
2707                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2708                     });
2709                 } else {
2710                     DEBUG_TRIE_EXECUTE_r(
2711                         Perl_re_printf( aTHX_ "No match.\n"));
2712                     break;
2713                 }
2714             }
2715             FREETMPS;
2716             LEAVE;
2717         }
2718         break;
2719     default:
2720         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2721     }
2722     return 0;
2723   got_it:
2724     return s;
2725 }
2726
2727 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2728  * flags have same meanings as with regexec_flags() */
2729
2730 static void
2731 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2732                             char *strbeg,
2733                             char *strend,
2734                             SV *sv,
2735                             U32 flags,
2736                             bool utf8_target)
2737 {
2738     struct regexp *const prog = ReANY(rx);
2739
2740     if (flags & REXEC_COPY_STR) {
2741 #ifdef PERL_ANY_COW
2742         if (SvCANCOW(sv)) {
2743             DEBUG_C(Perl_re_printf( aTHX_
2744                               "Copy on write: regexp capture, type %d\n",
2745                                     (int) SvTYPE(sv)));
2746             /* Create a new COW SV to share the match string and store
2747              * in saved_copy, unless the current COW SV in saved_copy
2748              * is valid and suitable for our purpose */
2749             if ((   prog->saved_copy
2750                  && SvIsCOW(prog->saved_copy)
2751                  && SvPOKp(prog->saved_copy)
2752                  && SvIsCOW(sv)
2753                  && SvPOKp(sv)
2754                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2755             {
2756                 /* just reuse saved_copy SV */
2757                 if (RXp_MATCH_COPIED(prog)) {
2758                     Safefree(prog->subbeg);
2759                     RXp_MATCH_COPIED_off(prog);
2760                 }
2761             }
2762             else {
2763                 /* create new COW SV to share string */
2764                 RX_MATCH_COPY_FREE(rx);
2765                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2766             }
2767             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2768             assert (SvPOKp(prog->saved_copy));
2769             prog->sublen  = strend - strbeg;
2770             prog->suboffset = 0;
2771             prog->subcoffset = 0;
2772         } else
2773 #endif
2774         {
2775             SSize_t min = 0;
2776             SSize_t max = strend - strbeg;
2777             SSize_t sublen;
2778
2779             if (    (flags & REXEC_COPY_SKIP_POST)
2780                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2781                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2782             ) { /* don't copy $' part of string */
2783                 U32 n = 0;
2784                 max = -1;
2785                 /* calculate the right-most part of the string covered
2786                  * by a capture. Due to lookahead, this may be to
2787                  * the right of $&, so we have to scan all captures */
2788                 while (n <= prog->lastparen) {
2789                     if (prog->offs[n].end > max)
2790                         max = prog->offs[n].end;
2791                     n++;
2792                 }
2793                 if (max == -1)
2794                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2795                             ? prog->offs[0].start
2796                             : 0;
2797                 assert(max >= 0 && max <= strend - strbeg);
2798             }
2799
2800             if (    (flags & REXEC_COPY_SKIP_PRE)
2801                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2802                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2803             ) { /* don't copy $` part of string */
2804                 U32 n = 0;
2805                 min = max;
2806                 /* calculate the left-most part of the string covered
2807                  * by a capture. Due to lookbehind, this may be to
2808                  * the left of $&, so we have to scan all captures */
2809                 while (min && n <= prog->lastparen) {
2810                     if (   prog->offs[n].start != -1
2811                         && prog->offs[n].start < min)
2812                     {
2813                         min = prog->offs[n].start;
2814                     }
2815                     n++;
2816                 }
2817                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2818                     && min >  prog->offs[0].end
2819                 )
2820                     min = prog->offs[0].end;
2821
2822             }
2823
2824             assert(min >= 0 && min <= max && min <= strend - strbeg);
2825             sublen = max - min;
2826
2827             if (RX_MATCH_COPIED(rx)) {
2828                 if (sublen > prog->sublen)
2829                     prog->subbeg =
2830                             (char*)saferealloc(prog->subbeg, sublen+1);
2831             }
2832             else
2833                 prog->subbeg = (char*)safemalloc(sublen+1);
2834             Copy(strbeg + min, prog->subbeg, sublen, char);
2835             prog->subbeg[sublen] = '\0';
2836             prog->suboffset = min;
2837             prog->sublen = sublen;
2838             RX_MATCH_COPIED_on(rx);
2839         }
2840         prog->subcoffset = prog->suboffset;
2841         if (prog->suboffset && utf8_target) {
2842             /* Convert byte offset to chars.
2843              * XXX ideally should only compute this if @-/@+
2844              * has been seen, a la PL_sawampersand ??? */
2845
2846             /* If there's a direct correspondence between the
2847              * string which we're matching and the original SV,
2848              * then we can use the utf8 len cache associated with
2849              * the SV. In particular, it means that under //g,
2850              * sv_pos_b2u() will use the previously cached
2851              * position to speed up working out the new length of
2852              * subcoffset, rather than counting from the start of
2853              * the string each time. This stops
2854              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2855              * from going quadratic */
2856             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2857                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2858                                                 SV_GMAGIC|SV_CONST_RETURN);
2859             else
2860                 prog->subcoffset = utf8_length((U8*)strbeg,
2861                                     (U8*)(strbeg+prog->suboffset));
2862         }
2863     }
2864     else {
2865         RX_MATCH_COPY_FREE(rx);
2866         prog->subbeg = strbeg;
2867         prog->suboffset = 0;
2868         prog->subcoffset = 0;
2869         prog->sublen = strend - strbeg;
2870     }
2871 }
2872
2873
2874
2875
2876 /*
2877  - regexec_flags - match a regexp against a string
2878  */
2879 I32
2880 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2881               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2882 /* stringarg: the point in the string at which to begin matching */
2883 /* strend:    pointer to null at end of string */
2884 /* strbeg:    real beginning of string */
2885 /* minend:    end of match must be >= minend bytes after stringarg. */
2886 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2887  *            itself is accessed via the pointers above */
2888 /* data:      May be used for some additional optimizations.
2889               Currently unused. */
2890 /* flags:     For optimizations. See REXEC_* in regexp.h */
2891
2892 {
2893     struct regexp *const prog = ReANY(rx);
2894     char *s;
2895     regnode *c;
2896     char *startpos;
2897     SSize_t minlen;             /* must match at least this many chars */
2898     SSize_t dontbother = 0;     /* how many characters not to try at end */
2899     const bool utf8_target = cBOOL(DO_UTF8(sv));
2900     I32 multiline;
2901     RXi_GET_DECL(prog,progi);
2902     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2903     regmatch_info *const reginfo = &reginfo_buf;
2904     regexp_paren_pair *swap = NULL;
2905     I32 oldsave;
2906     GET_RE_DEBUG_FLAGS_DECL;
2907
2908     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2909     PERL_UNUSED_ARG(data);
2910
2911     /* Be paranoid... */
2912     if (prog == NULL) {
2913         Perl_croak(aTHX_ "NULL regexp parameter");
2914     }
2915
2916     DEBUG_EXECUTE_r(
2917         debug_start_match(rx, utf8_target, stringarg, strend,
2918         "Matching");
2919     );
2920
2921     startpos = stringarg;
2922
2923     /* set these early as they may be used by the HOP macros below */
2924     reginfo->strbeg = strbeg;
2925     reginfo->strend = strend;
2926     reginfo->is_utf8_target = cBOOL(utf8_target);
2927
2928     if (prog->intflags & PREGf_GPOS_SEEN) {
2929         MAGIC *mg;
2930
2931         /* set reginfo->ganch, the position where \G can match */
2932
2933         reginfo->ganch =
2934             (flags & REXEC_IGNOREPOS)
2935             ? stringarg /* use start pos rather than pos() */
2936             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2937               /* Defined pos(): */
2938             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2939             : strbeg; /* pos() not defined; use start of string */
2940
2941         DEBUG_GPOS_r(Perl_re_printf( aTHX_
2942             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2943
2944         /* in the presence of \G, we may need to start looking earlier in
2945          * the string than the suggested start point of stringarg:
2946          * if prog->gofs is set, then that's a known, fixed minimum
2947          * offset, such as
2948          * /..\G/:   gofs = 2
2949          * /ab|c\G/: gofs = 1
2950          * or if the minimum offset isn't known, then we have to go back
2951          * to the start of the string, e.g. /w+\G/
2952          */
2953
2954         if (prog->intflags & PREGf_ANCH_GPOS) {
2955             if (prog->gofs) {
2956                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2957                 if (!startpos ||
2958                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2959                 {
2960                     DEBUG_r(Perl_re_printf( aTHX_
2961                             "fail: ganch-gofs before earliest possible start\n"));
2962                     return 0;
2963                 }
2964             }
2965             else
2966                 startpos = reginfo->ganch;
2967         }
2968         else if (prog->gofs) {
2969             startpos = HOPBACKc(startpos, prog->gofs);
2970             if (!startpos)
2971                 startpos = strbeg;
2972         }
2973         else if (prog->intflags & PREGf_GPOS_FLOAT)
2974             startpos = strbeg;
2975     }
2976
2977     minlen = prog->minlen;
2978     if ((startpos + minlen) > strend || startpos < strbeg) {
2979         DEBUG_r(Perl_re_printf( aTHX_
2980                     "Regex match can't succeed, so not even tried\n"));
2981         return 0;
2982     }
2983
2984     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2985      * which will call destuctors to reset PL_regmatch_state, free higher
2986      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2987      * regmatch_info_aux_eval */
2988
2989     oldsave = PL_savestack_ix;
2990
2991     s = startpos;
2992
2993     if ((prog->extflags & RXf_USE_INTUIT)
2994         && !(flags & REXEC_CHECKED))
2995     {
2996         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2997                                     flags, NULL);
2998         if (!s)
2999             return 0;
3000
3001         if (prog->extflags & RXf_CHECK_ALL) {
3002             /* we can match based purely on the result of INTUIT.
3003              * Set up captures etc just for $& and $-[0]
3004              * (an intuit-only match wont have $1,$2,..) */
3005             assert(!prog->nparens);
3006
3007             /* s/// doesn't like it if $& is earlier than where we asked it to
3008              * start searching (which can happen on something like /.\G/) */
3009             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3010                     && (s < stringarg))
3011             {
3012                 /* this should only be possible under \G */
3013                 assert(prog->intflags & PREGf_GPOS_SEEN);
3014                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3015                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3016                 goto phooey;
3017             }
3018
3019             /* match via INTUIT shouldn't have any captures.
3020              * Let @-, @+, $^N know */
3021             prog->lastparen = prog->lastcloseparen = 0;
3022             RX_MATCH_UTF8_set(rx, utf8_target);
3023             prog->offs[0].start = s - strbeg;
3024             prog->offs[0].end = utf8_target
3025                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3026                 : s - strbeg + prog->minlenret;
3027             if ( !(flags & REXEC_NOT_FIRST) )
3028                 S_reg_set_capture_string(aTHX_ rx,
3029                                         strbeg, strend,
3030                                         sv, flags, utf8_target);
3031
3032             return 1;
3033         }
3034     }
3035
3036     multiline = prog->extflags & RXf_PMf_MULTILINE;
3037
3038     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3039         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3040                               "String too short [regexec_flags]...\n"));
3041         goto phooey;
3042     }
3043
3044     /* Check validity of program. */
3045     if (UCHARAT(progi->program) != REG_MAGIC) {
3046         Perl_croak(aTHX_ "corrupted regexp program");
3047     }
3048
3049     RX_MATCH_TAINTED_off(rx);
3050     RX_MATCH_UTF8_set(rx, utf8_target);
3051
3052     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3053     reginfo->intuit = 0;
3054     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3055     reginfo->warned = FALSE;
3056     reginfo->sv = sv;
3057     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3058     /* see how far we have to get to not match where we matched before */
3059     reginfo->till = stringarg + minend;
3060
3061     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3062         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3063            S_cleanup_regmatch_info_aux has executed (registered by
3064            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3065            magic belonging to this SV.
3066            Not newSVsv, either, as it does not COW.
3067         */
3068         reginfo->sv = newSV(0);
3069         SvSetSV_nosteal(reginfo->sv, sv);
3070         SAVEFREESV(reginfo->sv);
3071     }
3072
3073     /* reserve next 2 or 3 slots in PL_regmatch_state:
3074      * slot N+0: may currently be in use: skip it
3075      * slot N+1: use for regmatch_info_aux struct
3076      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3077      * slot N+3: ready for use by regmatch()
3078      */
3079
3080     {
3081         regmatch_state *old_regmatch_state;
3082         regmatch_slab  *old_regmatch_slab;
3083         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3084
3085         /* on first ever match, allocate first slab */
3086         if (!PL_regmatch_slab) {
3087             Newx(PL_regmatch_slab, 1, regmatch_slab);
3088             PL_regmatch_slab->prev = NULL;
3089             PL_regmatch_slab->next = NULL;
3090             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3091         }
3092
3093         old_regmatch_state = PL_regmatch_state;
3094         old_regmatch_slab  = PL_regmatch_slab;
3095
3096         for (i=0; i <= max; i++) {
3097             if (i == 1)
3098                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3099             else if (i ==2)
3100                 reginfo->info_aux_eval =
3101                 reginfo->info_aux->info_aux_eval =
3102                             &(PL_regmatch_state->u.info_aux_eval);
3103
3104             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3105                 PL_regmatch_state = S_push_slab(aTHX);
3106         }
3107
3108         /* note initial PL_regmatch_state position; at end of match we'll
3109          * pop back to there and free any higher slabs */
3110
3111         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3112         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3113         reginfo->info_aux->poscache = NULL;
3114
3115         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3116
3117         if ((prog->extflags & RXf_EVAL_SEEN))
3118             S_setup_eval_state(aTHX_ reginfo);
3119         else
3120             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3121     }
3122
3123     /* If there is a "must appear" string, look for it. */
3124
3125     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3126         /* We have to be careful. If the previous successful match
3127            was from this regex we don't want a subsequent partially
3128            successful match to clobber the old results.
3129            So when we detect this possibility we add a swap buffer
3130            to the re, and switch the buffer each match. If we fail,
3131            we switch it back; otherwise we leave it swapped.
3132         */
3133         swap = prog->offs;
3134         /* do we need a save destructor here for eval dies? */
3135         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3136         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3137             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3138             0,
3139             PTR2UV(prog),
3140             PTR2UV(swap),
3141             PTR2UV(prog->offs)
3142         ));
3143     }
3144
3145     if (prog->recurse_locinput)
3146         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3147
3148     /* Simplest case: anchored match need be tried only once, or with
3149      * MBOL, only at the beginning of each line.
3150      *
3151      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3152      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3153      * match at the start of the string then it won't match anywhere else
3154      * either; while with /.*.../, if it doesn't match at the beginning,
3155      * the earliest it could match is at the start of the next line */
3156
3157     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3158         char *end;
3159
3160         if (regtry(reginfo, &s))
3161             goto got_it;
3162
3163         if (!(prog->intflags & PREGf_ANCH_MBOL))
3164             goto phooey;
3165
3166         /* didn't match at start, try at other newline positions */
3167
3168         if (minlen)
3169             dontbother = minlen - 1;
3170         end = HOP3c(strend, -dontbother, strbeg) - 1;
3171
3172         /* skip to next newline */
3173
3174         while (s <= end) { /* note it could be possible to match at the end of the string */
3175             /* NB: newlines are the same in unicode as they are in latin */
3176             if (*s++ != '\n')
3177                 continue;
3178             if (prog->check_substr || prog->check_utf8) {
3179             /* note that with PREGf_IMPLICIT, intuit can only fail
3180              * or return the start position, so it's of limited utility.
3181              * Nevertheless, I made the decision that the potential for
3182              * quick fail was still worth it - DAPM */
3183                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3184                 if (!s)
3185                     goto phooey;
3186             }
3187             if (regtry(reginfo, &s))
3188                 goto got_it;
3189         }
3190         goto phooey;
3191     } /* end anchored search */
3192
3193     if (prog->intflags & PREGf_ANCH_GPOS)
3194     {
3195         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3196         assert(prog->intflags & PREGf_GPOS_SEEN);
3197         /* For anchored \G, the only position it can match from is
3198          * (ganch-gofs); we already set startpos to this above; if intuit
3199          * moved us on from there, we can't possibly succeed */
3200         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3201         if (s == startpos && regtry(reginfo, &s))
3202             goto got_it;
3203         goto phooey;
3204     }
3205
3206     /* Messy cases:  unanchored match. */
3207     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3208         /* we have /x+whatever/ */
3209         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3210         char ch;
3211 #ifdef DEBUGGING
3212         int did_match = 0;
3213 #endif
3214         if (utf8_target) {
3215             if (! prog->anchored_utf8) {
3216                 to_utf8_substr(prog);
3217             }
3218             ch = SvPVX_const(prog->anchored_utf8)[0];
3219             REXEC_FBC_SCAN(
3220                 if (*s == ch) {
3221                     DEBUG_EXECUTE_r( did_match = 1 );
3222                     if (regtry(reginfo, &s)) goto got_it;
3223                     s += UTF8SKIP(s);
3224                     while (s < strend && *s == ch)
3225                         s += UTF8SKIP(s);
3226                 }
3227             );
3228
3229         }
3230         else {
3231             if (! prog->anchored_substr) {
3232                 if (! to_byte_substr(prog)) {
3233                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3234                 }
3235             }
3236             ch = SvPVX_const(prog->anchored_substr)[0];
3237             REXEC_FBC_SCAN(
3238                 if (*s == ch) {
3239                     DEBUG_EXECUTE_r( did_match = 1 );
3240                     if (regtry(reginfo, &s)) goto got_it;
3241                     s++;
3242                     while (s < strend && *s == ch)
3243                         s++;
3244                 }
3245             );
3246         }
3247         DEBUG_EXECUTE_r(if (!did_match)
3248                 Perl_re_printf( aTHX_
3249                                   "Did not find anchored character...\n")
3250                );
3251     }
3252     else if (prog->anchored_substr != NULL
3253               || prog->anchored_utf8 != NULL
3254               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3255                   && prog->float_max_offset < strend - s)) {
3256         SV *must;
3257         SSize_t back_max;
3258         SSize_t back_min;
3259         char *last;
3260         char *last1;            /* Last position checked before */
3261 #ifdef DEBUGGING
3262         int did_match = 0;
3263 #endif
3264         if (prog->anchored_substr || prog->anchored_utf8) {
3265             if (utf8_target) {
3266                 if (! prog->anchored_utf8) {
3267                     to_utf8_substr(prog);
3268                 }
3269                 must = prog->anchored_utf8;
3270             }
3271             else {
3272                 if (! prog->anchored_substr) {
3273                     if (! to_byte_substr(prog)) {
3274                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3275                     }
3276                 }
3277                 must = prog->anchored_substr;
3278             }
3279             back_max = back_min = prog->anchored_offset;
3280         } else {
3281             if (utf8_target) {
3282                 if (! prog->float_utf8) {
3283                     to_utf8_substr(prog);
3284                 }
3285                 must = prog->float_utf8;
3286             }
3287             else {
3288                 if (! prog->float_substr) {
3289                     if (! to_byte_substr(prog)) {
3290                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3291                     }
3292                 }
3293                 must = prog->float_substr;
3294             }
3295             back_max = prog->float_max_offset;
3296             back_min = prog->float_min_offset;
3297         }
3298
3299         if (back_min<0) {
3300             last = strend;
3301         } else {
3302             last = HOP3c(strend,        /* Cannot start after this */
3303                   -(SSize_t)(CHR_SVLEN(must)
3304                          - (SvTAIL(must) != 0) + back_min), strbeg);
3305         }
3306         if (s > reginfo->strbeg)
3307             last1 = HOPc(s, -1);
3308         else
3309             last1 = s - 1;      /* bogus */
3310
3311         /* XXXX check_substr already used to find "s", can optimize if
3312            check_substr==must. */
3313         dontbother = 0;
3314         strend = HOPc(strend, -dontbother);
3315         while ( (s <= last) &&
3316                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3317                                   (unsigned char*)strend, must,
3318                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3319             DEBUG_EXECUTE_r( did_match = 1 );
3320             if (HOPc(s, -back_max) > last1) {
3321                 last1 = HOPc(s, -back_min);
3322                 s = HOPc(s, -back_max);
3323             }
3324             else {
3325                 char * const t = (last1 >= reginfo->strbeg)
3326                                     ? HOPc(last1, 1) : last1 + 1;
3327
3328                 last1 = HOPc(s, -back_min);
3329                 s = t;
3330             }
3331             if (utf8_target) {
3332                 while (s <= last1) {
3333                     if (regtry(reginfo, &s))
3334                         goto got_it;
3335                     if (s >= last1) {
3336                         s++; /* to break out of outer loop */
3337                         break;
3338                     }
3339                     s += UTF8SKIP(s);
3340                 }
3341             }
3342             else {
3343                 while (s <= last1) {
3344                     if (regtry(reginfo, &s))
3345                         goto got_it;
3346                     s++;
3347                 }
3348             }
3349         }
3350         DEBUG_EXECUTE_r(if (!did_match) {
3351             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3352                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3353             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3354                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3355                                ? "anchored" : "floating"),
3356                 quoted, RE_SV_TAIL(must));
3357         });
3358         goto phooey;
3359     }
3360     else if ( (c = progi->regstclass) ) {
3361         if (minlen) {
3362             const OPCODE op = OP(progi->regstclass);
3363             /* don't bother with what can't match */
3364             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3365                 strend = HOPc(strend, -(minlen - 1));
3366         }
3367         DEBUG_EXECUTE_r({
3368             SV * const prop = sv_newmortal();
3369             regprop(prog, prop, c, reginfo, NULL);
3370             {
3371                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3372                     s,strend-s,60);
3373                 Perl_re_printf( aTHX_
3374                     "Matching stclass %.*s against %s (%d bytes)\n",
3375                     (int)SvCUR(prop), SvPVX_const(prop),
3376                      quoted, (int)(strend - s));
3377             }
3378         });
3379         if (find_byclass(prog, c, s, strend, reginfo))
3380             goto got_it;
3381         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3382     }
3383     else {
3384         dontbother = 0;
3385         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3386             /* Trim the end. */
3387             char *last= NULL;
3388             SV* float_real;
3389             STRLEN len;
3390             const char *little;
3391
3392             if (utf8_target) {
3393                 if (! prog->float_utf8) {
3394                     to_utf8_substr(prog);
3395                 }
3396                 float_real = prog->float_utf8;
3397             }
3398             else {
3399                 if (! prog->float_substr) {
3400                     if (! to_byte_substr(prog)) {
3401                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3402                     }
3403                 }
3404                 float_real = prog->float_substr;
3405             }
3406
3407             little = SvPV_const(float_real, len);
3408             if (SvTAIL(float_real)) {
3409                     /* This means that float_real contains an artificial \n on
3410                      * the end due to the presence of something like this:
3411                      * /foo$/ where we can match both "foo" and "foo\n" at the
3412                      * end of the string.  So we have to compare the end of the
3413                      * string first against the float_real without the \n and
3414                      * then against the full float_real with the string.  We
3415                      * have to watch out for cases where the string might be
3416                      * smaller than the float_real or the float_real without
3417                      * the \n. */
3418                     char *checkpos= strend - len;
3419                     DEBUG_OPTIMISE_r(
3420                         Perl_re_printf( aTHX_
3421                             "%sChecking for float_real.%s\n",
3422                             PL_colors[4], PL_colors[5]));
3423                     if (checkpos + 1 < strbeg) {
3424                         /* can't match, even if we remove the trailing \n
3425                          * string is too short to match */
3426                         DEBUG_EXECUTE_r(
3427                             Perl_re_printf( aTHX_
3428                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3429                                 PL_colors[4], PL_colors[5]));
3430                         goto phooey;
3431                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3432                         /* can match, the end of the string matches without the
3433                          * "\n" */
3434                         last = checkpos + 1;
3435                     } else if (checkpos < strbeg) {
3436                         /* cant match, string is too short when the "\n" is
3437                          * included */
3438                         DEBUG_EXECUTE_r(
3439                             Perl_re_printf( aTHX_
3440                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3441                                 PL_colors[4], PL_colors[5]));
3442                         goto phooey;
3443                     } else if (!multiline) {
3444                         /* non multiline match, so compare with the "\n" at the
3445                          * end of the string */
3446                         if (memEQ(checkpos, little, len)) {
3447                             last= checkpos;
3448                         } else {
3449                             DEBUG_EXECUTE_r(
3450                                 Perl_re_printf( aTHX_
3451                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3452                                     PL_colors[4], PL_colors[5]));
3453                             goto phooey;
3454                         }
3455                     } else {
3456                         /* multiline match, so we have to search for a place
3457                          * where the full string is located */
3458                         goto find_last;
3459                     }
3460             } else {
3461                   find_last:
3462                     if (len)
3463                         last = rninstr(s, strend, little, little + len);
3464                     else
3465                         last = strend;  /* matching "$" */
3466             }
3467             if (!last) {
3468                 /* at one point this block contained a comment which was
3469                  * probably incorrect, which said that this was a "should not
3470                  * happen" case.  Even if it was true when it was written I am
3471                  * pretty sure it is not anymore, so I have removed the comment
3472                  * and replaced it with this one. Yves */
3473                 DEBUG_EXECUTE_r(
3474                     Perl_re_printf( aTHX_
3475                         "%sString does not contain required substring, cannot match.%s\n",
3476                         PL_colors[4], PL_colors[5]
3477                     ));
3478                 goto phooey;
3479             }
3480             dontbother = strend - last + prog->float_min_offset;
3481         }
3482         if (minlen && (dontbother < minlen))
3483             dontbother = minlen - 1;
3484         strend -= dontbother;              /* this one's always in bytes! */
3485         /* We don't know much -- general case. */
3486         if (utf8_target) {
3487             for (;;) {
3488                 if (regtry(reginfo, &s))
3489                     goto got_it;
3490                 if (s >= strend)
3491                     break;
3492                 s += UTF8SKIP(s);
3493             };
3494         }
3495         else {
3496             do {
3497                 if (regtry(reginfo, &s))
3498                     goto got_it;
3499             } while (s++ < strend);
3500         }
3501     }
3502
3503     /* Failure. */
3504     goto phooey;
3505
3506   got_it:
3507     /* s/// doesn't like it if $& is earlier than where we asked it to
3508      * start searching (which can happen on something like /.\G/) */
3509     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3510             && (prog->offs[0].start < stringarg - strbeg))
3511     {
3512         /* this should only be possible under \G */
3513         assert(prog->intflags & PREGf_GPOS_SEEN);
3514         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3515             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3516         goto phooey;
3517     }
3518
3519     DEBUG_BUFFERS_r(
3520         if (swap)
3521             Perl_re_exec_indentf( aTHX_
3522                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3523                 0,
3524                 PTR2UV(prog),
3525                 PTR2UV(swap)
3526             );
3527     );
3528     Safefree(swap);
3529
3530     /* clean up; this will trigger destructors that will free all slabs
3531      * above the current one, and cleanup the regmatch_info_aux
3532      * and regmatch_info_aux_eval sructs */
3533
3534     LEAVE_SCOPE(oldsave);
3535
3536     if (RXp_PAREN_NAMES(prog))
3537         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3538
3539     /* make sure $`, $&, $', and $digit will work later */
3540     if ( !(flags & REXEC_NOT_FIRST) )
3541         S_reg_set_capture_string(aTHX_ rx,
3542                                     strbeg, reginfo->strend,
3543                                     sv, flags, utf8_target);
3544
3545     return 1;
3546
3547   phooey:
3548     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3549                           PL_colors[4], PL_colors[5]));
3550
3551     /* clean up; this will trigger destructors that will free all slabs
3552      * above the current one, and cleanup the regmatch_info_aux
3553      * and regmatch_info_aux_eval sructs */
3554
3555     LEAVE_SCOPE(oldsave);
3556
3557     if (swap) {
3558         /* we failed :-( roll it back */
3559         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3560             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3561             0,
3562             PTR2UV(prog),
3563             PTR2UV(prog->offs),
3564             PTR2UV(swap)
3565         ));
3566         Safefree(prog->offs);
3567         prog->offs = swap;
3568     }
3569     return 0;
3570 }
3571
3572
3573 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3574  * Do inc before dec, in case old and new rex are the same */
3575 #define SET_reg_curpm(Re2)                          \
3576     if (reginfo->info_aux_eval) {                   \
3577         (void)ReREFCNT_inc(Re2);                    \
3578         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3579         PM_SETRE((PL_reg_curpm), (Re2));            \
3580     }
3581
3582
3583 /*
3584  - regtry - try match at specific point
3585  */
3586 STATIC bool                     /* 0 failure, 1 success */
3587 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3588 {
3589     CHECKPOINT lastcp;
3590     REGEXP *const rx = reginfo->prog;
3591     regexp *const prog = ReANY(rx);
3592     SSize_t result;
3593 #ifdef DEBUGGING
3594     U32 depth = 0; /* used by REGCP_SET */
3595 #endif
3596     RXi_GET_DECL(prog,progi);
3597     GET_RE_DEBUG_FLAGS_DECL;
3598
3599     PERL_ARGS_ASSERT_REGTRY;
3600
3601     reginfo->cutpoint=NULL;
3602
3603     prog->offs[0].start = *startposp - reginfo->strbeg;
3604     prog->lastparen = 0;
3605     prog->lastcloseparen = 0;
3606
3607     /* XXXX What this code is doing here?!!!  There should be no need
3608        to do this again and again, prog->lastparen should take care of
3609        this!  --ilya*/
3610
3611     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3612      * Actually, the code in regcppop() (which Ilya may be meaning by
3613      * prog->lastparen), is not needed at all by the test suite
3614      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3615      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3616      * Meanwhile, this code *is* needed for the
3617      * above-mentioned test suite tests to succeed.  The common theme
3618      * on those tests seems to be returning null fields from matches.
3619      * --jhi updated by dapm */
3620
3621     /* After encountering a variant of the issue mentioned above I think
3622      * the point Ilya was making is that if we properly unwind whenever
3623      * we set lastparen to a smaller value then we should not need to do
3624      * this every time, only when needed. So if we have tests that fail if
3625      * we remove this, then it suggests somewhere else we are improperly
3626      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3627      * places it is called, and related regcp() routines. - Yves */
3628 #if 1
3629     if (prog->nparens) {
3630         regexp_paren_pair *pp = prog->offs;
3631         I32 i;
3632         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3633             ++pp;
3634             pp->start = -1;
3635             pp->end = -1;
3636         }
3637     }
3638 #endif
3639     REGCP_SET(lastcp);
3640     result = regmatch(reginfo, *startposp, progi->program + 1);
3641     if (result != -1) {
3642         prog->offs[0].end = result;
3643         return 1;
3644     }
3645     if (reginfo->cutpoint)
3646         *startposp= reginfo->cutpoint;
3647     REGCP_UNWIND(lastcp);
3648     return 0;
3649 }
3650
3651
3652 #define sayYES goto yes
3653 #define sayNO goto no
3654 #define sayNO_SILENT goto no_silent
3655
3656 /* we dont use STMT_START/END here because it leads to
3657    "unreachable code" warnings, which are bogus, but distracting. */
3658 #define CACHEsayNO \
3659     if (ST.cache_mask) \
3660        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3661     sayNO
3662
3663 /* this is used to determine how far from the left messages like
3664    'failed...' are printed in regexec.c. It should be set such that
3665    messages are inline with the regop output that created them.
3666 */
3667 #define REPORT_CODE_OFF 29
3668 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3669 #ifdef DEBUGGING
3670 int
3671 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3672 {
3673     va_list ap;
3674     int result;
3675     PerlIO *f= Perl_debug_log;
3676     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3677     va_start(ap, depth);
3678     PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3679     result = PerlIO_vprintf(f, fmt, ap);
3680     va_end(ap);
3681     return result;
3682 }
3683 #endif /* DEBUGGING */
3684
3685
3686 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3687 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3688 #define CHRTEST_NOT_A_CP_1 -999
3689 #define CHRTEST_NOT_A_CP_2 -998
3690
3691 /* grab a new slab and return the first slot in it */
3692
3693 STATIC regmatch_state *
3694 S_push_slab(pTHX)
3695 {
3696 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3697     dMY_CXT;
3698 #endif
3699     regmatch_slab *s = PL_regmatch_slab->next;
3700     if (!s) {
3701         Newx(s, 1, regmatch_slab);
3702         s->prev = PL_regmatch_slab;
3703         s->next = NULL;
3704         PL_regmatch_slab->next = s;
3705     }
3706     PL_regmatch_slab = s;
3707     return SLAB_FIRST(s);
3708 }
3709
3710
3711 /* push a new state then goto it */
3712
3713 #define PUSH_STATE_GOTO(state, node, input) \
3714     pushinput = input; \
3715     scan = node; \
3716     st->resume_state = state; \
3717     goto push_state;
3718
3719 /* push a new state with success backtracking, then goto it */
3720
3721 #define PUSH_YES_STATE_GOTO(state, node, input) \
3722     pushinput = input; \
3723     scan = node; \
3724     st->resume_state = state; \
3725     goto push_yes_state;
3726
3727
3728
3729
3730 /*
3731
3732 regmatch() - main matching routine
3733
3734 This is basically one big switch statement in a loop. We execute an op,
3735 set 'next' to point the next op, and continue. If we come to a point which
3736 we may need to backtrack to on failure such as (A|B|C), we push a
3737 backtrack state onto the backtrack stack. On failure, we pop the top
3738 state, and re-enter the loop at the state indicated. If there are no more
3739 states to pop, we return failure.
3740
3741 Sometimes we also need to backtrack on success; for example /A+/, where
3742 after successfully matching one A, we need to go back and try to
3743 match another one; similarly for lookahead assertions: if the assertion
3744 completes successfully, we backtrack to the state just before the assertion
3745 and then carry on.  In these cases, the pushed state is marked as
3746 'backtrack on success too'. This marking is in fact done by a chain of
3747 pointers, each pointing to the previous 'yes' state. On success, we pop to
3748 the nearest yes state, discarding any intermediate failure-only states.
3749 Sometimes a yes state is pushed just to force some cleanup code to be
3750 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3751 it to free the inner regex.
3752
3753 Note that failure backtracking rewinds the cursor position, while
3754 success backtracking leaves it alone.
3755
3756 A pattern is complete when the END op is executed, while a subpattern
3757 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3758 ops trigger the "pop to last yes state if any, otherwise return true"
3759 behaviour.
3760
3761 A common convention in this function is to use A and B to refer to the two
3762 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3763 the subpattern to be matched possibly multiple times, while B is the entire
3764 rest of the pattern. Variable and state names reflect this convention.
3765
3766 The states in the main switch are the union of ops and failure/success of
3767 substates associated with with that op.  For example, IFMATCH is the op
3768 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3769 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3770 successfully matched A and IFMATCH_A_fail is a state saying that we have
3771 just failed to match A. Resume states always come in pairs. The backtrack
3772 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3773 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3774 on success or failure.
3775
3776 The struct that holds a backtracking state is actually a big union, with
3777 one variant for each major type of op. The variable st points to the
3778 top-most backtrack struct. To make the code clearer, within each
3779 block of code we #define ST to alias the relevant union.
3780
3781 Here's a concrete example of a (vastly oversimplified) IFMATCH
3782 implementation:
3783
3784     switch (state) {
3785     ....
3786
3787 #define ST st->u.ifmatch
3788
3789     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3790         ST.foo = ...; // some state we wish to save
3791         ...
3792         // push a yes backtrack state with a resume value of
3793         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3794         // first node of A:
3795         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3796         // NOTREACHED
3797
3798     case IFMATCH_A: // we have successfully executed A; now continue with B
3799         next = B;
3800         bar = ST.foo; // do something with the preserved value
3801         break;
3802
3803     case IFMATCH_A_fail: // A failed, so the assertion failed
3804         ...;   // do some housekeeping, then ...
3805         sayNO; // propagate the failure
3806
3807 #undef ST
3808
3809     ...
3810     }
3811
3812 For any old-timers reading this who are familiar with the old recursive
3813 approach, the code above is equivalent to:
3814
3815     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3816     {
3817         int foo = ...
3818         ...
3819         if (regmatch(A)) {
3820             next = B;
3821             bar = foo;
3822             break;
3823         }
3824         ...;   // do some housekeeping, then ...
3825         sayNO; // propagate the failure
3826     }
3827
3828 The topmost backtrack state, pointed to by st, is usually free. If you
3829 want to claim it, populate any ST.foo fields in it with values you wish to
3830 save, then do one of
3831
3832         PUSH_STATE_GOTO(resume_state, node, newinput);
3833         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3834
3835 which sets that backtrack state's resume value to 'resume_state', pushes a
3836 new free entry to the top of the backtrack stack, then goes to 'node'.
3837 On backtracking, the free slot is popped, and the saved state becomes the
3838 new free state. An ST.foo field in this new top state can be temporarily
3839 accessed to retrieve values, but once the main loop is re-entered, it
3840 becomes available for reuse.
3841
3842 Note that the depth of the backtrack stack constantly increases during the
3843 left-to-right execution of the pattern, rather than going up and down with
3844 the pattern nesting. For example the stack is at its maximum at Z at the
3845 end of the pattern, rather than at X in the following:
3846
3847     /(((X)+)+)+....(Y)+....Z/
3848
3849 The only exceptions to this are lookahead/behind assertions and the cut,
3850 (?>A), which pop all the backtrack states associated with A before
3851 continuing.
3852
3853 Backtrack state structs are allocated in slabs of about 4K in size.
3854 PL_regmatch_state and st always point to the currently active state,
3855 and PL_regmatch_slab points to the slab currently containing
3856 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3857 allocated, and is never freed until interpreter destruction. When the slab
3858 is full, a new one is allocated and chained to the end. At exit from
3859 regmatch(), slabs allocated since entry are freed.
3860
3861 */
3862
3863
3864 #define DEBUG_STATE_pp(pp)                                  \
3865     DEBUG_STATE_r({                                         \
3866         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
3867         Perl_re_printf( aTHX_                                           \
3868             "%*s" pp " %s%s%s%s%s\n",                       \
3869             INDENT_CHARS(depth), "",                        \
3870             PL_reg_name[st->resume_state],                  \
3871             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3872             ((st==yes_state) ? "Y" : ""),                   \
3873             ((st==mark_state) ? "M" : ""),                  \
3874             ((st==yes_state||st==mark_state) ? "]" : "")    \
3875         );                                                  \
3876     });
3877
3878
3879 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3880
3881 #ifdef DEBUGGING
3882
3883 STATIC void
3884 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3885     const char *start, const char *end, const char *blurb)
3886 {
3887     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3888
3889     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3890
3891     if (!PL_colorset)
3892             reginitcolors();
3893     {
3894         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3895             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3896
3897         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3898             start, end - start, 60);
3899
3900         Perl_re_printf( aTHX_
3901             "%s%s REx%s %s against %s\n",
3902                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3903
3904         if (utf8_target||utf8_pat)
3905             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
3906                 utf8_pat ? "pattern" : "",
3907                 utf8_pat && utf8_target ? " and " : "",
3908                 utf8_target ? "string" : ""
3909             );
3910     }
3911 }
3912
3913 STATIC void
3914 S_dump_exec_pos(pTHX_ const char *locinput,
3915                       const regnode *scan,
3916                       const char *loc_regeol,
3917                       const char *loc_bostr,
3918                       const char *loc_reg_starttry,
3919                       const bool utf8_target,
3920                       const U32 depth
3921                 )
3922 {
3923     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3924     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3925     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3926     /* The part of the string before starttry has one color
3927        (pref0_len chars), between starttry and current
3928        position another one (pref_len - pref0_len chars),
3929        after the current position the third one.
3930        We assume that pref0_len <= pref_len, otherwise we
3931        decrease pref0_len.  */
3932     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3933         ? (5 + taill) - l : locinput - loc_bostr;
3934     int pref0_len;
3935
3936     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3937
3938     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3939         pref_len++;
3940     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3941     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3942         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3943               ? (5 + taill) - pref_len : loc_regeol - locinput);
3944     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3945         l--;
3946     if (pref0_len < 0)
3947         pref0_len = 0;
3948     if (pref0_len > pref_len)
3949         pref0_len = pref_len;
3950     {
3951         const int is_uni = utf8_target ? 1 : 0;
3952
3953         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3954             (locinput - pref_len),pref0_len, 60, 4, 5);
3955
3956         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3957                     (locinput - pref_len + pref0_len),
3958                     pref_len - pref0_len, 60, 2, 3);
3959
3960         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3961                     locinput, loc_regeol - locinput, 10, 0, 1);
3962
3963         const STRLEN tlen=len0+len1+len2;
3964         Perl_re_printf( aTHX_
3965                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|%4u| ",
3966                     (IV)(locinput - loc_bostr),
3967                     len0, s0,
3968                     len1, s1,
3969                     (docolor ? "" : "> <"),
3970                     len2, s2,
3971                     (int)(tlen > 19 ? 0 :  19 - tlen),
3972                     "",
3973                     depth);
3974     }
3975 }
3976
3977 #endif
3978
3979 /* reg_check_named_buff_matched()
3980  * Checks to see if a named buffer has matched. The data array of
3981  * buffer numbers corresponding to the buffer is expected to reside
3982  * in the regexp->data->data array in the slot stored in the ARG() of
3983  * node involved. Note that this routine doesn't actually care about the
3984  * name, that information is not preserved from compilation to execution.
3985  * Returns the index of the leftmost defined buffer with the given name
3986  * or 0 if non of the buffers matched.
3987  */
3988 STATIC I32
3989 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3990 {
3991     I32 n;
3992     RXi_GET_DECL(rex,rexi);
3993     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3994     I32 *nums=(I32*)SvPVX(sv_dat);
3995
3996     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3997
3998     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3999         if ((I32)rex->lastparen >= nums[n] &&
4000             rex->offs[nums[n]].end != -1)
4001         {
4002             return nums[n];
4003         }
4004     }
4005     return 0;
4006 }
4007
4008
4009 static bool
4010 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4011         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4012 {
4013     /* This function determines if there are one or two characters that match
4014      * the first character of the passed-in EXACTish node <text_node>, and if
4015      * so, returns them in the passed-in pointers.
4016      *
4017      * If it determines that no possible character in the target string can
4018      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4019      * the first character in <text_node> requires UTF-8 to represent, and the
4020      * target string isn't in UTF-8.)
4021      *
4022      * If there are more than two characters that could match the beginning of
4023      * <text_node>, or if more context is required to determine a match or not,
4024      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4025      *
4026      * The motiviation behind this function is to allow the caller to set up
4027      * tight loops for matching.  If <text_node> is of type EXACT, there is
4028      * only one possible character that can match its first character, and so
4029      * the situation is quite simple.  But things get much more complicated if
4030      * folding is involved.  It may be that the first character of an EXACTFish
4031      * node doesn't participate in any possible fold, e.g., punctuation, so it
4032      * can be matched only by itself.  The vast majority of characters that are
4033      * in folds match just two things, their lower and upper-case equivalents.
4034      * But not all are like that; some have multiple possible matches, or match
4035      * sequences of more than one character.  This function sorts all that out.
4036      *
4037      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4038      * loop of trying to match A*, we know we can't exit where the thing
4039      * following it isn't a B.  And something can't be a B unless it is the
4040      * beginning of B.  By putting a quick test for that beginning in a tight
4041      * loop, we can rule out things that can't possibly be B without having to
4042      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4043      * character, we can make a tight loop matching A*, using the outputs of
4044      * this function.
4045      *
4046      * If the target string to match isn't in UTF-8, and there aren't
4047      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4048      * the one or two possible octets (which are characters in this situation)
4049      * that can match.  In all cases, if there is only one character that can
4050      * match, *<c1p> and *<c2p> will be identical.
4051      *
4052      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4053      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4054      * can match the beginning of <text_node>.  They should be declared with at
4055      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4056      * undefined what these contain.)  If one or both of the buffers are
4057      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4058      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4059      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4060      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4061      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4062
4063     const bool utf8_target = reginfo->is_utf8_target;
4064
4065     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4066     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4067     bool use_chrtest_void = FALSE;
4068     const bool is_utf8_pat = reginfo->is_utf8_pat;
4069
4070     /* Used when we have both utf8 input and utf8 output, to avoid converting
4071      * to/from code points */
4072     bool utf8_has_been_setup = FALSE;
4073
4074     dVAR;
4075
4076     U8 *pat = (U8*)STRING(text_node);
4077     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4078
4079     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4080
4081         /* In an exact node, only one thing can be matched, that first
4082          * character.  If both the pat and the target are UTF-8, we can just
4083          * copy the input to the output, avoiding finding the code point of
4084          * that character */
4085         if (!is_utf8_pat) {
4086             c2 = c1 = *pat;
4087         }
4088         else if (utf8_target) {
4089             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4090             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4091             utf8_has_been_setup = TRUE;
4092         }
4093         else {
4094             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4095         }
4096     }
4097     else { /* an EXACTFish node */
4098         U8 *pat_end = pat + STR_LEN(text_node);
4099
4100         /* An EXACTFL node has at least some characters unfolded, because what
4101          * they match is not known until now.  So, now is the time to fold
4102          * the first few of them, as many as are needed to determine 'c1' and
4103          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4104          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4105          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4106          * need to fold as many characters as a single character can fold to,
4107          * so that later we can check if the first ones are such a multi-char
4108          * fold.  But, in such a pattern only locale-problematic characters
4109          * aren't folded, so we can skip this completely if the first character
4110          * in the node isn't one of the tricky ones */
4111         if (OP(text_node) == EXACTFL) {
4112
4113             if (! is_utf8_pat) {
4114                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4115                 {
4116                     folded[0] = folded[1] = 's';
4117                     pat = folded;
4118                     pat_end = folded + 2;
4119                 }
4120             }
4121             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4122                 U8 *s = pat;
4123                 U8 *d = folded;
4124                 int i;
4125
4126                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4127                     if (isASCII(*s)) {
4128                         *(d++) = (U8) toFOLD_LC(*s);
4129                         s++;
4130                     }
4131                     else {
4132                         STRLEN len;
4133                         _to_utf8_fold_flags(s,
4134                                             d,
4135                                             &len,
4136                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4137                         d += len;
4138                         s += UTF8SKIP(s);
4139                     }
4140                 }
4141
4142                 pat = folded;
4143                 pat_end = d;
4144             }
4145         }
4146
4147         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4148              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4149         {
4150             /* Multi-character folds require more context to sort out.  Also
4151              * PL_utf8_foldclosures used below doesn't handle them, so have to
4152              * be handled outside this routine */
4153             use_chrtest_void = TRUE;
4154         }
4155         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4156             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4157             if (c1 > 255) {
4158                 /* Load the folds hash, if not already done */
4159                 SV** listp;
4160                 if (! PL_utf8_foldclosures) {
4161                     _load_PL_utf8_foldclosures();
4162                 }
4163
4164                 /* The fold closures data structure is a hash with the keys
4165                  * being the UTF-8 of every character that is folded to, like
4166                  * 'k', and the values each an array of all code points that
4167                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4168                  * Multi-character folds are not included */
4169                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4170                                         (char *) pat,
4171                                         UTF8SKIP(pat),
4172                                         FALSE))))
4173                 {
4174                     /* Not found in the hash, therefore there are no folds
4175                     * containing it, so there is only a single character that
4176                     * could match */
4177                     c2 = c1;
4178                 }
4179                 else {  /* Does participate in folds */
4180                     AV* list = (AV*) *listp;
4181                     if (av_tindex_nomg(list) != 1) {
4182
4183                         /* If there aren't exactly two folds to this, it is
4184                          * outside the scope of this function */
4185                         use_chrtest_void = TRUE;
4186                     }
4187                     else {  /* There are two.  Get them */
4188                         SV** c_p = av_fetch(list, 0, FALSE);
4189                         if (c_p == NULL) {
4190                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4191                         }
4192                         c1 = SvUV(*c_p);
4193
4194                         c_p = av_fetch(list, 1, FALSE);
4195                         if (c_p == NULL) {
4196                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4197                         }
4198                         c2 = SvUV(*c_p);
4199
4200                         /* Folds that cross the 255/256 boundary are forbidden
4201                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4202                          * one is ASCIII.  Since the pattern character is above
4203                          * 255, and its only other match is below 256, the only
4204                          * legal match will be to itself.  We have thrown away
4205                          * the original, so have to compute which is the one
4206                          * above 255. */
4207                         if ((c1 < 256) != (c2 < 256)) {
4208                             if ((OP(text_node) == EXACTFL
4209                                  && ! IN_UTF8_CTYPE_LOCALE)
4210                                 || ((OP(text_node) == EXACTFA
4211                                     || OP(text_node) == EXACTFA_NO_TRIE)
4212                                     && (isASCII(c1) || isASCII(c2))))
4213                             {
4214                                 if (c1 < 256) {
4215                                     c1 = c2;
4216                                 }
4217                                 else {
4218                                     c2 = c1;
4219                                 }
4220                             }
4221                         }
4222                     }
4223                 }
4224             }
4225             else /* Here, c1 is <= 255 */
4226                 if (utf8_target
4227                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4228                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4229                     && ((OP(text_node) != EXACTFA
4230                         && OP(text_node) != EXACTFA_NO_TRIE)
4231                         || ! isASCII(c1)))
4232             {
4233                 /* Here, there could be something above Latin1 in the target
4234                  * which folds to this character in the pattern.  All such
4235                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4236                  * than two characters involved in their folds, so are outside
4237                  * the scope of this function */
4238                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4239                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4240                 }
4241                 else {
4242                     use_chrtest_void = TRUE;
4243                 }
4244             }
4245             else { /* Here nothing above Latin1 can fold to the pattern
4246                       character */
4247                 switch (OP(text_node)) {
4248
4249                     case EXACTFL:   /* /l rules */
4250                         c2 = PL_fold_locale[c1];
4251                         break;
4252
4253                     case EXACTF:   /* This node only generated for non-utf8
4254                                     patterns */
4255                         assert(! is_utf8_pat);
4256                         if (! utf8_target) {    /* /d rules */
4257                             c2 = PL_fold[c1];
4258                             break;
4259                         }
4260                         /* FALLTHROUGH */
4261                         /* /u rules for all these.  This happens to work for
4262                         * EXACTFA as nothing in Latin1 folds to ASCII */
4263                     case EXACTFA_NO_TRIE:   /* This node only generated for
4264                                             non-utf8 patterns */
4265                         assert(! is_utf8_pat);
4266                         /* FALLTHROUGH */
4267                     case EXACTFA:
4268                     case EXACTFU_SS:
4269                     case EXACTFU:
4270                         c2 = PL_fold_latin1[c1];
4271                         break;
4272
4273                     default:
4274                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4275                         NOT_REACHED; /* NOTREACHED */
4276                 }
4277             }
4278         }
4279     }
4280
4281     /* Here have figured things out.  Set up the returns */
4282     if (use_chrtest_void) {
4283         *c2p = *c1p = CHRTEST_VOID;
4284     }
4285     else if (utf8_target) {
4286         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4287             uvchr_to_utf8(c1_utf8, c1);
4288             uvchr_to_utf8(c2_utf8, c2);
4289         }
4290
4291         /* Invariants are stored in both the utf8 and byte outputs; Use
4292          * negative numbers otherwise for the byte ones.  Make sure that the
4293          * byte ones are the same iff the utf8 ones are the same */
4294         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4295         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4296                 ? *c2_utf8
4297                 : (c1 == c2)
4298                   ? CHRTEST_NOT_A_CP_1
4299                   : CHRTEST_NOT_A_CP_2;
4300     }
4301     else if (c1 > 255) {
4302        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4303                            can represent */
4304            return FALSE;
4305        }
4306
4307        *c1p = *c2p = c2;    /* c2 is the only representable value */
4308     }
4309     else {  /* c1 is representable; see about c2 */
4310        *c1p = c1;
4311        *c2p = (c2 < 256) ? c2 : c1;
4312     }
4313
4314     return TRUE;
4315 }
4316
4317 STATIC bool
4318 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4319 {
4320     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4321      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4322
4323     PERL_ARGS_ASSERT_ISGCB;
4324
4325     switch (GCB_table[before][after]) {
4326         case GCB_BREAKABLE:
4327             return TRUE;
4328
4329         case GCB_NOBREAK:
4330             return FALSE;
4331
4332         case GCB_RI_then_RI:
4333             {
4334                 int RI_count = 1;
4335                 U8 * temp_pos = (U8 *) curpos;
4336
4337                 /* Do not break within emoji flag sequences. That is, do not
4338                  * break between regional indicator (RI) symbols if there is an
4339                  * odd number of RI characters before the break point.
4340                  *  GB12     ^ (RI RI)* RI × RI
4341                  *  GB13 [^RI] (RI RI)* RI × RI */
4342
4343                 while (backup_one_GCB(strbeg,
4344                                     &temp_pos,
4345                                     utf8_target) == GCB_Regional_Indicator)
4346                 {
4347                     RI_count++;
4348                 }
4349
4350                 return RI_count % 2 != 1;
4351             }
4352
4353         case GCB_EX_then_EM:
4354
4355             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4356             {
4357                 U8 * temp_pos = (U8 *) curpos;
4358                 GCB_enum prev;
4359
4360                 do {
4361                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4362                 }
4363                 while (prev == GCB_Extend);
4364
4365                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4366             }
4367
4368         default:
4369             break;
4370     }
4371
4372 #ifdef DEBUGGING
4373     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4374                                   before, after, GCB_table[before][after]);
4375     assert(0);
4376 #endif
4377     return TRUE;
4378 }
4379
4380 STATIC GCB_enum
4381 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4382 {
4383     GCB_enum gcb;
4384
4385     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4386
4387     if (*curpos < strbeg) {
4388         return GCB_EDGE;
4389     }
4390
4391     if (utf8_target) {
4392         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4393         U8 * prev_prev_char_pos;
4394
4395         if (! prev_char_pos) {
4396             return GCB_EDGE;
4397         }
4398
4399         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4400             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4401             *curpos = prev_char_pos;
4402             prev_char_pos = prev_prev_char_pos;
4403         }
4404         else {
4405             *curpos = (U8 *) strbeg;
4406             return GCB_EDGE;
4407         }
4408     }
4409     else {
4410         if (*curpos - 2 < strbeg) {
4411             *curpos = (U8 *) strbeg;
4412             return GCB_EDGE;
4413         }
4414         (*curpos)--;
4415         gcb = getGCB_VAL_CP(*(*curpos - 1));
4416     }
4417
4418     return gcb;
4419 }
4420
4421 /* Combining marks attach to most classes that precede them, but this defines
4422  * the exceptions (from TR14) */
4423 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4424                                      || prev == LB_Mandatory_Break      \
4425                                      || prev == LB_Carriage_Return      \
4426                                      || prev == LB_Line_Feed            \
4427                                      || prev == LB_Next_Line            \
4428                                      || prev == LB_Space                \
4429                                      || prev == LB_ZWSpace))
4430
4431 STATIC bool
4432 S_isLB(pTHX_ LB_enum before,
4433              LB_enum after,
4434              const U8 * const strbeg,
4435              const U8 * const curpos,
4436              const U8 * const strend,
4437              const bool utf8_target)
4438 {
4439     U8 * temp_pos = (U8 *) curpos;
4440     LB_enum prev = before;
4441
4442     /* Is the boundary between 'before' and 'after' line-breakable?
4443      * Most of this is just a table lookup of a generated table from Unicode
4444      * rules.  But some rules require context to decide, and so have to be
4445      * implemented in code */
4446
4447     PERL_ARGS_ASSERT_ISLB;
4448
4449     /* Rule numbers in the comments below are as of Unicode 9.0 */
4450
4451   redo:
4452     before = prev;
4453     switch (LB_table[before][after]) {
4454         case LB_BREAKABLE:
4455             return TRUE;
4456
4457         case LB_NOBREAK:
4458         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4459             return FALSE;
4460
4461         case LB_SP_foo + LB_BREAKABLE:
4462         case LB_SP_foo + LB_NOBREAK:
4463         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4464
4465             /* When we have something following a SP, we have to look at the
4466              * context in order to know what to do.
4467              *
4468              * SP SP should not reach here because LB7: Do not break before
4469              * spaces.  (For two spaces in a row there is nothing that
4470              * overrides that) */
4471             assert(after != LB_Space);
4472
4473             /* Here we have a space followed by a non-space.  Mostly this is a
4474              * case of LB18: "Break after spaces".  But there are complications
4475              * as the handling of spaces is somewhat tricky.  They are in a
4476              * number of rules, which have to be applied in priority order, but
4477              * something earlier in the string can cause a rule to be skipped
4478              * and a lower priority rule invoked.  A prime example is LB7 which
4479              * says don't break before a space.  But rule LB8 (lower priority)
4480              * says that the first break opportunity after a ZW is after any
4481              * span of spaces immediately after it.  If a ZW comes before a SP
4482              * in the input, rule LB8 applies, and not LB7.  Other such rules
4483              * involve combining marks which are rules 9 and 10, but they may
4484              * override higher priority rules if they come earlier in the
4485              * string.  Since we're doing random access into the middle of the
4486              * string, we have to look for rules that should get applied based
4487              * on both string position and priority.  Combining marks do not
4488              * attach to either ZW nor SP, so we don't have to consider them
4489              * until later.
4490              *
4491              * To check for LB8, we have to find the first non-space character
4492              * before this span of spaces */
4493             do {
4494                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4495             }
4496             while (prev == LB_Space);
4497
4498             /* LB8 Break before any character following a zero-width space,
4499              * even if one or more spaces intervene.
4500              *      ZW SP* ÷
4501              * So if we have a ZW just before this span, and to get here this
4502              * is the final space in the span. */
4503             if (prev == LB_ZWSpace) {
4504                 return TRUE;
4505             }
4506
4507             /* Here, not ZW SP+.  There are several rules that have higher
4508              * priority than LB18 and can be resolved now, as they don't depend
4509              * on anything earlier in the string (except ZW, which we have
4510              * already handled).  One of these rules is LB11 Do not break
4511              * before Word joiner, but we have specially encoded that in the
4512              * lookup table so it is caught by the single test below which
4513              * catches the other ones. */
4514             if (LB_table[LB_Space][after] - LB_SP_foo
4515                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4516             {
4517                 return FALSE;
4518             }
4519
4520             /* If we get here, we have to XXX consider combining marks. */
4521             if (prev == LB_Combining_Mark) {
4522
4523                 /* What happens with these depends on the character they
4524                  * follow.  */
4525                 do {
4526                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4527                 }
4528                 while (prev == LB_Combining_Mark);
4529
4530                 /* Most times these attach to and inherit the characteristics
4531                  * of that character, but not always, and when not, they are to
4532                  * be treated as AL by rule LB10. */
4533                 if (! LB_CM_ATTACHES_TO(prev)) {
4534                     prev = LB_Alphabetic;
4535                 }
4536             }
4537
4538             /* Here, we have the character preceding the span of spaces all set
4539              * up.  We follow LB18: "Break after spaces" unless the table shows
4540              * that is overriden */
4541             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4542
4543         case LB_CM_ZWJ_foo:
4544
4545             /* We don't know how to treat the CM except by looking at the first
4546              * non-CM character preceding it.  ZWJ is treated as CM */
4547             do {
4548                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4549             }
4550             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4551
4552             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4553              * attatches to it, then it inherits the behavior of 'prev'.  If it
4554              * doesn't attach, it is to be treated as an AL */
4555             if (! LB_CM_ATTACHES_TO(prev)) {
4556                 prev = LB_Alphabetic;
4557             }
4558
4559             goto redo;
4560
4561         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4562         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4563
4564             /* LB21a Don't break after Hebrew + Hyphen.
4565              * HL (HY | BA) × */
4566
4567             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4568                                                           == LB_Hebrew_Letter)
4569             {
4570                 return FALSE;
4571             }
4572
4573             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4574
4575         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4576         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4577
4578             /* LB25a (PR | PO) × ( OP | HY )? NU */
4579             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4580                 return FALSE;
4581             }
4582
4583             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4584                                                                 == LB_BREAKABLE;
4585
4586         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4587         case LB_SY_or_IS_then_various + LB_NOBREAK:
4588         {
4589             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4590
4591             LB_enum temp = prev;
4592             do {
4593                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4594             }
4595             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4596             if (temp == LB_Numeric) {
4597                 return FALSE;
4598             }
4599
4600             return LB_table[prev][after] - LB_SY_or_IS_then_various
4601                                                                == LB_BREAKABLE;
4602         }
4603
4604         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4605         case LB_various_then_PO_or_PR + LB_NOBREAK:
4606         {
4607             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4608
4609             LB_enum temp = prev;
4610             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4611             {
4612                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4613             }
4614             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4615                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4616             }
4617             if (temp == LB_Numeric) {
4618                 return FALSE;
4619             }
4620             return LB_various_then_PO_or_PR;
4621         }
4622
4623         case LB_RI_then_RI + LB_NOBREAK:
4624         case LB_RI_then_RI + LB_BREAKABLE:
4625             {
4626                 int RI_count = 1;
4627
4628                 /* LB30a Break between two regional indicator symbols if and
4629                  * only if there are an even number of regional indicators
4630                  * preceding the position of the break.
4631                  *
4632                  *  sot (RI RI)* RI × RI
4633                  *  [^RI] (RI RI)* RI × RI */
4634
4635                 while (backup_one_LB(strbeg,
4636                                      &temp_pos,
4637                                      utf8_target) == LB_Regional_Indicator)
4638                 {
4639                     RI_count++;
4640                 }
4641
4642                 return RI_count % 2 == 0;
4643             }
4644
4645         default:
4646             break;
4647     }
4648
4649 #ifdef DEBUGGING
4650     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4651                                   before, after, LB_table[before][after]);
4652     assert(0);
4653 #endif
4654     return TRUE;
4655 }
4656
4657 STATIC LB_enum
4658 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4659 {
4660     LB_enum lb;
4661
4662     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4663
4664     if (*curpos >= strend) {
4665         return LB_EDGE;
4666     }
4667
4668     if (utf8_target) {
4669         *curpos += UTF8SKIP(*curpos);
4670         if (*curpos >= strend) {
4671             return LB_EDGE;
4672         }
4673         lb = getLB_VAL_UTF8(*curpos, strend);
4674     }
4675     else {
4676         (*curpos)++;
4677         if (*curpos >= strend) {
4678             return LB_EDGE;
4679         }
4680         lb = getLB_VAL_CP(**curpos);
4681     }
4682
4683     return lb;
4684 }
4685
4686 STATIC LB_enum
4687 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4688 {
4689     LB_enum lb;
4690
4691     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4692
4693     if (*curpos < strbeg) {
4694         return LB_EDGE;
4695     }
4696
4697     if (utf8_target) {
4698         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4699         U8 * prev_prev_char_pos;
4700
4701         if (! prev_char_pos) {
4702             return LB_EDGE;
4703         }
4704
4705         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4706             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4707             *curpos = prev_char_pos;
4708             prev_char_pos = prev_prev_char_pos;
4709         }
4710         else {
4711             *curpos = (U8 *) strbeg;
4712             return LB_EDGE;
4713         }
4714     }
4715     else {
4716         if (*curpos - 2 < strbeg) {
4717             *curpos = (U8 *) strbeg;
4718             return LB_EDGE;
4719         }
4720         (*curpos)--;
4721         lb = getLB_VAL_CP(*(*curpos - 1));
4722     }
4723
4724     return lb;
4725 }
4726
4727 STATIC bool
4728 S_isSB(pTHX_ SB_enum before,
4729              SB_enum after,
4730              const U8 * const strbeg,
4731              const U8 * const curpos,
4732              const U8 * const strend,
4733              const bool utf8_target)
4734 {
4735     /* returns a boolean indicating if there is a Sentence Boundary Break
4736      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4737
4738     U8 * lpos = (U8 *) curpos;
4739     bool has_para_sep = FALSE;
4740     bool has_sp = FALSE;
4741
4742     PERL_ARGS_ASSERT_ISSB;
4743
4744     /* Break at the start and end of text.
4745         SB1.  sot  ÷
4746         SB2.  ÷  eot
4747       But unstated in Unicode is don't break if the text is empty */
4748     if (before == SB_EDGE || after == SB_EDGE) {
4749         return before != after;
4750     }
4751
4752     /* SB 3: Do not break within CRLF. */
4753     if (before == SB_CR && after == SB_LF) {
4754         return FALSE;
4755     }
4756
4757     /* Break after paragraph separators.  CR and LF are considered
4758      * so because Unicode views text as like word processing text where there
4759      * are no newlines except between paragraphs, and the word processor takes
4760      * care of wrapping without there being hard line-breaks in the text *./
4761        SB4.  Sep | CR | LF  ÷ */
4762     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4763         return TRUE;
4764     }
4765
4766     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4767      * (See Section 6.2, Replacing Ignore Rules.)
4768         SB5.  X (Extend | Format)*  →  X */
4769     if (after == SB_Extend || after == SB_Format) {
4770
4771         /* Implied is that the these characters attach to everything
4772          * immediately prior to them except for those separator-type
4773          * characters.  And the rules earlier have already handled the case
4774          * when one of those immediately precedes the extend char */
4775         return FALSE;
4776     }
4777
4778     if (before == SB_Extend || before == SB_Format) {
4779         U8 * temp_pos = lpos;
4780         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4781         if (   backup != SB_EDGE
4782             && backup != SB_Sep
4783             && backup != SB_CR
4784             && backup != SB_LF)
4785         {
4786             before = backup;
4787             lpos = temp_pos;
4788         }
4789
4790         /* Here, both 'before' and 'backup' are these types; implied is that we
4791          * don't break between them */
4792         if (backup == SB_Extend || backup == SB_Format) {
4793             return FALSE;
4794         }
4795     }
4796
4797     /* Do not break after ambiguous terminators like period, if they are
4798      * immediately followed by a number or lowercase letter, if they are
4799      * between uppercase letters, if the first following letter (optionally
4800      * after certain punctuation) is lowercase, or if they are followed by
4801      * "continuation" punctuation such as comma, colon, or semicolon. For
4802      * example, a period may be an abbreviation or numeric period, and thus may
4803      * not mark the end of a sentence.
4804
4805      * SB6. ATerm  ×  Numeric */
4806     if (before == SB_ATerm && after == SB_Numeric) {
4807         return FALSE;
4808     }
4809
4810     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4811     if (before == SB_ATerm && after == SB_Upper) {
4812         U8 * temp_pos = lpos;
4813         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4814         if (backup == SB_Upper || backup == SB_Lower) {
4815             return FALSE;
4816         }
4817     }
4818
4819     /* The remaining rules that aren't the final one, all require an STerm or
4820      * an ATerm after having backed up over some Close* Sp*, and in one case an
4821      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4822      * So do that backup now, setting flags if either Sp or a paragraph
4823      * separator are found */
4824
4825     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4826         has_para_sep = TRUE;
4827         before = backup_one_SB(strbeg, &lpos, utf8_target);
4828     }
4829
4830     if (before == SB_Sp) {
4831         has_sp = TRUE;
4832         do {
4833             before = backup_one_SB(strbeg, &lpos, utf8_target);
4834         }
4835         while (before == SB_Sp);
4836     }
4837
4838     while (before == SB_Close) {
4839         before = backup_one_SB(strbeg, &lpos, utf8_target);
4840     }
4841
4842     /* The next few rules apply only when the backed-up-to is an ATerm, and in
4843      * most cases an STerm */
4844     if (before == SB_STerm || before == SB_ATerm) {
4845
4846         /* So, here the lhs matches
4847          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4848          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4849          * The rules that apply here are:
4850          *
4851          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
4852                                            | LF | STerm | ATerm) )* Lower
4853            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4854            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
4855            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
4856            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
4857          */
4858
4859         /* And all but SB11 forbid having seen a paragraph separator */
4860         if (! has_para_sep) {
4861             if (before == SB_ATerm) {          /* SB8 */
4862                 U8 * rpos = (U8 *) curpos;
4863                 SB_enum later = after;
4864
4865                 while (    later != SB_OLetter
4866                         && later != SB_Upper
4867                         && later != SB_Lower
4868                         && later != SB_Sep
4869                         && later != SB_CR
4870                         && later != SB_LF
4871                         && later != SB_STerm
4872                         && later != SB_ATerm
4873                         && later != SB_EDGE)
4874                 {
4875                     later = advance_one_SB(&rpos, strend, utf8_target);
4876                 }
4877                 if (later == SB_Lower) {
4878                     return FALSE;
4879                 }
4880             }
4881
4882             if (   after == SB_SContinue    /* SB8a */
4883                 || after == SB_STerm
4884                 || after == SB_ATerm)
4885             {
4886                 return FALSE;
4887             }
4888
4889             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
4890                 if (   after == SB_Close
4891                     || after == SB_Sp
4892                     || after == SB_Sep
4893                     || after == SB_CR
4894                     || after == SB_LF)
4895                 {
4896                     return FALSE;
4897                 }
4898             }
4899
4900             /* SB10.  This and SB9 could probably be combined some way, but khw
4901              * has decided to follow the Unicode rule book precisely for
4902              * simplified maintenance */
4903             if (   after == SB_Sp
4904                 || after == SB_Sep
4905                 || after == SB_CR
4906                 || after == SB_LF)
4907             {
4908                 return FALSE;
4909             }
4910         }
4911
4912         /* SB11.  */
4913         return TRUE;
4914     }
4915
4916     /* Otherwise, do not break.
4917     SB12.  Any  ×  Any */
4918
4919     return FALSE;
4920 }
4921
4922 STATIC SB_enum
4923 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4924 {
4925     SB_enum sb;
4926
4927     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4928
4929     if (*curpos >= strend) {
4930         return SB_EDGE;
4931     }
4932
4933     if (utf8_target) {
4934         do {
4935             *curpos += UTF8SKIP(*curpos);
4936             if (*curpos >= strend) {
4937                 return SB_EDGE;
4938             }
4939             sb = getSB_VAL_UTF8(*curpos, strend);
4940         } while (sb == SB_Extend || sb == SB_Format);
4941     }
4942     else {
4943         do {
4944             (*curpos)++;
4945             if (*curpos >= strend) {
4946                 return SB_EDGE;
4947             }
4948             sb = getSB_VAL_CP(**curpos);
4949         } while (sb == SB_Extend || sb == SB_Format);
4950     }
4951
4952     return sb;
4953 }
4954
4955 STATIC SB_enum
4956 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4957 {
4958     SB_enum sb;
4959
4960     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4961
4962     if (*curpos < strbeg) {
4963         return SB_EDGE;
4964     }
4965
4966     if (utf8_target) {
4967         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4968         if (! prev_char_pos) {
4969             return SB_EDGE;
4970         }
4971
4972         /* Back up over Extend and Format.  curpos is always just to the right
4973          * of the characater whose value we are getting */
4974         do {
4975             U8 * prev_prev_char_pos;
4976             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4977                                                                       strbeg)))
4978             {
4979                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4980                 *curpos = prev_char_pos;
4981                 prev_char_pos = prev_prev_char_pos;
4982             }
4983             else {
4984                 *curpos = (U8 *) strbeg;
4985                 return SB_EDGE;
4986             }
4987         } while (sb == SB_Extend || sb == SB_Format);
4988     }
4989     else {
4990         do {
4991             if (*curpos - 2 < strbeg) {
4992                 *curpos = (U8 *) strbeg;
4993                 return SB_EDGE;
4994             }
4995             (*curpos)--;
4996             sb = getSB_VAL_CP(*(*curpos - 1));
4997         } while (sb == SB_Extend || sb == SB_Format);
4998     }
4999
5000     return sb;
5001 }
5002
5003 STATIC bool
5004 S_isWB(pTHX_ WB_enum previous,
5005              WB_enum before,
5006              WB_enum after,
5007              const U8 * const strbeg,
5008              const U8 * const curpos,
5009              const U8 * const strend,
5010              const bool utf8_target)
5011 {
5012     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5013      *  a Unicode word break, using their published algorithm, but tailored for
5014      *  Perl by treating spans of white space as one unit.  Context may be
5015      *  needed to make this determination.  If the value for the character
5016      *  before 'before' is known, it is passed as 'previous'; otherwise that
5017      *  should be set to WB_UNKNOWN.  The other input parameters give the
5018      *  boundaries and current position in the matching of the string.  That
5019      *  is, 'curpos' marks the position where the character whose wb value is
5020      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5021
5022     U8 * before_pos = (U8 *) curpos;
5023     U8 * after_pos = (U8 *) curpos;
5024     WB_enum prev = before;
5025     WB_enum next;
5026
5027     PERL_ARGS_ASSERT_ISWB;
5028
5029     /* Rule numbers in the comments below are as of Unicode 9.0 */
5030
5031   redo:
5032     before = prev;
5033     switch (WB_table[before][after]) {
5034         case WB_BREAKABLE:
5035             return TRUE;
5036
5037         case WB_NOBREAK:
5038             return FALSE;
5039
5040         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5041             next = advance_one_WB(&after_pos, strend, utf8_target,
5042                                  FALSE /* Don't skip Extend nor Format */ );
5043             /* A space immediately preceeding an Extend or Format is attached
5044              * to by them, and hence gets separated from previous spaces.
5045              * Otherwise don't break between horizontal white space */
5046             return next == WB_Extend || next == WB_Format;
5047
5048         /* WB4 Ignore Format and Extend characters, except when they appear at
5049          * the beginning of a region of text.  This code currently isn't
5050          * general purpose, but it works as the rules are currently and likely
5051          * to be laid out.  The reason it works is that when 'they appear at
5052          * the beginning of a region of text', the rule is to break before
5053          * them, just like any other character.  Therefore, the default rule
5054          * applies and we don't have to look in more depth.  Should this ever
5055          * change, we would have to have 2 'case' statements, like in the rules
5056          * below, and backup a single character (not spacing over the extend
5057          * ones) and then see if that is one of the region-end characters and
5058          * go from there */
5059         case WB_Ex_or_FO_or_ZWJ_then_foo:
5060             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5061             goto redo;
5062
5063         case WB_DQ_then_HL + WB_BREAKABLE:
5064         case WB_DQ_then_HL + WB_NOBREAK:
5065
5066             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5067
5068             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5069                                                             == WB_Hebrew_Letter)
5070             {
5071                 return FALSE;
5072             }
5073
5074              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5075
5076         case WB_HL_then_DQ + WB_BREAKABLE:
5077         case WB_HL_then_DQ + WB_NOBREAK:
5078
5079             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5080
5081             if (advance_one_WB(&after_pos, strend, utf8_target,
5082                                        TRUE /* Do skip Extend and Format */ )
5083                                                             == WB_Hebrew_Letter)
5084             {
5085                 return FALSE;
5086             }
5087
5088             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5089
5090         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5091         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5092
5093             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5094              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5095
5096             next = advance_one_WB(&after_pos, strend, utf8_target,
5097                                        TRUE /* Do skip Extend and Format */ );
5098
5099             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5100             {
5101                 return FALSE;
5102             }
5103
5104             return WB_table[before][after]
5105                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5106
5107         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5108         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5109
5110             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5111              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5112
5113             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5114             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5115             {
5116                 return FALSE;
5117             }
5118
5119             return WB_table[before][after]
5120                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5121
5122         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5123         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5124
5125             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5126              * */
5127
5128             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5129                                                             == WB_Numeric)
5130             {
5131                 return FALSE;
5132             }
5133
5134             return WB_table[before][after]
5135                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5136
5137         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5138         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5139
5140             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5141
5142             if (advance_one_WB(&after_pos, strend, utf8_target,
5143                                        TRUE /* Do skip Extend and Format */ )
5144                                                             == WB_Numeric)
5145             {
5146                 return FALSE;
5147             }
5148
5149             return WB_table[before][after]
5150                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5151
5152         case WB_RI_then_RI + WB_NOBREAK:
5153         case WB_RI_then_RI + WB_BREAKABLE:
5154             {
5155                 int RI_count = 1;
5156
5157                 /* Do not break within emoji flag sequences. That is, do not
5158                  * break between regional indicator (RI) symbols if there is an
5159                  * odd number of RI characters before the potential break
5160                  * point.
5161                  *
5162                  * WB15     ^ (RI RI)* RI × RI
5163                  * WB16 [^RI] (RI RI)* RI × RI */
5164
5165                 while (backup_one_WB(&previous,
5166                                      strbeg,
5167                                      &before_pos,
5168                                      utf8_target) == WB_Regional_Indicator)
5169                 {
5170                     RI_count++;
5171                 }
5172
5173                 return RI_count % 2 != 1;
5174             }
5175
5176         default:
5177             break;
5178     }
5179
5180 #ifdef DEBUGGING
5181     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5182                                   before, after, WB_table[before][after]);
5183     assert(0);
5184 #endif
5185     return TRUE;
5186 }
5187
5188 STATIC WB_enum
5189 S_advance_one_WB(pTHX_ U8 ** curpos,
5190                        const U8 * const strend,
5191                        const bool utf8_target,
5192                        const bool skip_Extend_Format)
5193 {
5194     WB_enum wb;
5195
5196     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5197
5198     if (*curpos >= strend) {
5199         return WB_EDGE;
5200     }
5201
5202     if (utf8_target) {
5203
5204         /* Advance over Extend and Format */
5205         do {
5206             *curpos += UTF8SKIP(*curpos);
5207             if (*curpos >= strend) {
5208                 return WB_EDGE;
5209             }
5210             wb = getWB_VAL_UTF8(*curpos, strend);
5211         } while (    skip_Extend_Format
5212                  && (wb == WB_Extend || wb == WB_Format));
5213     }
5214     else {
5215         do {
5216             (*curpos)++;
5217             if (*curpos >= strend) {
5218                 return WB_EDGE;
5219             }
5220             wb = getWB_VAL_CP(**curpos);
5221         } while (    skip_Extend_Format
5222                  && (wb == WB_Extend || wb == WB_Format));
5223     }
5224
5225     return wb;
5226 }
5227
5228 STATIC WB_enum
5229 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5230 {
5231     WB_enum wb;
5232
5233     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5234
5235     /* If we know what the previous character's break value is, don't have
5236         * to look it up */
5237     if (*previous != WB_UNKNOWN) {
5238         wb = *previous;
5239
5240         /* But we need to move backwards by one */
5241         if (utf8_target) {
5242             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5243             if (! *curpos) {
5244                 *previous = WB_EDGE;
5245                 *curpos = (U8 *) strbeg;
5246             }
5247             else {
5248                 *previous = WB_UNKNOWN;
5249             }
5250         }
5251         else {
5252             (*curpos)--;
5253             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5254         }
5255
5256         /* And we always back up over these three types */
5257         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5258             return wb;
5259         }
5260     }
5261
5262     if (*curpos < strbeg) {
5263         return WB_EDGE;
5264     }
5265
5266     if (utf8_target) {
5267         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5268         if (! prev_char_pos) {
5269             return WB_EDGE;
5270         }
5271
5272         /* Back up over Extend and Format.  curpos is always just to the right
5273          * of the characater whose value we are getting */
5274         do {
5275             U8 * prev_prev_char_pos;
5276             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5277                                                    -1,
5278                                                    strbeg)))
5279             {
5280                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5281                 *curpos = prev_char_pos;
5282                 prev_char_pos = prev_prev_char_pos;
5283             }
5284             else {
5285                 *curpos = (U8 *) strbeg;
5286                 return WB_EDGE;
5287             }
5288         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5289     }
5290     else {
5291         do {
5292             if (*curpos - 2 < strbeg) {
5293                 *curpos = (U8 *) strbeg;
5294                 return WB_EDGE;
5295             }
5296             (*curpos)--;
5297             wb = getWB_VAL_CP(*(*curpos - 1));
5298         } while (wb == WB_Extend || wb == WB_Format);
5299     }
5300
5301     return wb;
5302 }
5303
5304 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5305 (                                                           \
5306     (   ( st )                                         ) && \
5307     (   ( st )->u.eval.close_paren                     ) && \
5308     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5309 )
5310
5311 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5312 (                                                           \
5313     (   ( st )                                         ) && \
5314     (   ( st )->u.eval.close_paren                     ) && \
5315     (   ( expr )                                       ) && \
5316     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5317 )
5318
5319
5320 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5321     (st)->u.eval.close_paren = ( (expr) + 1 )
5322
5323 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5324     (st)->u.eval.close_paren = 0
5325
5326 /* returns -1 on failure, $+[0] on success */
5327 STATIC SSize_t
5328 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5329 {
5330
5331 #if PERL_VERSION < 9 && !defined(PERL_CORE)
5332     dMY_CXT;
5333 #endif
5334     dVAR;
5335     const bool utf8_target = reginfo->is_utf8_target;
5336     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5337     REGEXP *rex_sv = reginfo->prog;
5338     regexp *rex = ReANY(rex_sv);
5339     RXi_GET_DECL(rex,rexi);
5340     /* the current state. This is a cached copy of PL_regmatch_state */
5341     regmatch_state *st;
5342     /* cache heavy used fields of st in registers */
5343     regnode *scan;
5344     regnode *next;
5345     U32 n = 0;  /* general value; init to avoid compiler warning */
5346     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5347     char *locinput = startpos;
5348     char *pushinput; /* where to continue after a PUSH */
5349     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5350
5351     bool result = 0;        /* return value of S_regmatch */
5352     U32 depth = 0;            /* depth of backtrack stack */
5353     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5354     const U32 max_nochange_depth =
5355         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5356         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5357     regmatch_state *yes_state = NULL; /* state to pop to on success of
5358                                                             subpattern */
5359     /* mark_state piggy backs on the yes_state logic so that when we unwind
5360        the stack on success we can update the mark_state as we go */
5361     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5362     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5363     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5364     U32 state_num;
5365     bool no_final = 0;      /* prevent failure from backtracking? */
5366     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5367     char *startpoint = locinput;
5368     SV *popmark = NULL;     /* are we looking for a mark? */
5369     SV *sv_commit = NULL;   /* last mark name seen in failure */
5370     SV *sv_yes_mark = NULL; /* last mark name we have seen
5371                                during a successful match */
5372     U32 lastopen = 0;       /* last open we saw */
5373     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
5374     SV* const oreplsv = GvSVn(PL_replgv);
5375     /* these three flags are set by various ops to signal information to
5376      * the very next op. They have a useful lifetime of exactly one loop
5377      * iteration, and are not preserved or restored by state pushes/pops
5378      */
5379     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5380     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5381     int logical = 0;        /* the following EVAL is:
5382                                 0: (?{...})
5383                                 1: (?(?{...})X|Y)
5384                                 2: (??{...})
5385                                or the following IFMATCH/UNLESSM is:
5386                                 false: plain (?=foo)
5387                                 true:  used as a condition: (?(?=foo))
5388                             */
5389     PAD* last_pad = NULL;
5390     dMULTICALL;
5391     U8 gimme = G_SCALAR;
5392     CV *caller_cv = NULL;       /* who called us */
5393     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5394     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
5395     U32 maxopenparen = 0;       /* max '(' index seen so far */
5396     int to_complement;  /* Invert the result? */
5397     _char_class_number classnum;
5398     bool is_utf8_pat = reginfo->is_utf8_pat;
5399     bool match = FALSE;
5400
5401 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5402 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5403 #  define SOLARIS_BAD_OPTIMIZER
5404     const U32 *pl_charclass_dup = PL_charclass;
5405 #  define PL_charclass pl_charclass_dup
5406 #endif
5407
5408 #ifdef DEBUGGING
5409     GET_RE_DEBUG_FLAGS_DECL;
5410 #endif
5411
5412     /* protect against undef(*^R) */
5413     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5414
5415     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5416     multicall_oldcatch = 0;
5417     PERL_UNUSED_VAR(multicall_cop);
5418
5419     PERL_ARGS_ASSERT_REGMATCH;
5420
5421     st = PL_regmatch_state;
5422
5423     /* Note that nextchr is a byte even in UTF */
5424     SET_nextchr;
5425     scan = prog;
5426
5427     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5428             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5429             Perl_re_printf( aTHX_ "regmatch start\n" );
5430     }));
5431
5432     while (scan != NULL) {
5433
5434
5435         next = scan + NEXT_OFF(scan);
5436         if (next == scan)
5437             next = NULL;
5438         state_num = OP(scan);
5439
5440       reenter_switch:
5441         DEBUG_EXECUTE_r(
5442             if (state_num <= REGNODE_MAX) {
5443                 SV * const prop = sv_newmortal();
5444                 regnode *rnext = regnext(scan);
5445
5446                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5447                 regprop(rex, prop, scan, reginfo, NULL);
5448                 Perl_re_printf( aTHX_
5449                     "%*s%"IVdf":%s(%"IVdf")\n",
5450                     INDENT_CHARS(depth), "",
5451                     (IV)(scan - rexi->program),
5452                     SvPVX_const(prop),
5453                     (PL_regkind[OP(scan)] == END || !rnext) ?
5454                         0 : (IV)(rnext - rexi->program));
5455             }
5456         );
5457
5458         to_complement = 0;
5459
5460         SET_nextchr;
5461         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5462
5463         switch (state_num) {
5464         case SBOL: /*  /^../ and /\A../  */
5465             if (locinput == reginfo->strbeg)
5466                 break;
5467             sayNO;
5468
5469         case MBOL: /*  /^../m  */
5470             if (locinput == reginfo->strbeg ||
5471                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5472             {
5473                 break;
5474             }
5475             sayNO;
5476
5477         case GPOS: /*  \G  */
5478             if (locinput == reginfo->ganch)
5479                 break;
5480             sayNO;
5481
5482         case KEEPS: /*   \K  */
5483             /* update the startpoint */
5484             st->u.keeper.val = rex->offs[0].start;
5485             rex->offs[0].start = locinput - reginfo->strbeg;
5486             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5487             NOT_REACHED; /* NOTREACHED */
5488
5489         case KEEPS_next_fail:
5490             /* rollback the start point change */
5491             rex->offs[0].start = st->u.keeper.val;
5492             sayNO_SILENT;
5493             NOT_REACHED; /* NOTREACHED */
5494
5495         case MEOL: /* /..$/m  */
5496             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5497                 sayNO;
5498             break;
5499
5500         case SEOL: /* /..$/  */
5501             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5502                 sayNO;
5503             if (reginfo->strend - locinput > 1)
5504                 sayNO;
5505             break;
5506
5507         case EOS: /*  \z  */
5508             if (!NEXTCHR_IS_EOS)
5509                 sayNO;
5510             break;
5511
5512         case SANY: /*  /./s  */
5513             if (NEXTCHR_IS_EOS)
5514                 sayNO;
5515             goto increment_locinput;
5516
5517         case REG_ANY: /*  /./  */
5518             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5519                 sayNO;
5520             goto increment_locinput;
5521
5522
5523 #undef  ST
5524 #define ST st->u.trie
5525         case TRIEC: /* (ab|cd) with known charclass */
5526             /* In this case the charclass data is available inline so
5527                we can fail fast without a lot of extra overhead.
5528              */
5529             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5530                 DEBUG_EXECUTE_r(
5531                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5532                               depth, PL_colors[4], PL_colors[5])
5533                 );
5534                 sayNO_SILENT;
5535                 NOT_REACHED; /* NOTREACHED */
5536             }
5537             /* FALLTHROUGH */
5538         case TRIE:  /* (ab|cd)  */
5539             /* the basic plan of execution of the trie is:
5540              * At the beginning, run though all the states, and
5541              * find the longest-matching word. Also remember the position
5542              * of the shortest matching word. For example, this pattern:
5543              *    1  2 3 4    5
5544              *    ab|a|x|abcd|abc
5545              * when matched against the string "abcde", will generate
5546              * accept states for all words except 3, with the longest
5547              * matching word being 4, and the shortest being 2 (with
5548              * the position being after char 1 of the string).
5549              *
5550              * Then for each matching word, in word order (i.e. 1,2,4,5),
5551              * we run the remainder of the pattern; on each try setting
5552              * the current position to the character following the word,
5553              * returning to try the next word on failure.
5554              *
5555              * We avoid having to build a list of words at runtime by
5556              * using a compile-time structure, wordinfo[].prev, which
5557              * gives, for each word, the previous accepting word (if any).
5558              * In the case above it would contain the mappings 1->2, 2->0,
5559              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5560              * the longest word (4 above), a list of all words, by
5561              * following the list of prev pointers; this gives us the
5562              * unordered list 4,5,1,2. Then given the current word we have
5563              * just tried, we can go through the list and find the
5564              * next-biggest word to try (so if we just failed on word 2,
5565              * the next in the list is 4).
5566              *
5567              * Since at runtime we don't record the matching position in
5568              * the string for each word, we have to work that out for
5569              * each word we're about to process. The wordinfo table holds
5570              * the character length of each word; given that we recorded
5571              * at the start: the position of the shortest word and its
5572              * length in chars, we just need to move the pointer the
5573              * difference between the two char lengths. Depending on
5574              * Unicode status and folding, that's cheap or expensive.
5575              *
5576              * This algorithm is optimised for the case where are only a
5577              * small number of accept states, i.e. 0,1, or maybe 2.
5578              * With lots of accepts states, and having to try all of them,
5579              * it becomes quadratic on number of accept states to find all
5580              * the next words.
5581              */
5582
5583             {
5584                 /* what type of TRIE am I? (utf8 makes this contextual) */
5585                 DECL_TRIE_TYPE(scan);
5586
5587                 /* what trie are we using right now */
5588                 reg_trie_data * const trie
5589                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5590                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5591                 U32 state = trie->startstate;
5592
5593                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5594                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5595                     if (utf8_target
5596                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5597                         && scan->flags == EXACTL)
5598                     {
5599                         /* We only output for EXACTL, as we let the folder
5600                          * output this message for EXACTFLU8 to avoid
5601                          * duplication */
5602                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5603                                                                reginfo->strend);
5604                     }
5605                 }
5606                 if (   trie->bitmap
5607                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5608                 {
5609                     if (trie->states[ state ].wordnum) {
5610                          DEBUG_EXECUTE_r(
5611                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5612                                           depth, PL_colors[4], PL_colors[5])
5613                         );
5614                         if (!trie->jump)
5615                             break;
5616                     } else {
5617                         DEBUG_EXECUTE_r(
5618                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5619                                           depth, PL_colors[4], PL_colors[5])
5620                         );
5621                         sayNO_SILENT;
5622                    }
5623                 }
5624
5625             {
5626                 U8 *uc = ( U8* )locinput;
5627
5628                 STRLEN len = 0;
5629                 STRLEN foldlen = 0;
5630                 U8 *uscan = (U8*)NULL;
5631                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5632                 U32 charcount = 0; /* how many input chars we have matched */
5633                 U32 accepted = 0; /* have we seen any accepting states? */
5634
5635                 ST.jump = trie->jump;
5636                 ST.me = scan;
5637                 ST.firstpos = NULL;
5638                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5639                 ST.nextword = 0;
5640
5641                 /* fully traverse the TRIE; note the position of the
5642                    shortest accept state and the wordnum of the longest
5643                    accept state */
5644
5645                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5646                     U32 base = trie->states[ state ].trans.base;
5647                     UV uvc = 0;
5648                     U16 charid = 0;
5649                     U16 wordnum;
5650                     wordnum = trie->states[ state ].wordnum;
5651
5652                     if (wordnum) { /* it's an accept state */
5653                         if (!accepted) {
5654                             accepted = 1;
5655                             /* record first match position */
5656                             if (ST.longfold) {
5657                                 ST.firstpos = (U8*)locinput;
5658                                 ST.firstchars = 0;
5659                             }
5660                             else {
5661                                 ST.firstpos = uc;
5662                                 ST.firstchars = charcount;
5663                             }
5664                         }
5665                         if (!ST.nextword || wordnum < ST.nextword)
5666                             ST.nextword = wordnum;
5667                         ST.topword = wordnum;
5668                     }
5669
5670                     DEBUG_TRIE_EXECUTE_r({
5671                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5672                                 /* HERE */
5673                                 PerlIO_printf( Perl_debug_log,
5674                                     "%*s%sState: %4"UVxf" Accepted: %c ",
5675                                     INDENT_CHARS(depth), "", PL_colors[4],
5676                                     (UV)state, (accepted ? 'Y' : 'N'));
5677                     });
5678
5679                     /* read a char and goto next state */
5680                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5681                         I32 offset;
5682                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5683                                              uscan, len, uvc, charid, foldlen,
5684                                              foldbuf, uniflags);
5685                         charcount++;
5686                         if (foldlen>0)
5687                             ST.longfold = TRUE;
5688                         if (charid &&
5689                              ( ((offset =
5690                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5691
5692                              && ((U32)offset < trie->lasttrans)
5693                              && trie->trans[offset].check == state)
5694                         {
5695                             state = trie->trans[offset].next;
5696                         }
5697                         else {
5698                             state = 0;
5699                         }
5700                         uc += len;
5701
5702                     }
5703                     else {
5704                         state = 0;
5705                     }
5706                     DEBUG_TRIE_EXECUTE_r(
5707                         Perl_re_printf( aTHX_
5708                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5709                             charid, uvc, (UV)state, PL_colors[5] );
5710                     );
5711                 }
5712                 if (!accepted)
5713                    sayNO;
5714
5715                 /* calculate total number of accept states */
5716                 {
5717                     U16 w = ST.topword;
5718                     accepted = 0;
5719                     while (w) {
5720                         w = trie->wordinfo[w].prev;
5721                         accepted++;
5722                     }
5723                     ST.accepted = accepted;
5724                 }
5725
5726                 DEBUG_EXECUTE_r(
5727                     Perl_re_exec_indentf( aTHX_  "%sgot %"IVdf" possible matches%s\n",
5728                         depth,
5729                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5730                 );
5731                 goto trie_first_try; /* jump into the fail handler */
5732             }}
5733             NOT_REACHED; /* NOTREACHED */
5734
5735         case TRIE_next_fail: /* we failed - try next alternative */
5736         {
5737             U8 *uc;
5738             if ( ST.jump ) {
5739                 REGCP_UNWIND(ST.cp);
5740                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5741             }
5742             if (!--ST.accepted) {
5743                 DEBUG_EXECUTE_r({
5744                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5745                         depth,
5746                         PL_colors[4],
5747                         PL_colors[5] );
5748                 });
5749                 sayNO_SILENT;
5750             }
5751             {
5752                 /* Find next-highest word to process.  Note that this code
5753                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5754                 U16 min = 0;
5755                 U16 word;
5756                 U16 const nextword = ST.nextword;
5757                 reg_trie_wordinfo * const wordinfo
5758                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5759                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5760                     if (word > nextword && (!min || word < min))
5761                         min = word;
5762                 }
5763                 ST.nextword = min;
5764             }
5765
5766           trie_first_try:
5767             if (do_cutgroup) {
5768                 do_cutgroup = 0;
5769                 no_final = 0;
5770             }
5771
5772             if ( ST.jump ) {
5773                 ST.lastparen = rex->lastparen;
5774                 ST.lastcloseparen = rex->lastcloseparen;
5775                 REGCP_SET(ST.cp);
5776             }
5777
5778             /* find start char of end of current word */
5779             {
5780                 U32 chars; /* how many chars to skip */
5781                 reg_trie_data * const trie
5782                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5783
5784                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5785                             >=  ST.firstchars);
5786                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5787                             - ST.firstchars;
5788                 uc = ST.firstpos;
5789
5790                 if (ST.longfold) {
5791                     /* the hard option - fold each char in turn and find
5792                      * its folded length (which may be different */
5793                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5794                     STRLEN foldlen;
5795                     STRLEN len;
5796                     UV uvc;
5797                     U8 *uscan;
5798
5799                     while (chars) {
5800                         if (utf8_target) {
5801                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5802                                                     uniflags);
5803                             uc += len;
5804                         }
5805                         else {
5806                             uvc = *uc;
5807                             uc++;
5808                         }
5809                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5810                         uscan = foldbuf;
5811                         while (foldlen) {
5812                             if (!--chars)
5813                                 break;
5814                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5815                                             uniflags);
5816                             uscan += len;
5817                             foldlen -= len;
5818                         }
5819                     }
5820                 }
5821                 else {
5822                     if (utf8_target)
5823                         while (chars--)
5824                             uc += UTF8SKIP(uc);
5825                     else
5826                         uc += chars;
5827                 }
5828             }
5829
5830             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5831                             ? ST.jump[ST.nextword]
5832                             : NEXT_OFF(ST.me));
5833
5834             DEBUG_EXECUTE_r({
5835                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
5836                     depth,
5837                     PL_colors[4],
5838                     ST.nextword,
5839                     PL_colors[5]
5840                     );
5841             });
5842
5843             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
5844                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5845                 NOT_REACHED; /* NOTREACHED */
5846             }
5847             /* only one choice left - just continue */
5848             DEBUG_EXECUTE_r({
5849                 AV *const trie_words
5850                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5851                 SV ** const tmp = trie_words
5852                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5853                 SV *sv= tmp ? sv_newmortal() : NULL;
5854
5855                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5856                     depth, PL_colors[4],
5857                     ST.nextword,
5858                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5859                             PL_colors[0], PL_colors[1],
5860                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5861                         )
5862                     : "not compiled under -Dr",
5863                     PL_colors[5] );
5864             });
5865
5866             locinput = (char*)uc;
5867             continue; /* execute rest of RE */
5868             /* NOTREACHED */
5869         }
5870 #undef  ST
5871
5872         case EXACTL:             /*  /abc/l       */
5873             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5874
5875             /* Complete checking would involve going through every character
5876              * matched by the string to see if any is above latin1.  But the
5877              * comparision otherwise might very well be a fast assembly
5878              * language routine, and I (khw) don't think slowing things down
5879              * just to check for this warning is worth it.  So this just checks
5880              * the first character */
5881             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5882                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5883             }
5884             /* FALLTHROUGH */
5885         case EXACT: {            /*  /abc/        */
5886             char *s = STRING(scan);
5887             ln = STR_LEN(scan);
5888             if (utf8_target != is_utf8_pat) {
5889                 /* The target and the pattern have differing utf8ness. */
5890                 char *l = locinput;
5891                 const char * const e = s + ln;
5892
5893                 if (utf8_target) {
5894                     /* The target is utf8, the pattern is not utf8.
5895                      * Above-Latin1 code points can't match the pattern;
5896                      * invariants match exactly, and the other Latin1 ones need
5897                      * to be downgraded to a single byte in order to do the
5898                      * comparison.  (If we could be confident that the target
5899                      * is not malformed, this could be refactored to have fewer
5900                      * tests by just assuming that if the first bytes match, it
5901                      * is an invariant, but there are tests in the test suite
5902                      * dealing with (??{...}) which violate this) */
5903                     while (s < e) {
5904                         if (l >= reginfo->strend
5905                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5906                         {
5907                             sayNO;
5908                         }
5909                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5910                             if (*l != *s) {
5911                                 sayNO;
5912                             }
5913                             l++;
5914                         }
5915                         else {
5916                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5917                             {
5918                                 sayNO;
5919                             }
5920                             l += 2;
5921                         }
5922                         s++;
5923                     }
5924                 }
5925                 else {
5926                     /* The target is not utf8, the pattern is utf8. */
5927                     while (s < e) {
5928                         if (l >= reginfo->strend
5929                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5930                         {
5931                             sayNO;
5932                         }
5933                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5934                             if (*s != *l) {
5935                                 sayNO;
5936                             }
5937                             s++;
5938                         }
5939                         else {
5940                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5941                             {
5942                                 sayNO;
5943                             }
5944                             s += 2;
5945                         }
5946                         l++;
5947                     }
5948                 }
5949                 locinput = l;
5950             }
5951             else {
5952                 /* The target and the pattern have the same utf8ness. */
5953                 /* Inline the first character, for speed. */
5954                 if (reginfo->strend - locinput < ln
5955                     || UCHARAT(s) != nextchr
5956                     || (ln > 1 && memNE(s, locinput, ln)))
5957                 {
5958                     sayNO;
5959                 }
5960                 locinput += ln;
5961             }
5962             break;
5963             }
5964
5965         case EXACTFL: {          /*  /abc/il      */
5966             re_fold_t folder;
5967             const U8 * fold_array;
5968             const char * s;
5969             U32 fold_utf8_flags;
5970
5971             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5972             folder = foldEQ_locale;
5973             fold_array = PL_fold_locale;
5974             fold_utf8_flags = FOLDEQ_LOCALE;
5975             goto do_exactf;
5976
5977         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5978                                       is effectively /u; hence to match, target
5979                                       must be UTF-8. */
5980             if (! utf8_target) {
5981                 sayNO;
5982             }
5983             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5984                                              | FOLDEQ_S1_FOLDS_SANE;
5985             folder = foldEQ_latin1;
5986             fold_array = PL_fold_latin1;
5987             goto do_exactf;
5988
5989         case EXACTFU_SS:         /*  /\x{df}/iu   */
5990         case EXACTFU:            /*  /abc/iu      */
5991             folder = foldEQ_latin1;
5992             fold_array = PL_fold_latin1;
5993             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5994             goto do_exactf;
5995
5996         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5997                                    patterns */
5998             assert(! is_utf8_pat);
5999             /* FALLTHROUGH */
6000         case EXACTFA:            /*  /abc/iaa     */
6001             folder = foldEQ_latin1;
6002             fold_array = PL_fold_latin1;
6003             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6004             goto do_exactf;
6005
6006         case EXACTF:             /*  /abc/i    This node only generated for
6007                                                non-utf8 patterns */
6008             assert(! is_utf8_pat);
6009             folder = foldEQ;
6010             fold_array = PL_fold;
6011             fold_utf8_flags = 0;
6012
6013           do_exactf:
6014             s = STRING(scan);
6015             ln = STR_LEN(scan);
6016
6017             if (utf8_target
6018                 || is_utf8_pat
6019                 || state_num == EXACTFU_SS
6020                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6021             {
6022               /* Either target or the pattern are utf8, or has the issue where
6023                * the fold lengths may differ. */
6024                 const char * const l = locinput;
6025                 char *e = reginfo->strend;
6026
6027                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6028                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6029                 {
6030                     sayNO;
6031                 }
6032                 locinput = e;
6033                 break;
6034             }
6035
6036             /* Neither the target nor the pattern are utf8 */
6037             if (UCHARAT(s) != nextchr
6038                 && !NEXTCHR_IS_EOS
6039                 && UCHARAT(s) != fold_array[nextchr])
6040             {
6041                 sayNO;
6042             }
6043             if (reginfo->strend - locinput < ln)
6044                 sayNO;
6045             if (ln > 1 && ! folder(s, locinput, ln))
6046                 sayNO;
6047             locinput += ln;
6048             break;
6049         }
6050
6051         case NBOUNDL: /*  /\B/l  */
6052             to_complement = 1;
6053             /* FALLTHROUGH */
6054
6055         case BOUNDL:  /*  /\b/l  */
6056         {
6057             bool b1, b2;
6058             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6059
6060             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6061                 if (! IN_UTF8_CTYPE_LOCALE) {
6062                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6063                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6064                 }
6065                 goto boundu;
6066             }
6067
6068             if (utf8_target) {
6069                 if (locinput == reginfo->strbeg)
6070                     b1 = isWORDCHAR_LC('\n');
6071                 else {
6072                     b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
6073                                                         (U8*)(reginfo->strbeg)));
6074                 }
6075                 b2 = (NEXTCHR_IS_EOS)
6076                     ? isWORDCHAR_LC('\n')
6077                     : isWORDCHAR_LC_utf8((U8*)locinput);
6078             }
6079             else { /* Here the string isn't utf8 */
6080                 b1 = (locinput == reginfo->strbeg)
6081                      ? isWORDCHAR_LC('\n')
6082                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6083                 b2 = (NEXTCHR_IS_EOS)
6084                     ? isWORDCHAR_LC('\n')
6085                     : isWORDCHAR_LC(nextchr);
6086             }
6087             if (to_complement ^ (b1 == b2)) {
6088                 sayNO;
6089             }
6090             break;
6091         }
6092
6093         case NBOUND:  /*  /\B/   */
6094             to_complement = 1;
6095             /* FALLTHROUGH */
6096
6097         case BOUND:   /*  /\b/   */
6098             if (utf8_target) {
6099                 goto bound_utf8;
6100             }
6101             goto bound_ascii_match_only;
6102
6103         case NBOUNDA: /*  /\B/a  */
6104             to_complement = 1;
6105             /* FALLTHROUGH */
6106
6107         case BOUNDA:  /*  /\b/a  */
6108         {
6109             bool b1, b2;
6110
6111           bound_ascii_match_only:
6112             /* Here the string isn't utf8, or is utf8 and only ascii characters
6113              * are to match \w.  In the latter case looking at the byte just
6114              * prior to the current one may be just the final byte of a
6115              * multi-byte character.  This is ok.  There are two cases:
6116              * 1) it is a single byte character, and then the test is doing
6117              *    just what it's supposed to.
6118              * 2) it is a multi-byte character, in which case the final byte is
6119              *    never mistakable for ASCII, and so the test will say it is
6120              *    not a word character, which is the correct answer. */
6121             b1 = (locinput == reginfo->strbeg)
6122                  ? isWORDCHAR_A('\n')
6123                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6124             b2 = (NEXTCHR_IS_EOS)
6125                 ? isWORDCHAR_A('\n')
6126                 : isWORDCHAR_A(nextchr);
6127             if (to_complement ^ (b1 == b2)) {
6128                 sayNO;
6129             }
6130             break;
6131         }
6132
6133         case NBOUNDU: /*  /\B/u  */
6134             to_complement = 1;
6135             /* FALLTHROUGH */
6136
6137         case BOUNDU:  /*  /\b/u  */
6138
6139           boundu:
6140             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6141                 match = FALSE;
6142             }
6143             else if (utf8_target) {
6144               bound_utf8:
6145                 switch((bound_type) FLAGS(scan)) {
6146                     case TRADITIONAL_BOUND:
6147                     {
6148                         bool b1, b2;
6149                         b1 = (locinput == reginfo->strbeg)
6150                              ? 0 /* isWORDCHAR_L1('\n') */
6151                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
6152                                                        (U8*)(reginfo->strbeg)));
6153                         b2 = (NEXTCHR_IS_EOS)
6154                             ? 0 /* isWORDCHAR_L1('\n') */
6155                             : isWORDCHAR_utf8((U8*)locinput);
6156                         match = cBOOL(b1 != b2);
6157                         break;
6158                     }
6159                     case GCB_BOUND:
6160                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6161                             match = TRUE; /* GCB always matches at begin and
6162                                              end */
6163                         }
6164                         else {
6165                             /* Find the gcb values of previous and current
6166                              * chars, then see if is a break point */
6167                             match = isGCB(getGCB_VAL_UTF8(
6168                                                 reghop3((U8*)locinput,
6169                                                         -1,
6170                                                         (U8*)(reginfo->strbeg)),
6171                                                 (U8*) reginfo->strend),
6172                                           getGCB_VAL_UTF8((U8*) locinput,
6173                                                         (U8*) reginfo->strend),
6174                                           (U8*) reginfo->strbeg,
6175                                           (U8*) locinput,
6176                                           utf8_target);
6177                         }
6178                         break;
6179
6180                     case LB_BOUND:
6181                         if (locinput == reginfo->strbeg) {
6182                             match = FALSE;
6183                         }
6184                         else if (NEXTCHR_IS_EOS) {
6185                             match = TRUE;
6186                         }
6187                         else {
6188                             match = isLB(getLB_VAL_UTF8(
6189                                                 reghop3((U8*)locinput,
6190                                                         -1,
6191                                                         (U8*)(reginfo->strbeg)),
6192                                                 (U8*) reginfo->strend),
6193                                           getLB_VAL_UTF8((U8*) locinput,
6194                                                         (U8*) reginfo->strend),
6195                                           (U8*) reginfo->strbeg,
6196                                           (U8*) locinput,
6197                                           (U8*) reginfo->strend,
6198                                           utf8_target);
6199                         }
6200                         break;
6201
6202                     case SB_BOUND: /* Always matches at begin and end */
6203                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6204                             match = TRUE;
6205                         }
6206                         else {
6207                             match = isSB(getSB_VAL_UTF8(
6208                                                 reghop3((U8*)locinput,
6209                                                         -1,
6210                                                         (U8*)(reginfo->strbeg)),
6211                                                 (U8*) reginfo->strend),
6212                                           getSB_VAL_UTF8((U8*) locinput,
6213                                                         (U8*) reginfo->strend),
6214                                           (U8*) reginfo->strbeg,
6215                                           (U8*) locinput,
6216                                           (U8*) reginfo->strend,
6217                                           utf8_target);
6218                         }
6219                         break;
6220
6221                     case WB_BOUND:
6222                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6223                             match = TRUE;
6224                         }
6225                         else {
6226                             match = isWB(WB_UNKNOWN,
6227                                          getWB_VAL_UTF8(
6228                                                 reghop3((U8*)locinput,
6229                                                         -1,
6230                                                         (U8*)(reginfo->strbeg)),
6231                                                 (U8*) reginfo->strend),
6232                                           getWB_VAL_UTF8((U8*) locinput,
6233                                                         (U8*) reginfo->strend),
6234                                           (U8*) reginfo->strbeg,
6235                                           (U8*) locinput,
6236                                           (U8*) reginfo->strend,
6237                                           utf8_target);
6238                         }
6239                         break;
6240                 }
6241             }
6242             else {  /* Not utf8 target */
6243                 switch((bound_type) FLAGS(scan)) {
6244                     case TRADITIONAL_BOUND:
6245                     {
6246                         bool b1, b2;
6247                         b1 = (locinput == reginfo->strbeg)
6248                             ? 0 /* isWORDCHAR_L1('\n') */
6249                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6250                         b2 = (NEXTCHR_IS_EOS)
6251                             ? 0 /* isWORDCHAR_L1('\n') */
6252                             : isWORDCHAR_L1(nextchr);
6253                         match = cBOOL(b1 != b2);
6254                         break;
6255                     }
6256
6257                     case GCB_BOUND:
6258                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6259                             match = TRUE; /* GCB always matches at begin and
6260                                              end */
6261                         }
6262                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6263                                    range */
6264                             match =    UCHARAT(locinput - 1) != '\r'
6265                                     || UCHARAT(locinput) != '\n';
6266                         }
6267                         break;
6268
6269                     case LB_BOUND:
6270                         if (locinput == reginfo->strbeg) {
6271                             match = FALSE;
6272                         }
6273                         else if (NEXTCHR_IS_EOS) {
6274                             match = TRUE;
6275                         }
6276                         else {
6277                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6278                                          getLB_VAL_CP(UCHARAT(locinput)),
6279                                          (U8*) reginfo->strbeg,
6280                                          (U8*) locinput,
6281                                          (U8*) reginfo->strend,
6282                                          utf8_target);
6283                         }
6284                         break;
6285
6286                     case SB_BOUND: /* Always matches at begin and end */
6287                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6288                             match = TRUE;
6289                         }
6290                         else {
6291                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6292                                          getSB_VAL_CP(UCHARAT(locinput)),
6293                                          (U8*) reginfo->strbeg,
6294                                          (U8*) locinput,
6295                                          (U8*) reginfo->strend,
6296                                          utf8_target);
6297                         }
6298                         break;
6299
6300                     case WB_BOUND:
6301                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6302                             match = TRUE;
6303                         }
6304                         else {
6305                             match = isWB(WB_UNKNOWN,
6306                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6307                                          getWB_VAL_CP(UCHARAT(locinput)),
6308                                          (U8*) reginfo->strbeg,
6309                                          (U8*) locinput,
6310                                          (U8*) reginfo->strend,
6311                                          utf8_target);
6312                         }
6313                         break;
6314                 }
6315             }
6316
6317             if (to_complement ^ ! match) {
6318                 sayNO;
6319             }
6320             break;
6321
6322         case ANYOFL:  /*  /[abc]/l      */
6323             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6324
6325             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6326             {
6327               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6328             }
6329             /* FALLTHROUGH */
6330         case ANYOFD:  /*   /[abc]/d       */
6331         case ANYOF:  /*   /[abc]/       */
6332             if (NEXTCHR_IS_EOS)
6333                 sayNO;
6334             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6335                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6336                                                                    utf8_target))
6337                     sayNO;
6338                 locinput += UTF8SKIP(locinput);
6339             }
6340             else {
6341                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6342                     sayNO;
6343                 locinput++;
6344             }
6345             break;
6346
6347         /* The argument (FLAGS) to all the POSIX node types is the class number
6348          * */
6349
6350         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6351             to_complement = 1;
6352             /* FALLTHROUGH */
6353
6354         case POSIXL:    /* \w or [:punct:] etc. under /l */
6355             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6356             if (NEXTCHR_IS_EOS)
6357                 sayNO;
6358
6359             /* Use isFOO_lc() for characters within Latin1.  (Note that
6360              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6361              * wouldn't be invariant) */
6362             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6363                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6364                     sayNO;
6365                 }
6366
6367                 locinput++;
6368                 break;
6369             }
6370
6371             if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
6372                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6373                 goto utf8_posix_above_latin1;
6374             }
6375
6376             /* Here is a UTF-8 variant code point below 256 and the target is
6377              * UTF-8 */
6378             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6379                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6380                                             *(locinput + 1))))))
6381             {
6382                 sayNO;
6383             }
6384
6385             goto increment_locinput;
6386
6387         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6388             to_complement = 1;
6389             /* FALLTHROUGH */
6390
6391         case POSIXD:    /* \w or [:punct:] etc. under /d */
6392             if (utf8_target) {
6393                 goto utf8_posix;
6394             }
6395             goto posixa;
6396
6397         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6398
6399             if (NEXTCHR_IS_EOS) {
6400                 sayNO;
6401             }
6402
6403             /* All UTF-8 variants match */
6404             if (! UTF8_IS_INVARIANT(nextchr)) {
6405                 goto increment_locinput;
6406             }
6407
6408             to_complement = 1;
6409             goto join_nposixa;
6410
6411         case POSIXA:    /* \w or [:punct:] etc. under /a */
6412
6413           posixa:
6414             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6415              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6416              * character is a single byte */
6417
6418             if (NEXTCHR_IS_EOS) {
6419                 sayNO;
6420             }
6421
6422           join_nposixa:
6423
6424             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6425                                                                 FLAGS(scan)))))
6426             {
6427                 sayNO;
6428             }
6429
6430             /* Here we are either not in utf8, or we matched a utf8-invariant,
6431              * so the next char is the next byte */
6432             locinput++;
6433             break;
6434
6435         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6436             to_complement = 1;
6437             /* FALLTHROUGH */
6438
6439         case POSIXU:    /* \w or [:punct:] etc. under /u */
6440           utf8_posix:
6441             if (NEXTCHR_IS_EOS) {
6442                 sayNO;
6443             }
6444
6445             /* Use _generic_isCC() for characters within Latin1.  (Note that
6446              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6447              * wouldn't be invariant) */
6448             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6449                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6450                                                            FLAGS(scan)))))
6451                 {
6452                     sayNO;
6453                 }
6454                 locinput++;
6455             }
6456             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
6457                 if (! (to_complement
6458                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6459                                                                *(locinput + 1)),
6460                                              FLAGS(scan)))))
6461                 {
6462                     sayNO;
6463                 }
6464                 locinput += 2;
6465             }
6466             else {  /* Handle above Latin-1 code points */
6467               utf8_posix_above_latin1:
6468                 classnum = (_char_class_number) FLAGS(scan);
6469                 if (classnum < _FIRST_NON_SWASH_CC) {
6470
6471                     /* Here, uses a swash to find such code points.  Load if if
6472                      * not done already */
6473                     if (! PL_utf8_swash_ptrs[classnum]) {
6474                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6475                         PL_utf8_swash_ptrs[classnum]
6476                                 = _core_swash_init("utf8",
6477                                         "",
6478                                         &PL_sv_undef, 1, 0,
6479                                         PL_XPosix_ptrs[classnum], &flags);
6480                     }
6481                     if (! (to_complement
6482                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6483                                                (U8 *) locinput, TRUE))))
6484                     {
6485                         sayNO;
6486                     }
6487                 }
6488                 else {  /* Here, uses macros to find above Latin-1 code points */
6489                     switch (classnum) {
6490                         case _CC_ENUM_SPACE:
6491                             if (! (to_complement
6492                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6493                             {
6494                                 sayNO;
6495                             }
6496                             break;
6497                         case _CC_ENUM_BLANK:
6498                             if (! (to_complement
6499                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6500                             {
6501                                 sayNO;
6502                             }
6503                             break;
6504                         case _CC_ENUM_XDIGIT:
6505                             if (! (to_complement
6506                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6507                             {
6508                                 sayNO;
6509                             }
6510                             break;
6511                         case _CC_ENUM_VERTSPACE:
6512                             if (! (to_complement
6513                                             ^ cBOOL(is_VERTWS_high(locinput))))
6514                             {
6515                                 sayNO;
6516                             }
6517                             break;
6518                         default:    /* The rest, e.g. [:cntrl:], can't match
6519                                        above Latin1 */
6520                             if (! to_complement) {
6521                                 sayNO;
6522                             }
6523                             break;
6524                     }
6525                 }
6526                 locinput += UTF8SKIP(locinput);
6527             }
6528             break;
6529
6530         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6531                        a Unicode extended Grapheme Cluster */
6532             if (NEXTCHR_IS_EOS)
6533                 sayNO;
6534             if  (! utf8_target) {
6535
6536                 /* Match either CR LF  or '.', as all the other possibilities
6537                  * require utf8 */
6538                 locinput++;         /* Match the . or CR */
6539                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6540                                        match the LF */
6541                     && locinput < reginfo->strend
6542                     && UCHARAT(locinput) == '\n')
6543                 {
6544                     locinput++;
6545                 }
6546             }
6547             else {
6548
6549                 /* Get the gcb type for the current character */
6550                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6551                                                        (U8*) reginfo->strend);
6552
6553                 /* Then scan through the input until we get to the first
6554                  * character whose type is supposed to be a gcb with the
6555                  * current character.  (There is always a break at the
6556                  * end-of-input) */
6557                 locinput += UTF8SKIP(locinput);
6558                 while (locinput < reginfo->strend) {
6559                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6560                                                          (U8*) reginfo->strend);
6561                     if (isGCB(prev_gcb, cur_gcb,
6562                               (U8*) reginfo->strbeg, (U8*) locinput,
6563                               utf8_target))
6564                     {
6565                         break;
6566                     }
6567
6568                     prev_gcb = cur_gcb;
6569                     locinput += UTF8SKIP(locinput);
6570                 }
6571
6572
6573             }
6574             break;
6575
6576         case NREFFL:  /*  /\g{name}/il  */
6577         {   /* The capture buffer cases.  The ones beginning with N for the
6578                named buffers just convert to the equivalent numbered and
6579                pretend they were called as the corresponding numbered buffer
6580                op.  */
6581             /* don't initialize these in the declaration, it makes C++
6582                unhappy */
6583             const char *s;
6584             char type;
6585             re_fold_t folder;
6586             const U8 *fold_array;
6587             UV utf8_fold_flags;
6588
6589             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6590             folder = foldEQ_locale;
6591             fold_array = PL_fold_locale;
6592             type = REFFL;
6593             utf8_fold_flags = FOLDEQ_LOCALE;
6594             goto do_nref;
6595
6596         case NREFFA:  /*  /\g{name}/iaa  */
6597             folder = foldEQ_latin1;
6598             fold_array = PL_fold_latin1;
6599             type = REFFA;
6600             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6601             goto do_nref;
6602
6603         case NREFFU:  /*  /\g{name}/iu  */
6604             folder = foldEQ_latin1;
6605             fold_array = PL_fold_latin1;
6606             type = REFFU;
6607             utf8_fold_flags = 0;
6608             goto do_nref;
6609
6610         case NREFF:  /*  /\g{name}/i  */
6611             folder = foldEQ;
6612             fold_array = PL_fold;
6613             type = REFF;
6614             utf8_fold_flags = 0;
6615             goto do_nref;
6616
6617         case NREF:  /*  /\g{name}/   */
6618             type = REF;
6619             folder = NULL;
6620             fold_array = NULL;
6621             utf8_fold_flags = 0;
6622           do_nref:
6623
6624             /* For the named back references, find the corresponding buffer
6625              * number */
6626             n = reg_check_named_buff_matched(rex,scan);
6627
6628             if ( ! n ) {
6629                 sayNO;
6630             }
6631             goto do_nref_ref_common;
6632
6633         case REFFL:  /*  /\1/il  */
6634             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6635             folder = foldEQ_locale;
6636             fold_array = PL_fold_locale;
6637             utf8_fold_flags = FOLDEQ_LOCALE;
6638             goto do_ref;
6639
6640         case REFFA:  /*  /\1/iaa  */
6641             folder = foldEQ_latin1;
6642             fold_array = PL_fold_latin1;
6643             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6644             goto do_ref;
6645
6646         case REFFU:  /*  /\1/iu  */
6647             folder = foldEQ_latin1;
6648             fold_array = PL_fold_latin1;
6649             utf8_fold_flags = 0;
6650             goto do_ref;
6651
6652         case REFF:  /*  /\1/i  */
6653             folder = foldEQ;
6654             fold_array = PL_fold;
6655             utf8_fold_flags = 0;
6656             goto do_ref;
6657
6658         case REF:  /*  /\1/    */
6659             folder = NULL;
6660             fold_array = NULL;
6661             utf8_fold_flags = 0;
6662
6663           do_ref:
6664             type = OP(scan);
6665             n = ARG(scan);  /* which paren pair */
6666
6667           do_nref_ref_common:
6668             ln = rex->offs[n].start;
6669             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6670             if (rex->lastparen < n || ln == -1)
6671                 sayNO;                  /* Do not match unless seen CLOSEn. */
6672             if (ln == rex->offs[n].end)
6673                 break;
6674
6675             s = reginfo->strbeg + ln;
6676             if (type != REF     /* REF can do byte comparison */
6677                 && (utf8_target || type == REFFU || type == REFFL))
6678             {
6679                 char * limit = reginfo->strend;
6680
6681                 /* This call case insensitively compares the entire buffer
6682                     * at s, with the current input starting at locinput, but
6683                     * not going off the end given by reginfo->strend, and
6684                     * returns in <limit> upon success, how much of the
6685                     * current input was matched */
6686                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6687                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6688                 {
6689                     sayNO;
6690                 }
6691                 locinput = limit;
6692                 break;
6693             }
6694
6695             /* Not utf8:  Inline the first character, for speed. */
6696             if (!NEXTCHR_IS_EOS &&
6697                 UCHARAT(s) != nextchr &&
6698                 (type == REF ||
6699                  UCHARAT(s) != fold_array[nextchr]))
6700                 sayNO;
6701             ln = rex->offs[n].end - ln;
6702             if (locinput + ln > reginfo->strend)
6703                 sayNO;
6704             if (ln > 1 && (type == REF
6705                            ? memNE(s, locinput, ln)
6706                            : ! folder(s, locinput, ln)))
6707                 sayNO;
6708             locinput += ln;
6709             break;
6710         }
6711
6712         case NOTHING: /* null op; e.g. the 'nothing' following
6713                        * the '*' in m{(a+|b)*}' */
6714             break;
6715         case TAIL: /* placeholder while compiling (A|B|C) */
6716             break;
6717
6718 #undef  ST
6719 #define ST st->u.eval
6720 #define CUR_EVAL cur_eval->u.eval
6721
6722         {
6723             SV *ret;
6724             REGEXP *re_sv;
6725             regexp *re;
6726             regexp_internal *rei;
6727             regnode *startpoint;
6728             U32 arg;
6729
6730         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6731             arg= (U32)ARG(scan);
6732             if (cur_eval && cur_eval->locinput == locinput) {
6733                 if ( ++nochange_depth > max_nochange_depth )
6734                     Perl_croak(aTHX_
6735                         "Pattern subroutine nesting without pos change"
6736                         " exceeded limit in regex");
6737             } else {
6738                 nochange_depth = 0;
6739             }
6740             re_sv = rex_sv;
6741             re = rex;
6742             rei = rexi;
6743             startpoint = scan + ARG2L(scan);
6744             EVAL_CLOSE_PAREN_SET( st, arg );
6745             /* Detect infinite recursion
6746              *
6747              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6748              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6749              * So we track the position in the string we are at each time
6750              * we recurse and if we try to enter the same routine twice from
6751              * the same position we throw an error.
6752              */
6753             if ( rex->recurse_locinput[arg] == locinput ) {
6754                 /* FIXME: we should show the regop that is failing as part
6755                  * of the error message. */
6756                 Perl_croak(aTHX_ "Infinite recursion in regex");
6757             } else {
6758                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6759                 rex->recurse_locinput[arg]= locinput;
6760
6761                 DEBUG_r({
6762                     GET_RE_DEBUG_FLAGS_DECL;
6763                     DEBUG_STACK_r({
6764                         Perl_re_exec_indentf( aTHX_
6765                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6766                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6767                         );
6768                     });
6769                 });
6770             }
6771
6772             /* Save all the positions seen so far. */
6773             ST.cp = regcppush(rex, 0, maxopenparen);
6774             REGCP_SET(ST.lastcp);
6775
6776             /* and then jump to the code we share with EVAL */
6777             goto eval_recurse_doit;
6778             /* NOTREACHED */
6779
6780         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6781             if (cur_eval && cur_eval->locinput==locinput) {
6782                 if ( ++nochange_depth > max_nochange_depth )
6783                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6784             } else {
6785                 nochange_depth = 0;
6786             }
6787             {
6788                 /* execute the code in the {...} */
6789
6790                 dSP;
6791                 IV before;
6792                 OP * const oop = PL_op;
6793                 COP * const ocurcop = PL_curcop;
6794                 OP *nop;
6795                 CV *newcv;
6796
6797                 /* save *all* paren positions */
6798                 regcppush(rex, 0, maxopenparen);
6799                 REGCP_SET(runops_cp);
6800
6801                 if (!caller_cv)
6802                     caller_cv = find_runcv(NULL);
6803
6804                 n = ARG(scan);
6805
6806                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6807                     newcv = (ReANY(
6808                                     (REGEXP*)(rexi->data->data[n])
6809                             ))->qr_anoncv;
6810                     nop = (OP*)rexi->data->data[n+1];
6811                 }
6812                 else if (rexi->data->what[n] == 'l') { /* literal code */
6813                     newcv = caller_cv;
6814                     nop = (OP*)rexi->data->data[n];
6815                     assert(CvDEPTH(newcv));
6816                 }
6817                 else {
6818                     /* literal with own CV */
6819                     assert(rexi->data->what[n] == 'L');
6820                     newcv = rex->qr_anoncv;
6821                     nop = (OP*)rexi->data->data[n];
6822                 }
6823
6824                 /* normally if we're about to execute code from the same
6825                  * CV that we used previously, we just use the existing
6826                  * CX stack entry. However, its possible that in the
6827                  * meantime we may have backtracked, popped from the save
6828                  * stack, and undone the SAVECOMPPAD(s) associated with
6829                  * PUSH_MULTICALL; in which case PL_comppad no longer
6830                  * points to newcv's pad. */
6831                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6832                 {
6833                     U8 flags = (CXp_SUB_RE |
6834                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6835                     if (last_pushed_cv) {
6836                         /* PUSH/POP_MULTICALL save and restore the
6837                          * caller's PL_comppad; if we call multiple subs
6838                          * using the same CX block, we have to save and
6839                          * unwind the varying PL_comppad's ourselves,
6840                          * especially restoring the right PL_comppad on
6841                          * backtrack - so save it on the save stack */
6842                         SAVECOMPPAD();
6843                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6844                     }
6845                     else {
6846                         PUSH_MULTICALL_FLAGS(newcv, flags);
6847                     }
6848                     last_pushed_cv = newcv;
6849                 }
6850                 else {
6851                     /* these assignments are just to silence compiler
6852                      * warnings */
6853                     multicall_cop = NULL;
6854                 }
6855                 last_pad = PL_comppad;
6856
6857                 /* the initial nextstate you would normally execute
6858                  * at the start of an eval (which would cause error
6859                  * messages to come from the eval), may be optimised
6860                  * away from the execution path in the regex code blocks;
6861                  * so manually set PL_curcop to it initially */
6862                 {
6863                     OP *o = cUNOPx(nop)->op_first;
6864                     assert(o->op_type == OP_NULL);
6865                     if (o->op_targ == OP_SCOPE) {
6866                         o = cUNOPo->op_first;
6867                     }
6868                     else {
6869                         assert(o->op_targ == OP_LEAVE);
6870                         o = cUNOPo->op_first;
6871                         assert(o->op_type == OP_ENTER);
6872                         o = OpSIBLING(o);
6873                     }
6874
6875                     if (o->op_type != OP_STUB) {
6876                         assert(    o->op_type == OP_NEXTSTATE
6877                                 || o->op_type == OP_DBSTATE
6878                                 || (o->op_type == OP_NULL
6879                                     &&  (  o->op_targ == OP_NEXTSTATE
6880                                         || o->op_targ == OP_DBSTATE
6881                                         )
6882                                     )
6883                         );
6884                         PL_curcop = (COP*)o;
6885                     }
6886                 }
6887                 nop = nop->op_next;
6888
6889                 DEBUG_STATE_r( Perl_re_printf( aTHX_
6890                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6891
6892                 rex->offs[0].end = locinput - reginfo->strbeg;
6893                 if (reginfo->info_aux_eval->pos_magic)
6894                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6895                                   reginfo->sv, reginfo->strbeg,
6896                                   locinput - reginfo->strbeg);
6897
6898                 if (sv_yes_mark) {
6899                     SV *sv_mrk = get_sv("REGMARK", 1);
6900                     sv_setsv(sv_mrk, sv_yes_mark);
6901                 }
6902
6903                 /* we don't use MULTICALL here as we want to call the
6904                  * first op of the block of interest, rather than the
6905                  * first op of the sub. Also, we don't want to free
6906                  * the savestack frame */
6907                 before = (IV)(SP-PL_stack_base);
6908                 PL_op = nop;
6909                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6910                 SPAGAIN;
6911                 if ((IV)(SP-PL_stack_base) == before)
6912                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6913                 else {
6914                     ret = POPs;
6915                     PUTBACK;
6916                 }
6917
6918                 /* before restoring everything, evaluate the returned
6919                  * value, so that 'uninit' warnings don't use the wrong
6920                  * PL_op or pad. Also need to process any magic vars
6921                  * (e.g. $1) *before* parentheses are restored */
6922
6923                 PL_op = NULL;
6924
6925                 re_sv = NULL;
6926                 if (logical == 0)        /*   (?{})/   */
6927                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6928                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6929                     sw = cBOOL(SvTRUE(ret));
6930                     logical = 0;
6931                 }
6932                 else {                   /*  /(??{})  */
6933                     /*  if its overloaded, let the regex compiler handle
6934                      *  it; otherwise extract regex, or stringify  */
6935                     if (SvGMAGICAL(ret))
6936                         ret = sv_mortalcopy(ret);
6937                     if (!SvAMAGIC(ret)) {
6938                         SV *sv = ret;
6939                         if (SvROK(sv))
6940                             sv = SvRV(sv);
6941                         if (SvTYPE(sv) == SVt_REGEXP)
6942                             re_sv = (REGEXP*) sv;
6943                         else if (SvSMAGICAL(ret)) {
6944                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6945                             if (mg)
6946                                 re_sv = (REGEXP *) mg->mg_obj;
6947                         }
6948
6949                         /* force any undef warnings here */
6950                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6951                             ret = sv_mortalcopy(ret);
6952                             (void) SvPV_force_nolen(ret);
6953                         }
6954                     }
6955
6956                 }
6957
6958                 /* *** Note that at this point we don't restore
6959                  * PL_comppad, (or pop the CxSUB) on the assumption it may
6960                  * be used again soon. This is safe as long as nothing
6961                  * in the regexp code uses the pad ! */
6962                 PL_op = oop;
6963                 PL_curcop = ocurcop;
6964                 regcp_restore(rex, runops_cp, &maxopenparen);
6965                 PL_curpm = PL_reg_curpm;
6966
6967                 if (logical != 2)
6968                     break;
6969             }
6970
6971                 /* only /(??{})/  from now on */
6972                 logical = 0;
6973                 {
6974                     /* extract RE object from returned value; compiling if
6975                      * necessary */
6976
6977                     if (re_sv) {
6978                         re_sv = reg_temp_copy(NULL, re_sv);
6979                     }
6980                     else {
6981                         U32 pm_flags = 0;
6982
6983                         if (SvUTF8(ret) && IN_BYTES) {
6984                             /* In use 'bytes': make a copy of the octet
6985                              * sequence, but without the flag on */
6986                             STRLEN len;
6987                             const char *const p = SvPV(ret, len);
6988                             ret = newSVpvn_flags(p, len, SVs_TEMP);
6989                         }
6990                         if (rex->intflags & PREGf_USE_RE_EVAL)
6991                             pm_flags |= PMf_USE_RE_EVAL;
6992
6993                         /* if we got here, it should be an engine which
6994                          * supports compiling code blocks and stuff */
6995                         assert(rex->engine && rex->engine->op_comp);
6996                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6997                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6998                                     rex->engine, NULL, NULL,
6999                                     /* copy /msixn etc to inner pattern */
7000                                     ARG2L(scan),
7001                                     pm_flags);
7002
7003                         if (!(SvFLAGS(ret)
7004                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7005                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7006                             /* This isn't a first class regexp. Instead, it's
7007                                caching a regexp onto an existing, Perl visible
7008                                scalar.  */
7009                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7010                         }
7011                     }
7012                     SAVEFREESV(re_sv);
7013                     re = ReANY(re_sv);
7014                 }
7015                 RXp_MATCH_COPIED_off(re);
7016                 re->subbeg = rex->subbeg;
7017                 re->sublen = rex->sublen;
7018                 re->suboffset = rex->suboffset;
7019                 re->subcoffset = rex->subcoffset;
7020                 re->lastparen = 0;
7021                 re->lastcloseparen = 0;
7022                 rei = RXi_GET(re);
7023                 DEBUG_EXECUTE_r(
7024                     debug_start_match(re_sv, utf8_target, locinput,
7025                                     reginfo->strend, "Matching embedded");
7026                 );
7027                 startpoint = rei->program + 1;
7028                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7029                                              * close_paren only for GOSUB */
7030                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7031                 /* Save all the seen positions so far. */
7032                 ST.cp = regcppush(rex, 0, maxopenparen);
7033                 REGCP_SET(ST.lastcp);
7034                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7035                 maxopenparen = 0;
7036                 /* run the pattern returned from (??{...}) */
7037
7038               eval_recurse_doit: /* Share code with GOSUB below this line
7039                             * At this point we expect the stack context to be
7040                             * set up correctly */
7041
7042                 /* invalidate the S-L poscache. We're now executing a
7043                  * different set of WHILEM ops (and their associated
7044                  * indexes) against the same string, so the bits in the
7045                  * cache are meaningless. Setting maxiter to zero forces
7046                  * the cache to be invalidated and zeroed before reuse.
7047                  * XXX This is too dramatic a measure. Ideally we should
7048                  * save the old cache and restore when running the outer
7049                  * pattern again */
7050                 reginfo->poscache_maxiter = 0;
7051
7052                 /* the new regexp might have a different is_utf8_pat than we do */
7053                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7054
7055                 ST.prev_rex = rex_sv;
7056                 ST.prev_curlyx = cur_curlyx;
7057                 rex_sv = re_sv;
7058                 SET_reg_curpm(rex_sv);
7059                 rex = re;
7060                 rexi = rei;
7061                 cur_curlyx = NULL;
7062                 ST.B = next;
7063                 ST.prev_eval = cur_eval;
7064                 cur_eval = st;
7065                 /* now continue from first node in postoned RE */
7066                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
7067                 NOT_REACHED; /* NOTREACHED */
7068         }
7069
7070         case EVAL_AB: /* cleanup after a successful (??{A})B */
7071             /* note: this is called twice; first after popping B, then A */
7072             DEBUG_STACK_r({
7073                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7074                     depth, cur_eval, ST.prev_eval);
7075             });
7076
7077 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7078             if ( cur_eval && CUR_EVAL.close_paren ) {\
7079                 DEBUG_STACK_r({ \
7080                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7081                         depth,    \
7082                         CUR_EVAL.close_paren - 1,\
7083                         cur_eval, \
7084                         VAL);     \
7085                 });               \
7086                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7087             }
7088
7089             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7090
7091             rex_sv = ST.prev_rex;
7092             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7093             SET_reg_curpm(rex_sv);
7094             rex = ReANY(rex_sv);
7095             rexi = RXi_GET(rex);
7096             {
7097                 /* preserve $^R across LEAVE's. See Bug 121070. */
7098                 SV *save_sv= GvSV(PL_replgv);
7099                 SvREFCNT_inc(save_sv);
7100                 regcpblow(ST.cp); /* LEAVE in disguise */
7101                 sv_setsv(GvSV(PL_replgv), save_sv);
7102                 SvREFCNT_dec(save_sv);
7103             }
7104             cur_eval = ST.prev_eval;
7105             cur_curlyx = ST.prev_curlyx;
7106
7107             /* Invalidate cache. See "invalidate" comment above. */
7108             reginfo->poscache_maxiter = 0;
7109             if ( nochange_depth )
7110                 nochange_depth--;
7111
7112             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7113             sayYES;
7114
7115
7116         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7117             /* note: this is called twice; first after popping B, then A */
7118             DEBUG_STACK_r({
7119                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7120                     depth, cur_eval, ST.prev_eval);
7121             });
7122
7123             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7124
7125             rex_sv = ST.prev_rex;
7126             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7127             SET_reg_curpm(rex_sv);
7128             rex = ReANY(rex_sv);
7129             rexi = RXi_GET(rex);
7130
7131             REGCP_UNWIND(ST.lastcp);
7132             regcppop(rex, &maxopenparen);
7133             cur_eval = ST.prev_eval;
7134             cur_curlyx = ST.prev_curlyx;
7135
7136             /* Invalidate cache. See "invalidate" comment above. */
7137             reginfo->poscache_maxiter = 0;
7138             if ( nochange_depth )
7139                 nochange_depth--;
7140
7141             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7142             sayNO_SILENT;
7143 #undef ST
7144
7145         case OPEN: /*  (  */
7146             n = ARG(scan);  /* which paren pair */
7147             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7148             if (n > maxopenparen)
7149                 maxopenparen = n;
7150             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7151                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
7152                 depth,
7153                 PTR2UV(rex),
7154                 PTR2UV(rex->offs),
7155                 (UV)n,
7156                 (IV)rex->offs[n].start_tmp,
7157                 (UV)maxopenparen
7158             ));
7159             lastopen = n;
7160             break;
7161
7162 /* XXX really need to log other places start/end are set too */
7163 #define CLOSE_CAPTURE                                                      \
7164     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7165     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7166     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7167         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
7168         depth,                                                             \
7169         PTR2UV(rex),                                                       \
7170         PTR2UV(rex->offs),                                                 \
7171         (UV)n,                                                             \
7172         (IV)rex->offs[n].start,                                            \
7173         (IV)rex->offs[n].end                                               \
7174     ))
7175
7176         case CLOSE:  /*  )  */
7177             n = ARG(scan);  /* which paren pair */
7178             CLOSE_CAPTURE;
7179             if (n > rex->lastparen)
7180                 rex->lastparen = n;
7181             rex->lastcloseparen = n;
7182             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7183                 goto fake_end;
7184
7185             break;
7186
7187         case ACCEPT:  /*  (*ACCEPT)  */
7188             if (scan->flags)
7189                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7190             if (ARG2L(scan)){
7191                 regnode *cursor;
7192                 for (cursor=scan;
7193                      cursor && OP(cursor)!=END;
7194                      cursor=regnext(cursor))
7195                 {
7196                     if ( OP(cursor)==CLOSE ){
7197                         n = ARG(cursor);
7198                         if ( n <= lastopen ) {
7199                             CLOSE_CAPTURE;
7200                             if (n > rex->lastparen)
7201                                 rex->lastparen = n;
7202                             rex->lastcloseparen = n;
7203                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7204                                 break;
7205                         }
7206                     }
7207                 }
7208             }
7209             goto fake_end;
7210             /* NOTREACHED */
7211
7212         case GROUPP:  /*  (?(1))  */
7213             n = ARG(scan);  /* which paren pair */
7214             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7215             break;
7216
7217         case NGROUPP:  /*  (?(<name>))  */
7218             /* reg_check_named_buff_matched returns 0 for no match */
7219             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7220             break;
7221
7222         case INSUBP:   /*  (?(R))  */
7223             n = ARG(scan);
7224             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7225              * of SCAN is already set up as matches a eval.close_paren */
7226             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7227             break;
7228
7229         case DEFINEP:  /*  (?(DEFINE))  */
7230             sw = 0;
7231             break;
7232
7233         case IFTHEN:   /*  (?(cond)A|B)  */
7234             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7235             if (sw)
7236                 next = NEXTOPER(NEXTOPER(scan));
7237             else {
7238                 next = scan + ARG(scan);
7239                 if (OP(next) == IFTHEN) /* Fake one. */
7240                     next = NEXTOPER(NEXTOPER(next));
7241             }
7242             break;
7243
7244         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7245             logical = scan->flags;
7246             break;
7247
7248 /*******************************************************************
7249
7250 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7251 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7252 STAR/PLUS/CURLY/CURLYN are used instead.)
7253
7254 A*B is compiled as <CURLYX><A><WHILEM><B>
7255
7256 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7257 state, which contains the current count, initialised to -1. It also sets
7258 cur_curlyx to point to this state, with any previous value saved in the
7259 state block.
7260
7261 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7262 since the pattern may possibly match zero times (i.e. it's a while {} loop
7263 rather than a do {} while loop).
7264
7265 Each entry to WHILEM represents a successful match of A. The count in the
7266 CURLYX block is incremented, another WHILEM state is pushed, and execution
7267 passes to A or B depending on greediness and the current count.
7268
7269 For example, if matching against the string a1a2a3b (where the aN are
7270 substrings that match /A/), then the match progresses as follows: (the
7271 pushed states are interspersed with the bits of strings matched so far):
7272
7273     <CURLYX cnt=-1>
7274     <CURLYX cnt=0><WHILEM>
7275     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7276     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7277     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7278     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7279
7280 (Contrast this with something like CURLYM, which maintains only a single
7281 backtrack state:
7282
7283     <CURLYM cnt=0> a1
7284     a1 <CURLYM cnt=1> a2
7285     a1 a2 <CURLYM cnt=2> a3
7286     a1 a2 a3 <CURLYM cnt=3> b
7287 )
7288
7289 Each WHILEM state block marks a point to backtrack to upon partial failure
7290 of A or B, and also contains some minor state data related to that
7291 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7292 overall state, such as the count, and pointers to the A and B ops.
7293
7294 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7295 must always point to the *current* CURLYX block, the rules are:
7296
7297 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7298 and set cur_curlyx to point the new block.
7299
7300 When popping the CURLYX block after a successful or unsuccessful match,
7301 restore the previous cur_curlyx.
7302
7303 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7304 to the outer one saved in the CURLYX block.
7305
7306 When popping the WHILEM block after a successful or unsuccessful B match,
7307 restore the previous cur_curlyx.
7308
7309 Here's an example for the pattern (AI* BI)*BO
7310 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7311
7312 cur_
7313 curlyx backtrack stack
7314 ------ ---------------
7315 NULL
7316 CO     <CO prev=NULL> <WO>
7317 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7318 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7319 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7320
7321 At this point the pattern succeeds, and we work back down the stack to
7322 clean up, restoring as we go:
7323
7324 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7325 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7326 CO     <CO prev=NULL> <WO>
7327 NULL
7328
7329 *******************************************************************/
7330
7331 #define ST st->u.curlyx
7332
7333         case CURLYX:    /* start of /A*B/  (for complex A) */
7334         {
7335             /* No need to save/restore up to this paren */
7336             I32 parenfloor = scan->flags;
7337
7338             assert(next); /* keep Coverity happy */
7339             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7340                 next += ARG(next);
7341
7342             /* XXXX Probably it is better to teach regpush to support
7343                parenfloor > maxopenparen ... */
7344             if (parenfloor > (I32)rex->lastparen)
7345                 parenfloor = rex->lastparen; /* Pessimization... */
7346
7347             ST.prev_curlyx= cur_curlyx;
7348             cur_curlyx = st;
7349             ST.cp = PL_savestack_ix;
7350
7351             /* these fields contain the state of the current curly.
7352              * they are accessed by subsequent WHILEMs */
7353             ST.parenfloor = parenfloor;
7354             ST.me = scan;
7355             ST.B = next;
7356             ST.minmod = minmod;
7357             minmod = 0;
7358             ST.count = -1;      /* this will be updated by WHILEM */
7359             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7360
7361             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7362             NOT_REACHED; /* NOTREACHED */
7363         }
7364
7365         case CURLYX_end: /* just finished matching all of A*B */
7366             cur_curlyx = ST.prev_curlyx;
7367             sayYES;
7368             NOT_REACHED; /* NOTREACHED */
7369
7370         case CURLYX_end_fail: /* just failed to match all of A*B */
7371             regcpblow(ST.cp);
7372             cur_curlyx = ST.prev_curlyx;
7373             sayNO;
7374             NOT_REACHED; /* NOTREACHED */
7375
7376
7377 #undef ST
7378 #define ST st->u.whilem
7379
7380         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7381         {
7382             /* see the discussion above about CURLYX/WHILEM */
7383             I32 n;
7384             int min, max;
7385             regnode *A;
7386
7387             assert(cur_curlyx); /* keep Coverity happy */
7388
7389             min = ARG1(cur_curlyx->u.curlyx.me);
7390             max = ARG2(cur_curlyx->u.curlyx.me);
7391             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7392             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7393             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7394             ST.cache_offset = 0;
7395             ST.cache_mask = 0;
7396
7397
7398             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7399                   depth, (long)n, min, max)
7400             );
7401
7402             /* First just match a string of min A's. */
7403
7404             if (n < min) {
7405                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7406                 cur_curlyx->u.curlyx.lastloc = locinput;
7407                 REGCP_SET(ST.lastcp);
7408
7409                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7410                 NOT_REACHED; /* NOTREACHED */
7411             }
7412
7413             /* If degenerate A matches "", assume A done. */
7414
7415             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7416                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7417                    depth)
7418                 );
7419                 goto do_whilem_B_max;
7420             }
7421
7422             /* super-linear cache processing.
7423              *
7424              * The idea here is that for certain types of CURLYX/WHILEM -
7425              * principally those whose upper bound is infinity (and
7426              * excluding regexes that have things like \1 and other very
7427              * non-regular expresssiony things), then if a pattern like
7428              * /....A*.../ fails and we backtrack to the WHILEM, then we
7429              * make a note that this particular WHILEM op was at string
7430              * position 47 (say) when the rest of pattern failed. Then, if
7431              * we ever find ourselves back at that WHILEM, and at string
7432              * position 47 again, we can just fail immediately rather than
7433              * running the rest of the pattern again.
7434              *
7435              * This is very handy when patterns start to go
7436              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7437              * with a combinatorial explosion of backtracking.
7438              *
7439              * The cache is implemented as a bit array, with one bit per
7440              * string byte position per WHILEM op (up to 16) - so its
7441              * between 0.25 and 2x the string size.
7442              *
7443              * To avoid allocating a poscache buffer every time, we do an
7444              * initially countdown; only after we have  executed a WHILEM
7445              * op (string-length x #WHILEMs) times do we allocate the
7446              * cache.
7447              *
7448              * The top 4 bits of scan->flags byte say how many different
7449              * relevant CURLLYX/WHILEM op pairs there are, while the
7450              * bottom 4-bits is the identifying index number of this
7451              * WHILEM.
7452              */
7453
7454             if (scan->flags) {
7455
7456                 if (!reginfo->poscache_maxiter) {
7457                     /* start the countdown: Postpone detection until we
7458                      * know the match is not *that* much linear. */
7459                     reginfo->poscache_maxiter
7460                         =    (reginfo->strend - reginfo->strbeg + 1)
7461                            * (scan->flags>>4);
7462                     /* possible overflow for long strings and many CURLYX's */
7463                     if (reginfo->poscache_maxiter < 0)
7464                         reginfo->poscache_maxiter = I32_MAX;
7465                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7466                 }
7467
7468                 if (reginfo->poscache_iter-- == 0) {
7469                     /* initialise cache */
7470                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7471                     regmatch_info_aux *const aux = reginfo->info_aux;
7472                     if (aux->poscache) {
7473                         if ((SSize_t)reginfo->poscache_size < size) {
7474                             Renew(aux->poscache, size, char);
7475                             reginfo->poscache_size = size;
7476                         }
7477                         Zero(aux->poscache, size, char);
7478                     }
7479                     else {
7480                         reginfo->poscache_size = size;
7481                         Newxz(aux->poscache, size, char);
7482                     }
7483                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7484       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7485                               PL_colors[4], PL_colors[5])
7486                     );
7487                 }
7488
7489                 if (reginfo->poscache_iter < 0) {
7490                     /* have we already failed at this position? */
7491                     SSize_t offset, mask;
7492
7493                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7494                     offset  = (scan->flags & 0xf) - 1
7495                                 +   (locinput - reginfo->strbeg)
7496                                   * (scan->flags>>4);
7497                     mask    = 1 << (offset % 8);
7498                     offset /= 8;
7499                     if (reginfo->info_aux->poscache[offset] & mask) {
7500                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
7501                             depth)
7502                         );
7503                         sayNO; /* cache records failure */
7504                     }
7505                     ST.cache_offset = offset;
7506                     ST.cache_mask   = mask;
7507                 }
7508             }
7509
7510             /* Prefer B over A for minimal matching. */
7511
7512             if (cur_curlyx->u.curlyx.minmod) {
7513                 ST.save_curlyx = cur_curlyx;
7514                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7515                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
7516                             maxopenparen);
7517                 REGCP_SET(ST.lastcp);
7518                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7519                                     locinput);
7520                 NOT_REACHED; /* NOTREACHED */
7521             }
7522
7523             /* Prefer A over B for maximal matching. */
7524
7525             if (n < max) { /* More greed allowed? */
7526                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7527                             maxopenparen);
7528                 cur_curlyx->u.curlyx.lastloc = locinput;
7529                 REGCP_SET(ST.lastcp);
7530                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7531                 NOT_REACHED; /* NOTREACHED */
7532             }
7533             goto do_whilem_B_max;
7534         }
7535         NOT_REACHED; /* NOTREACHED */
7536
7537         case WHILEM_B_min: /* just matched B in a minimal match */
7538         case WHILEM_B_max: /* just matched B in a maximal match */
7539             cur_curlyx = ST.save_curlyx;
7540             sayYES;
7541             NOT_REACHED; /* NOTREACHED */
7542
7543         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7544             cur_curlyx = ST.save_curlyx;
7545             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7546             cur_curlyx->u.curlyx.count--;
7547             CACHEsayNO;
7548             NOT_REACHED; /* NOTREACHED */
7549
7550         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7551             /* FALLTHROUGH */
7552         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7553             REGCP_UNWIND(ST.lastcp);
7554             regcppop(rex, &maxopenparen);
7555             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7556             cur_curlyx->u.curlyx.count--;
7557             CACHEsayNO;
7558             NOT_REACHED; /* NOTREACHED */
7559
7560         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7561             REGCP_UNWIND(ST.lastcp);
7562             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7563             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
7564                 depth)
7565             );
7566           do_whilem_B_max:
7567             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7568                 && ckWARN(WARN_REGEXP)
7569                 && !reginfo->warned)
7570             {
7571                 reginfo->warned = TRUE;
7572                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7573                      "Complex regular subexpression recursion limit (%d) "
7574                      "exceeded",
7575                      REG_INFTY - 1);
7576             }
7577
7578             /* now try B */
7579             ST.save_curlyx = cur_curlyx;
7580             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7581             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7582                                 locinput);
7583             NOT_REACHED; /* NOTREACHED */
7584
7585         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7586             cur_curlyx = ST.save_curlyx;
7587             REGCP_UNWIND(ST.lastcp);
7588             regcppop(rex, &maxopenparen);
7589
7590             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7591                 /* Maximum greed exceeded */
7592                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7593                     && ckWARN(WARN_REGEXP)
7594                     && !reginfo->warned)
7595                 {
7596                     reginfo->warned     = TRUE;
7597                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7598                         "Complex regular subexpression recursion "
7599                         "limit (%d) exceeded",
7600                         REG_INFTY - 1);
7601                 }
7602                 cur_curlyx->u.curlyx.count--;
7603                 CACHEsayNO;
7604             }
7605
7606             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
7607             );
7608             /* Try grabbing another A and see if it helps. */
7609             cur_curlyx->u.curlyx.lastloc = locinput;
7610             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7611                             maxopenparen);
7612             REGCP_SET(ST.lastcp);
7613             PUSH_STATE_GOTO(WHILEM_A_min,
7614                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7615                 locinput);
7616             NOT_REACHED; /* NOTREACHED */
7617
7618 #undef  ST
7619 #define ST st->u.branch
7620
7621         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7622             next = scan + ARG(scan);
7623             if (next == scan)
7624                 next = NULL;
7625             scan = NEXTOPER(scan);
7626             /* FALLTHROUGH */
7627
7628         case BRANCH:        /*  /(...|A|...)/ */
7629             scan = NEXTOPER(scan); /* scan now points to inner node */
7630             ST.lastparen = rex->lastparen;
7631             ST.lastcloseparen = rex->lastcloseparen;
7632             ST.next_branch = next;
7633             REGCP_SET(ST.cp);
7634
7635             /* Now go into the branch */
7636             if (has_cutgroup) {
7637                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7638             } else {
7639                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7640             }
7641             NOT_REACHED; /* NOTREACHED */
7642
7643         case CUTGROUP:  /*  /(*THEN)/  */
7644             sv_yes_mark = st->u.mark.mark_name = scan->flags
7645                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7646                 : NULL;
7647             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7648             NOT_REACHED; /* NOTREACHED */
7649
7650         case CUTGROUP_next_fail:
7651             do_cutgroup = 1;
7652             no_final = 1;
7653             if (st->u.mark.mark_name)
7654                 sv_commit = st->u.mark.mark_name;
7655             sayNO;
7656             NOT_REACHED; /* NOTREACHED */
7657
7658         case BRANCH_next:
7659             sayYES;
7660             NOT_REACHED; /* NOTREACHED */
7661
7662         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7663             if (do_cutgroup) {
7664                 do_cutgroup = 0;
7665                 no_final = 0;
7666             }
7667             REGCP_UNWIND(ST.cp);
7668             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7669             scan = ST.next_branch;
7670             /* no more branches? */
7671             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7672                 DEBUG_EXECUTE_r({
7673                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
7674                         depth,
7675                         PL_colors[4],
7676                         PL_colors[5] );
7677                 });
7678                 sayNO_SILENT;
7679             }
7680             continue; /* execute next BRANCH[J] op */
7681             /* NOTREACHED */
7682
7683         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7684             minmod = 1;
7685             break;
7686
7687 #undef  ST
7688 #define ST st->u.curlym
7689
7690         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7691
7692             /* This is an optimisation of CURLYX that enables us to push
7693              * only a single backtracking state, no matter how many matches
7694              * there are in {m,n}. It relies on the pattern being constant
7695              * length, with no parens to influence future backrefs
7696              */
7697
7698             ST.me = scan;
7699             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7700
7701             ST.lastparen      = rex->lastparen;
7702             ST.lastcloseparen = rex->lastcloseparen;
7703
7704             /* if paren positive, emulate an OPEN/CLOSE around A */
7705             if (ST.me->flags) {
7706                 U32 paren = ST.me->flags;
7707                 if (paren > maxopenparen)
7708                     maxopenparen = paren;
7709                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7710             }
7711             ST.A = scan;
7712             ST.B = next;
7713             ST.alen = 0;
7714             ST.count = 0;
7715             ST.minmod = minmod;
7716             minmod = 0;
7717             ST.c1 = CHRTEST_UNINIT;
7718             REGCP_SET(ST.cp);
7719
7720             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7721                 goto curlym_do_B;
7722
7723           curlym_do_A: /* execute the A in /A{m,n}B/  */
7724             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7725             NOT_REACHED; /* NOTREACHED */
7726
7727         case CURLYM_A: /* we've just matched an A */
7728             ST.count++;
7729             /* after first match, determine A's length: u.curlym.alen */
7730             if (ST.count == 1) {
7731                 if (reginfo->is_utf8_target) {
7732                     char *s = st->locinput;
7733                     while (s < locinput) {
7734                         ST.alen++;
7735                         s += UTF8SKIP(s);
7736                     }
7737                 }
7738                 else {
7739                     ST.alen = locinput - st->locinput;
7740                 }
7741                 if (ST.alen == 0)
7742                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7743             }
7744             DEBUG_EXECUTE_r(
7745                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7746                           depth, (IV) ST.count, (IV)ST.alen)
7747             );
7748
7749             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7750                 goto fake_end;
7751
7752             {
7753                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7754                 if ( max == REG_INFTY || ST.count < max )
7755                     goto curlym_do_A; /* try to match another A */
7756             }
7757             goto curlym_do_B; /* try to match B */
7758
7759         case CURLYM_A_fail: /* just failed to match an A */
7760             REGCP_UNWIND(ST.cp);
7761
7762
7763             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7764                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7765                 sayNO;
7766
7767           curlym_do_B: /* execute the B in /A{m,n}B/  */
7768             if (ST.c1 == CHRTEST_UNINIT) {
7769                 /* calculate c1 and c2 for possible match of 1st char
7770                  * following curly */
7771                 ST.c1 = ST.c2 = CHRTEST_VOID;
7772                 assert(ST.B);
7773                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7774                     regnode *text_node = ST.B;
7775                     if (! HAS_TEXT(text_node))
7776                         FIND_NEXT_IMPT(text_node);
7777                     /* this used to be
7778
7779                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7780
7781                         But the former is redundant in light of the latter.
7782
7783                         if this changes back then the macro for
7784                         IS_TEXT and friends need to change.
7785                      */
7786                     if (PL_regkind[OP(text_node)] == EXACT) {
7787                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7788                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7789                            reginfo))
7790                         {
7791                             sayNO;
7792                         }
7793                     }
7794                 }
7795             }
7796
7797             DEBUG_EXECUTE_r(
7798                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%"IVdf"...\n",
7799                     depth, (IV)ST.count)
7800                 );
7801             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7802                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7803                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7804                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7805                     {
7806                         /* simulate B failing */
7807                         DEBUG_OPTIMISE_r(
7808                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7809                                 depth,
7810                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7811                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7812                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7813                         );
7814                         state_num = CURLYM_B_fail;
7815                         goto reenter_switch;
7816                     }
7817                 }
7818                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7819                     /* simulate B failing */
7820                     DEBUG_OPTIMISE_r(
7821                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7822                             depth,
7823                             (int) nextchr, ST.c1, ST.c2)
7824                     );
7825                     state_num = CURLYM_B_fail;
7826                     goto reenter_switch;
7827                 }
7828             }
7829
7830             if (ST.me->flags) {
7831                 /* emulate CLOSE: mark current A as captured */
7832                 I32 paren = ST.me->flags;
7833                 if (ST.count) {
7834                     rex->offs[paren].start
7835                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7836                     rex->offs[paren].end = locinput - reginfo->strbeg;
7837                     if ((U32)paren > rex->lastparen)
7838                         rex->lastparen = paren;
7839                     rex->lastcloseparen = paren;
7840                 }
7841                 else
7842                     rex->offs[paren].end = -1;
7843
7844                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7845                 {
7846                     if (ST.count)
7847                         goto fake_end;
7848                     else
7849                         sayNO;
7850                 }
7851             }
7852
7853             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7854             NOT_REACHED; /* NOTREACHED */
7855
7856         case CURLYM_B_fail: /* just failed to match a B */
7857             REGCP_UNWIND(ST.cp);
7858             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7859             if (ST.minmod) {
7860                 I32 max = ARG2(ST.me);
7861                 if (max != REG_INFTY && ST.count == max)
7862                     sayNO;
7863                 goto curlym_do_A; /* try to match a further A */
7864             }
7865             /* backtrack one A */
7866             if (ST.count == ARG1(ST.me) /* min */)
7867                 sayNO;
7868             ST.count--;
7869             SET_locinput(HOPc(locinput, -ST.alen));
7870             goto curlym_do_B; /* try to match B */
7871
7872 #undef ST
7873 #define ST st->u.curly
7874
7875 #define CURLY_SETPAREN(paren, success) \
7876     if (paren) { \
7877         if (success) { \
7878             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7879             rex->offs[paren].end = locinput - reginfo->strbeg; \
7880             if (paren > rex->lastparen) \
7881                 rex->lastparen = paren; \
7882             rex->lastcloseparen = paren; \
7883         } \
7884         else { \
7885             rex->offs[paren].end = -1; \
7886             rex->lastparen      = ST.lastparen; \
7887             rex->lastcloseparen = ST.lastcloseparen; \
7888         } \
7889     }
7890
7891         case STAR:              /*  /A*B/ where A is width 1 char */
7892             ST.paren = 0;
7893             ST.min = 0;
7894             ST.max = REG_INFTY;
7895             scan = NEXTOPER(scan);
7896             goto repeat;
7897
7898         case PLUS:              /*  /A+B/ where A is width 1 char */
7899             ST.paren = 0;
7900             ST.min = 1;
7901             ST.max = REG_INFTY;
7902             scan = NEXTOPER(scan);
7903             goto repeat;
7904
7905         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7906             ST.paren = scan->flags;     /* Which paren to set */
7907             ST.lastparen      = rex->lastparen;
7908             ST.lastcloseparen = rex->lastcloseparen;
7909             if (ST.paren > maxopenparen)
7910                 maxopenparen = ST.paren;
7911             ST.min = ARG1(scan);  /* min to match */
7912             ST.max = ARG2(scan);  /* max to match */
7913             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7914             {
7915                 ST.min=1;
7916                 ST.max=1;
7917             }
7918             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7919             goto repeat;
7920
7921         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7922             ST.paren = 0;
7923             ST.min = ARG1(scan);  /* min to match */
7924             ST.max = ARG2(scan);  /* max to match */
7925             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7926           repeat:
7927             /*
7928             * Lookahead to avoid useless match attempts
7929             * when we know what character comes next.
7930             *
7931             * Used to only do .*x and .*?x, but now it allows
7932             * for )'s, ('s and (?{ ... })'s to be in the way
7933             * of the quantifier and the EXACT-like node.  -- japhy
7934             */
7935
7936             assert(ST.min <= ST.max);
7937             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7938                 ST.c1 = ST.c2 = CHRTEST_VOID;
7939             }
7940             else {
7941                 regnode *text_node = next;
7942
7943                 if (! HAS_TEXT(text_node))
7944                     FIND_NEXT_IMPT(text_node);
7945
7946                 if (! HAS_TEXT(text_node))
7947                     ST.c1 = ST.c2 = CHRTEST_VOID;
7948                 else {
7949                     if ( PL_regkind[OP(text_node)] != EXACT ) {
7950                         ST.c1 = ST.c2 = CHRTEST_VOID;
7951                     }
7952                     else {
7953
7954                     /*  Currently we only get here when
7955
7956                         PL_rekind[OP(text_node)] == EXACT
7957
7958                         if this changes back then the macro for IS_TEXT and
7959                         friends need to change. */
7960                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7961                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7962                            reginfo))
7963                         {
7964                             sayNO;
7965                         }
7966                     }
7967                 }
7968             }
7969
7970             ST.A = scan;
7971             ST.B = next;
7972             if (minmod) {
7973                 char *li = locinput;
7974                 minmod = 0;
7975                 if (ST.min &&
7976                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
7977                             < ST.min)
7978                     sayNO;
7979                 SET_locinput(li);
7980                 ST.count = ST.min;
7981                 REGCP_SET(ST.cp);
7982                 if (ST.c1 == CHRTEST_VOID)
7983                     goto curly_try_B_min;
7984
7985                 ST.oldloc = locinput;
7986
7987                 /* set ST.maxpos to the furthest point along the
7988                  * string that could possibly match */
7989                 if  (ST.max == REG_INFTY) {
7990                     ST.maxpos = reginfo->strend - 1;
7991                     if (utf8_target)
7992                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7993                             ST.maxpos--;
7994                 }
7995                 else if (utf8_target) {
7996                     int m = ST.max - ST.min;
7997                     for (ST.maxpos = locinput;
7998                          m >0 && ST.maxpos < reginfo->strend; m--)
7999                         ST.maxpos += UTF8SKIP(ST.maxpos);
8000                 }
8001                 else {
8002                     ST.maxpos = locinput + ST.max - ST.min;
8003                     if (ST.maxpos >= reginfo->strend)
8004                         ST.maxpos = reginfo->strend - 1;
8005                 }
8006                 goto curly_try_B_min_known;
8007
8008             }
8009             else {
8010                 /* avoid taking address of locinput, so it can remain
8011                  * a register var */
8012                 char *li = locinput;
8013                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8014                 if (ST.count < ST.min)
8015                     sayNO;
8016                 SET_locinput(li);
8017                 if ((ST.count > ST.min)
8018                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8019                 {
8020                     /* A{m,n} must come at the end of the string, there's
8021                      * no point in backing off ... */
8022                     ST.min = ST.count;
8023                     /* ...except that $ and \Z can match before *and* after
8024                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8025                        We may back off by one in this case. */
8026                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8027                         ST.min--;
8028                 }
8029                 REGCP_SET(ST.cp);
8030                 goto curly_try_B_max;
8031             }
8032             NOT_REACHED; /* NOTREACHED */
8033
8034         case CURLY_B_min_known_fail:
8035             /* failed to find B in a non-greedy match where c1,c2 valid */
8036
8037             REGCP_UNWIND(ST.cp);
8038             if (ST.paren) {
8039                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8040             }
8041             /* Couldn't or didn't -- move forward. */
8042             ST.oldloc = locinput;
8043             if (utf8_target)
8044                 locinput += UTF8SKIP(locinput);
8045             else
8046                 locinput++;
8047             ST.count++;
8048           curly_try_B_min_known:
8049              /* find the next place where 'B' could work, then call B */
8050             {
8051                 int n;
8052                 if (utf8_target) {
8053                     n = (ST.oldloc == locinput) ? 0 : 1;
8054                     if (ST.c1 == ST.c2) {
8055                         /* set n to utf8_distance(oldloc, locinput) */
8056                         while (locinput <= ST.maxpos
8057                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8058                         {
8059                             locinput += UTF8SKIP(locinput);
8060                             n++;
8061                         }
8062                     }
8063                     else {
8064                         /* set n to utf8_distance(oldloc, locinput) */
8065                         while (locinput <= ST.maxpos
8066                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8067                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8068                         {
8069                             locinput += UTF8SKIP(locinput);
8070                             n++;
8071                         }
8072                     }
8073                 }
8074                 else {  /* Not utf8_target */
8075                     if (ST.c1 == ST.c2) {
8076                         while (locinput <= ST.maxpos &&
8077                                UCHARAT(locinput) != ST.c1)
8078                             locinput++;
8079                     }
8080                     else {
8081                         while (locinput <= ST.maxpos
8082                                && UCHARAT(locinput) != ST.c1
8083                                && UCHARAT(locinput) != ST.c2)
8084                             locinput++;
8085                     }
8086                     n = locinput - ST.oldloc;
8087                 }
8088                 if (locinput > ST.maxpos)
8089                     sayNO;
8090                 if (n) {
8091                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8092                      * at b; check that everything between oldloc and
8093                      * locinput matches */
8094                     char *li = ST.oldloc;
8095                     ST.count += n;
8096                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8097                         sayNO;
8098                     assert(n == REG_INFTY || locinput == li);
8099                 }
8100                 CURLY_SETPAREN(ST.paren, ST.count);
8101                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8102                     goto fake_end;
8103                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8104             }
8105             NOT_REACHED; /* NOTREACHED */
8106
8107         case CURLY_B_min_fail:
8108             /* failed to find B in a non-greedy match where c1,c2 invalid */
8109
8110             REGCP_UNWIND(ST.cp);
8111             if (ST.paren) {
8112                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8113             }
8114             /* failed -- move forward one */
8115             {
8116                 char *li = locinput;
8117                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8118                     sayNO;
8119                 }
8120                 locinput = li;
8121             }
8122             {
8123                 ST.count++;
8124                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8125                         ST.count > 0)) /* count overflow ? */
8126                 {
8127                   curly_try_B_min:
8128                     CURLY_SETPAREN(ST.paren, ST.count);
8129                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8130                         goto fake_end;
8131                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8132                 }
8133             }
8134             sayNO;
8135             NOT_REACHED; /* NOTREACHED */
8136
8137           curly_try_B_max:
8138             /* a successful greedy match: now try to match B */
8139             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8140                 goto fake_end;
8141             {
8142                 bool could_match = locinput < reginfo->strend;
8143
8144                 /* If it could work, try it. */
8145                 if (ST.c1 != CHRTEST_VOID && could_match) {
8146                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8147                     {
8148                         could_match = memEQ(locinput,
8149                                             ST.c1_utf8,
8150                                             UTF8SKIP(locinput))
8151                                     || memEQ(locinput,
8152                                              ST.c2_utf8,
8153                                              UTF8SKIP(locinput));
8154                     }
8155                     else {
8156                         could_match = UCHARAT(locinput) == ST.c1
8157                                       || UCHARAT(locinput) == ST.c2;
8158                     }
8159                 }
8160                 if (ST.c1 == CHRTEST_VOID || could_match) {
8161                     CURLY_SETPAREN(ST.paren, ST.count);
8162                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8163                     NOT_REACHED; /* NOTREACHED */
8164                 }
8165             }
8166             /* FALLTHROUGH */
8167
8168         case CURLY_B_max_fail:
8169             /* failed to find B in a greedy match */
8170
8171             REGCP_UNWIND(ST.cp);
8172             if (ST.paren) {
8173                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8174             }
8175             /*  back up. */
8176             if (--ST.count < ST.min)
8177                 sayNO;
8178             locinput = HOPc(locinput, -1);
8179             goto curly_try_B_max;
8180
8181 #undef ST
8182
8183         case END: /*  last op of main pattern  */
8184           fake_end:
8185             if (cur_eval) {
8186                 /* we've just finished A in /(??{A})B/; now continue with B */
8187                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8188                 st->u.eval.prev_rex = rex_sv;           /* inner */
8189
8190                 /* Save *all* the positions. */
8191                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8192                 rex_sv = CUR_EVAL.prev_rex;
8193                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8194                 SET_reg_curpm(rex_sv);
8195                 rex = ReANY(rex_sv);
8196                 rexi = RXi_GET(rex);
8197
8198                 st->u.eval.prev_curlyx = cur_curlyx;
8199                 cur_curlyx = CUR_EVAL.prev_curlyx;
8200
8201                 REGCP_SET(st->u.eval.lastcp);
8202
8203                 /* Restore parens of the outer rex without popping the
8204                  * savestack */
8205                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8206
8207                 st->u.eval.prev_eval = cur_eval;
8208                 cur_eval = CUR_EVAL.prev_eval;
8209                 DEBUG_EXECUTE_r(
8210                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8211                                       depth, cur_eval););
8212                 if ( nochange_depth )
8213                     nochange_depth--;
8214
8215                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8216
8217                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
8218                                     locinput); /* match B */
8219             }
8220
8221             if (locinput < reginfo->till) {
8222                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8223                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8224                                       PL_colors[4],
8225                                       (long)(locinput - startpos),
8226                                       (long)(reginfo->till - startpos),
8227                                       PL_colors[5]));
8228
8229                 sayNO_SILENT;           /* Cannot match: too short. */
8230             }
8231             sayYES;                     /* Success! */
8232
8233         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8234             DEBUG_EXECUTE_r(
8235             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8236                 depth, PL_colors[4], PL_colors[5]));
8237             sayYES;                     /* Success! */
8238
8239 #undef  ST
8240 #define ST st->u.ifmatch
8241
8242         {
8243             char *newstart;
8244
8245         case SUSPEND:   /* (?>A) */
8246             ST.wanted = 1;
8247             newstart = locinput;
8248             goto do_ifmatch;
8249
8250         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8251             ST.wanted = 0;
8252             goto ifmatch_trivial_fail_test;
8253
8254         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8255             ST.wanted = 1;
8256           ifmatch_trivial_fail_test:
8257             if (scan->flags) {
8258                 char * const s = HOPBACKc(locinput, scan->flags);
8259                 if (!s) {
8260                     /* trivial fail */
8261                     if (logical) {
8262                         logical = 0;
8263                         sw = 1 - cBOOL(ST.wanted);
8264                     }
8265                     else if (ST.wanted)
8266                         sayNO;
8267                     next = scan + ARG(scan);
8268                     if (next == scan)
8269                         next = NULL;
8270                     break;
8271                 }
8272                 newstart = s;
8273             }
8274             else
8275                 newstart = locinput;
8276
8277           do_ifmatch:
8278             ST.me = scan;
8279             ST.logical = logical;
8280             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8281
8282             /* execute body of (?...A) */
8283             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8284             NOT_REACHED; /* NOTREACHED */
8285         }
8286
8287         case IFMATCH_A_fail: /* body of (?...A) failed */
8288             ST.wanted = !ST.wanted;
8289             /* FALLTHROUGH */
8290
8291         case IFMATCH_A: /* body of (?...A) succeeded */
8292             if (ST.logical) {
8293                 sw = cBOOL(ST.wanted);
8294             }
8295             else if (!ST.wanted)
8296                 sayNO;
8297
8298             if (OP(ST.me) != SUSPEND) {
8299                 /* restore old position except for (?>...) */
8300                 locinput = st->locinput;
8301             }
8302             scan = ST.me + ARG(ST.me);
8303             if (scan == ST.me)
8304                 scan = NULL;
8305             continue; /* execute B */
8306
8307 #undef ST
8308
8309         case LONGJMP: /*  alternative with many branches compiles to
8310                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8311             next = scan + ARG(scan);
8312             if (next == scan)
8313                 next = NULL;
8314             break;
8315
8316         case COMMIT:  /*  (*COMMIT)  */
8317             reginfo->cutpoint = reginfo->strend;
8318             /* FALLTHROUGH */
8319
8320         case PRUNE:   /*  (*PRUNE)   */
8321             if (scan->flags)
8322                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8323             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8324             NOT_REACHED; /* NOTREACHED */
8325
8326         case COMMIT_next_fail:
8327             no_final = 1;
8328             /* FALLTHROUGH */
8329             sayNO;
8330             NOT_REACHED; /* NOTREACHED */
8331
8332         case OPFAIL:   /* (*FAIL)  */
8333             if (scan->flags)
8334                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8335             if (logical) {
8336                 /* deal with (?(?!)X|Y) properly,
8337                  * make sure we trigger the no branch
8338                  * of the trailing IFTHEN structure*/
8339                 sw= 0;
8340                 break;
8341             } else {
8342                 sayNO;
8343             }
8344             NOT_REACHED; /* NOTREACHED */
8345
8346 #define ST st->u.mark
8347         case MARKPOINT: /*  (*MARK:foo)  */
8348             ST.prev_mark = mark_state;
8349             ST.mark_name = sv_commit = sv_yes_mark
8350                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8351             mark_state = st;
8352             ST.mark_loc = locinput;
8353             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8354             NOT_REACHED; /* NOTREACHED */
8355
8356         case MARKPOINT_next:
8357             mark_state = ST.prev_mark;
8358             sayYES;
8359             NOT_REACHED; /* NOTREACHED */
8360
8361         case MARKPOINT_next_fail:
8362             if (popmark && sv_eq(ST.mark_name,popmark))
8363             {
8364                 if (ST.mark_loc > startpoint)
8365                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8366                 popmark = NULL; /* we found our mark */
8367                 sv_commit = ST.mark_name;
8368
8369                 DEBUG_EXECUTE_r({
8370                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%"SVf"...%s\n",
8371                             depth,
8372                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8373                 });
8374             }
8375             mark_state = ST.prev_mark;
8376             sv_yes_mark = mark_state ?
8377                 mark_state->u.mark.mark_name : NULL;
8378             sayNO;
8379             NOT_REACHED; /* NOTREACHED */
8380
8381         case SKIP:  /*  (*SKIP)  */
8382             if (!scan->flags) {
8383                 /* (*SKIP) : if we fail we cut here*/
8384                 ST.mark_name = NULL;
8385                 ST.mark_loc = locinput;
8386                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8387             } else {
8388                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8389                    otherwise do nothing.  Meaning we need to scan
8390                  */
8391                 regmatch_state *cur = mark_state;
8392                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8393
8394                 while (cur) {
8395                     if ( sv_eq( cur->u.mark.mark_name,
8396                                 find ) )
8397                     {
8398                         ST.mark_name = find;
8399                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8400                     }
8401                     cur = cur->u.mark.prev_mark;
8402                 }
8403             }
8404             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8405             break;
8406
8407         case SKIP_next_fail:
8408             if (ST.mark_name) {
8409                 /* (*CUT:NAME) - Set up to search for the name as we
8410                    collapse the stack*/
8411                 popmark = ST.mark_name;
8412             } else {
8413                 /* (*CUT) - No name, we cut here.*/
8414                 if (ST.mark_loc > startpoint)
8415                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8416                 /* but we set sv_commit to latest mark_name if there
8417                    is one so they can test to see how things lead to this
8418                    cut */
8419                 if (mark_state)
8420                     sv_commit=mark_state->u.mark.mark_name;
8421             }
8422             no_final = 1;
8423             sayNO;
8424             NOT_REACHED; /* NOTREACHED */
8425 #undef ST
8426
8427         case LNBREAK: /* \R */
8428             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8429                 locinput += n;
8430             } else
8431                 sayNO;
8432             break;
8433
8434         default:
8435             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
8436                           PTR2UV(scan), OP(scan));
8437             Perl_croak(aTHX_ "regexp memory corruption");
8438
8439         /* this is a point to jump to in order to increment
8440          * locinput by one character */
8441           increment_locinput:
8442             assert(!NEXTCHR_IS_EOS);
8443             if (utf8_target) {
8444                 locinput += PL_utf8skip[nextchr];
8445                 /* locinput is allowed to go 1 char off the end, but not 2+ */
8446                 if (locinput > reginfo->strend)
8447                     sayNO;
8448             }
8449             else
8450                 locinput++;
8451             break;
8452
8453         } /* end switch */
8454
8455         /* switch break jumps here */
8456         scan = next; /* prepare to execute the next op and ... */
8457         continue;    /* ... jump back to the top, reusing st */
8458         /* NOTREACHED */
8459
8460       push_yes_state:
8461         /* push a state that backtracks on success */
8462         st->u.yes.prev_yes_state = yes_state;
8463         yes_state = st;
8464         /* FALLTHROUGH */
8465       push_state:
8466         /* push a new regex state, then continue at scan  */
8467         {
8468             regmatch_state *newst;
8469
8470             DEBUG_STACK_r({
8471                 regmatch_state *cur = st;
8472                 regmatch_state *curyes = yes_state;
8473                 int curd = depth;
8474                 regmatch_slab *slab = PL_regmatch_slab;
8475                 for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
8476                     if (cur < SLAB_FIRST(slab)) {
8477                         slab = slab->prev;
8478                         cur = SLAB_LAST(slab);
8479                     }
8480                     Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
8481                         depth,
8482                         curd, PL_reg_name[cur->resume_state],
8483                         (curyes == cur) ? "yes" : ""
8484                     );
8485                     if (curyes == cur)
8486                         curyes = cur->u.yes.prev_yes_state;
8487                 }
8488             } else
8489                 DEBUG_STATE_pp("push")
8490             );
8491             depth++;
8492             st->locinput = locinput;
8493             newst = st+1;
8494             if (newst >  SLAB_LAST(PL_regmatch_slab))
8495                 newst = S_push_slab(aTHX);
8496             PL_regmatch_state = newst;
8497
8498             locinput = pushinput;
8499             st = newst;
8500             continue;
8501             /* NOTREACHED */
8502         }
8503     }
8504 #ifdef SOLARIS_BAD_OPTIMIZER
8505 #  undef PL_charclass
8506 #endif
8507
8508     /*
8509     * We get here only if there's trouble -- normally "case END" is
8510     * the terminating point.
8511     */
8512     Perl_croak(aTHX_ "corrupted regexp pointers");
8513     NOT_REACHED; /* NOTREACHED */
8514
8515   yes:
8516     if (yes_state) {
8517         /* we have successfully completed a subexpression, but we must now
8518          * pop to the state marked by yes_state and continue from there */
8519         assert(st != yes_state);
8520 #ifdef DEBUGGING
8521         while (st != yes_state) {
8522             st--;
8523             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8524                 PL_regmatch_slab = PL_regmatch_slab->prev;
8525                 st = SLAB_LAST(PL_regmatch_slab);
8526             }
8527             DEBUG_STATE_r({
8528                 if (no_final) {
8529                     DEBUG_STATE_pp("pop (no final)");
8530                 } else {
8531                     DEBUG_STATE_pp("pop (yes)");
8532                 }
8533             });
8534             depth--;
8535         }
8536 #else
8537         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8538             || yes_state > SLAB_LAST(PL_regmatch_slab))
8539         {
8540             /* not in this slab, pop slab */
8541             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8542             PL_regmatch_slab = PL_regmatch_slab->prev;
8543             st = SLAB_LAST(PL_regmatch_slab);
8544         }
8545         depth -= (st - yes_state);
8546 #endif
8547         st = yes_state;
8548         yes_state = st->u.yes.prev_yes_state;
8549         PL_regmatch_state = st;
8550
8551         if (no_final)
8552             locinput= st->locinput;
8553         state_num = st->resume_state + no_final;
8554         goto reenter_switch;
8555     }
8556
8557     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8558                           PL_colors[4], PL_colors[5]));
8559
8560     if (reginfo->info_aux_eval) {
8561         /* each successfully executed (?{...}) block does the equivalent of
8562          *   local $^R = do {...}
8563          * When popping the save stack, all these locals would be undone;
8564          * bypass this by setting the outermost saved $^R to the latest
8565          * value */
8566         /* I dont know if this is needed or works properly now.
8567          * see code related to PL_replgv elsewhere in this file.
8568          * Yves
8569          */
8570         if (oreplsv != GvSV(PL_replgv))
8571             sv_setsv(oreplsv, GvSV(PL_replgv));
8572     }
8573     result = 1;
8574     goto final_exit;
8575
8576   no:
8577     DEBUG_EXECUTE_r(
8578         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
8579             depth,
8580             PL_colors[4], PL_colors[5])
8581         );
8582
8583   no_silent:
8584     if (no_final) {
8585         if (yes_state) {
8586             goto yes;
8587         } else {
8588             goto final_exit;
8589         }
8590     }
8591     if (depth) {
8592         /* there's a previous state to backtrack to */
8593         st--;
8594         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8595             PL_regmatch_slab = PL_regmatch_slab->prev;
8596             st = SLAB_LAST(PL_regmatch_slab);
8597         }
8598         PL_regmatch_state = st;
8599         locinput= st->locinput;
8600
8601         DEBUG_STATE_pp("pop");
8602         depth--;
8603         if (yes_state == st)
8604             yes_state = st->u.yes.prev_yes_state;
8605
8606         state_num = st->resume_state + 1; /* failure = success + 1 */
8607         PERL_ASYNC_CHECK();
8608         goto reenter_switch;
8609     }
8610     result = 0;
8611
8612   final_exit:
8613     if (rex->intflags & PREGf_VERBARG_SEEN) {
8614         SV *sv_err = get_sv("REGERROR", 1);
8615         SV *sv_mrk = get_sv("REGMARK", 1);
8616         if (result) {
8617             sv_commit = &PL_sv_no;
8618             if (!sv_yes_mark)
8619                 sv_yes_mark = &PL_sv_yes;
8620         } else {
8621             if (!sv_commit)
8622                 sv_commit = &PL_sv_yes;
8623             sv_yes_mark = &PL_sv_no;
8624         }
8625         assert(sv_err);
8626         assert(sv_mrk);
8627         sv_setsv(sv_err, sv_commit);
8628         sv_setsv(sv_mrk, sv_yes_mark);
8629     }
8630
8631
8632     if (last_pushed_cv) {
8633         dSP;
8634         POP_MULTICALL;
8635         PERL_UNUSED_VAR(SP);
8636     }
8637
8638     assert(!result ||  locinput - reginfo->strbeg >= 0);
8639     return result ?  locinput - reginfo->strbeg : -1;
8640 }
8641
8642 /*
8643  - regrepeat - repeatedly match something simple, report how many
8644  *
8645  * What 'simple' means is a node which can be the operand of a quantifier like
8646  * '+', or {1,3}
8647  *
8648  * startposp - pointer a pointer to the start position.  This is updated
8649  *             to point to the byte following the highest successful
8650  *             match.
8651  * p         - the regnode to be repeatedly matched against.
8652  * reginfo   - struct holding match state, such as strend
8653  * max       - maximum number of things to match.
8654  * depth     - (for debugging) backtracking depth.
8655  */
8656 STATIC I32
8657 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8658             regmatch_info *const reginfo, I32 max _pDEPTH)
8659 {
8660     char *scan;     /* Pointer to current position in target string */
8661     I32 c;
8662     char *loceol = reginfo->strend;   /* local version */
8663     I32 hardcount = 0;  /* How many matches so far */
8664     bool utf8_target = reginfo->is_utf8_target;
8665     unsigned int to_complement = 0;  /* Invert the result? */
8666     UV utf8_flags;
8667     _char_class_number classnum;
8668
8669     PERL_ARGS_ASSERT_REGREPEAT;
8670
8671     scan = *startposp;
8672     if (max == REG_INFTY)
8673         max = I32_MAX;
8674     else if (! utf8_target && loceol - scan > max)
8675         loceol = scan + max;
8676
8677     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8678      * to the maximum of how far we should go in it (leaving it set to the real
8679      * end, if the maximum permissible would take us beyond that).  This allows
8680      * us to make the loop exit condition that we haven't gone past <loceol> to
8681      * also mean that we haven't exceeded the max permissible count, saving a
8682      * test each time through the loop.  But it assumes that the OP matches a
8683      * single byte, which is true for most of the OPs below when applied to a
8684      * non-UTF-8 target.  Those relatively few OPs that don't have this
8685      * characteristic will have to compensate.
8686      *
8687      * There is no adjustment for UTF-8 targets, as the number of bytes per
8688      * character varies.  OPs will have to test both that the count is less
8689      * than the max permissible (using <hardcount> to keep track), and that we
8690      * are still within the bounds of the string (using <loceol>.  A few OPs
8691      * match a single byte no matter what the encoding.  They can omit the max
8692      * test if, for the UTF-8 case, they do the adjustment that was skipped
8693      * above.
8694      *
8695      * Thus, the code above sets things up for the common case; and exceptional
8696      * cases need extra work; the common case is to make sure <scan> doesn't
8697      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8698      * count doesn't exceed the maximum permissible */
8699
8700     switch (OP(p)) {
8701     case REG_ANY:
8702         if (utf8_target) {
8703             while (scan < loceol && hardcount < max && *scan != '\n') {
8704                 scan += UTF8SKIP(scan);
8705                 hardcount++;
8706             }
8707         } else {
8708             while (scan < loceol && *scan != '\n')
8709                 scan++;
8710         }
8711         break;
8712     case SANY:
8713         if (utf8_target) {
8714             while (scan < loceol && hardcount < max) {
8715                 scan += UTF8SKIP(scan);
8716                 hardcount++;
8717             }
8718         }
8719         else
8720             scan = loceol;
8721         break;
8722     case EXACTL:
8723         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8724         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8725             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8726         }
8727         /* FALLTHROUGH */
8728     case EXACT:
8729         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8730
8731         c = (U8)*STRING(p);
8732
8733         /* Can use a simple loop if the pattern char to match on is invariant
8734          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8735          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8736          * true iff it doesn't matter if the argument is in UTF-8 or not */
8737         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8738             if (utf8_target && loceol - scan > max) {
8739                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8740                  * since here, to match at all, 1 char == 1 byte */
8741                 loceol = scan + max;
8742             }
8743             while (scan < loceol && UCHARAT(scan) == c) {
8744                 scan++;
8745             }
8746         }
8747         else if (reginfo->is_utf8_pat) {
8748             if (utf8_target) {
8749                 STRLEN scan_char_len;
8750
8751                 /* When both target and pattern are UTF-8, we have to do
8752                  * string EQ */
8753                 while (hardcount < max
8754                        && scan < loceol
8755                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8756                        && memEQ(scan, STRING(p), scan_char_len))
8757                 {
8758                     scan += scan_char_len;
8759                     hardcount++;
8760                 }
8761             }
8762             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8763
8764                 /* Target isn't utf8; convert the character in the UTF-8
8765                  * pattern to non-UTF8, and do a simple loop */
8766                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8767                 while (scan < loceol && UCHARAT(scan) == c) {
8768                     scan++;
8769                 }
8770             } /* else pattern char is above Latin1, can't possibly match the
8771                  non-UTF-8 target */
8772         }
8773         else {
8774
8775             /* Here, the string must be utf8; pattern isn't, and <c> is
8776              * different in utf8 than not, so can't compare them directly.
8777              * Outside the loop, find the two utf8 bytes that represent c, and
8778              * then look for those in sequence in the utf8 string */
8779             U8 high = UTF8_TWO_BYTE_HI(c);
8780             U8 low = UTF8_TWO_BYTE_LO(c);
8781
8782             while (hardcount < max
8783                     && scan + 1 < loceol
8784                     && UCHARAT(scan) == high
8785                     && UCHARAT(scan + 1) == low)
8786             {
8787                 scan += 2;
8788                 hardcount++;
8789             }
8790         }
8791         break;
8792
8793     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8794         assert(! reginfo->is_utf8_pat);
8795         /* FALLTHROUGH */
8796     case EXACTFA:
8797         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8798         goto do_exactf;
8799
8800     case EXACTFL:
8801         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8802         utf8_flags = FOLDEQ_LOCALE;
8803         goto do_exactf;
8804
8805     case EXACTF:   /* This node only generated for non-utf8 patterns */
8806         assert(! reginfo->is_utf8_pat);
8807         utf8_flags = 0;
8808         goto do_exactf;
8809
8810     case EXACTFLU8:
8811         if (! utf8_target) {
8812             break;
8813         }
8814         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8815                                     | FOLDEQ_S2_FOLDS_SANE;
8816         goto do_exactf;
8817
8818     case EXACTFU_SS:
8819     case EXACTFU:
8820         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8821
8822       do_exactf: {
8823         int c1, c2;
8824         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8825
8826         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8827
8828         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8829                                         reginfo))
8830         {
8831             if (c1 == CHRTEST_VOID) {
8832                 /* Use full Unicode fold matching */
8833                 char *tmpeol = reginfo->strend;
8834                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8835                 while (hardcount < max
8836                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8837                                              STRING(p), NULL, pat_len,
8838                                              reginfo->is_utf8_pat, utf8_flags))
8839                 {
8840                     scan = tmpeol;
8841                     tmpeol = reginfo->strend;
8842                     hardcount++;
8843                 }
8844             }
8845             else if (utf8_target) {
8846                 if (c1 == c2) {
8847                     while (scan < loceol
8848                            && hardcount < max
8849                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8850                     {
8851                         scan += UTF8SKIP(scan);
8852                         hardcount++;
8853                     }
8854                 }
8855                 else {
8856                     while (scan < loceol
8857                            && hardcount < max
8858                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8859                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8860                     {
8861                         scan += UTF8SKIP(scan);
8862                         hardcount++;
8863                     }
8864                 }
8865             }
8866             else if (c1 == c2) {
8867                 while (scan < loceol && UCHARAT(scan) == c1) {
8868                     scan++;
8869                 }
8870             }
8871             else {
8872                 while (scan < loceol &&
8873                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8874                 {
8875                     scan++;
8876                 }
8877             }
8878         }
8879         break;
8880     }
8881     case ANYOFL:
8882         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8883
8884         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8885             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8886         }
8887         /* FALLTHROUGH */
8888     case ANYOFD:
8889     case ANYOF:
8890         if (utf8_target) {
8891             while (hardcount < max
8892                    && scan < loceol
8893                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8894             {
8895                 scan += UTF8SKIP(scan);
8896                 hardcount++;
8897             }
8898         } else {
8899             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan, 0))
8900                 scan++;
8901         }
8902         break;
8903
8904     /* The argument (FLAGS) to all the POSIX node types is the class number */
8905
8906     case NPOSIXL:
8907         to_complement = 1;
8908         /* FALLTHROUGH */
8909
8910     case POSIXL:
8911         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8912         if (! utf8_target) {
8913             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8914                                                                    *scan)))
8915             {
8916                 scan++;
8917             }
8918         } else {
8919             while (hardcount < max && scan < loceol
8920                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8921                                                                   (U8 *) scan)))
8922             {
8923                 scan += UTF8SKIP(scan);
8924                 hardcount++;
8925             }
8926         }
8927         break;
8928
8929     case POSIXD:
8930         if (utf8_target) {
8931             goto utf8_posix;
8932         }
8933         /* FALLTHROUGH */
8934
8935     case POSIXA:
8936         if (utf8_target && loceol - scan > max) {
8937
8938             /* We didn't adjust <loceol> at the beginning of this routine
8939              * because is UTF-8, but it is actually ok to do so, since here, to
8940              * match, 1 char == 1 byte. */
8941             loceol = scan + max;
8942         }
8943         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8944             scan++;
8945         }
8946         break;
8947
8948     case NPOSIXD:
8949         if (utf8_target) {
8950             to_complement = 1;
8951             goto utf8_posix;
8952         }
8953         /* FALLTHROUGH */
8954
8955     case NPOSIXA:
8956         if (! utf8_target) {
8957             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8958                 scan++;
8959             }
8960         }
8961         else {
8962
8963             /* The complement of something that matches only ASCII matches all
8964              * non-ASCII, plus everything in ASCII that isn't in the class. */
8965             while (hardcount < max && scan < loceol
8966                    && (! isASCII_utf8(scan)
8967                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8968             {
8969                 scan += UTF8SKIP(scan);
8970                 hardcount++;
8971             }
8972         }
8973         break;
8974
8975     case NPOSIXU:
8976         to_complement = 1;
8977         /* FALLTHROUGH */
8978
8979     case POSIXU:
8980         if (! utf8_target) {
8981             while (scan < loceol && to_complement
8982                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8983             {
8984                 scan++;
8985             }
8986         }
8987         else {
8988           utf8_posix:
8989             classnum = (_char_class_number) FLAGS(p);
8990             if (classnum < _FIRST_NON_SWASH_CC) {
8991
8992                 /* Here, a swash is needed for above-Latin1 code points.
8993                  * Process as many Latin1 code points using the built-in rules.
8994                  * Go to another loop to finish processing upon encountering
8995                  * the first Latin1 code point.  We could do that in this loop
8996                  * as well, but the other way saves having to test if the swash
8997                  * has been loaded every time through the loop: extra space to
8998                  * save a test. */
8999                 while (hardcount < max && scan < loceol) {
9000                     if (UTF8_IS_INVARIANT(*scan)) {
9001                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9002                                                                    classnum))))
9003                         {
9004                             break;
9005                         }
9006                         scan++;
9007                     }
9008                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9009                         if (! (to_complement
9010                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9011                                                                      *(scan + 1)),
9012                                                     classnum))))
9013                         {
9014                             break;
9015                         }
9016                         scan += 2;
9017                     }
9018                     else {
9019                         goto found_above_latin1;
9020                     }
9021
9022                     hardcount++;
9023                 }
9024             }
9025             else {
9026                 /* For these character classes, the knowledge of how to handle
9027                  * every code point is compiled in to Perl via a macro.  This
9028                  * code is written for making the loops as tight as possible.
9029                  * It could be refactored to save space instead */
9030                 switch (classnum) {
9031                     case _CC_ENUM_SPACE:
9032                         while (hardcount < max
9033                                && scan < loceol
9034                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
9035                         {
9036                             scan += UTF8SKIP(scan);
9037                             hardcount++;
9038                         }
9039                         break;
9040                     case _CC_ENUM_BLANK:
9041                         while (hardcount < max
9042                                && scan < loceol
9043                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
9044                         {
9045                             scan += UTF8SKIP(scan);
9046                             hardcount++;
9047                         }
9048                         break;
9049                     case _CC_ENUM_XDIGIT:
9050                         while (hardcount < max
9051                                && scan < loceol
9052                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
9053                         {
9054                             scan += UTF8SKIP(scan);
9055                             hardcount++;
9056                         }
9057                         break;
9058                     case _CC_ENUM_VERTSPACE:
9059                         while (hardcount < max
9060                                && scan < loceol
9061                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
9062                         {
9063                             scan += UTF8SKIP(scan);
9064                             hardcount++;
9065                         }
9066                         break;
9067                     case _CC_ENUM_CNTRL:
9068                         while (hardcount < max
9069                                && scan < loceol
9070                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
9071                         {
9072                             scan += UTF8SKIP(scan);
9073                             hardcount++;
9074                         }
9075                         break;
9076                     default:
9077                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9078                 }
9079             }
9080         }
9081         break;
9082
9083       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9084
9085         /* Load the swash if not already present */
9086         if (! PL_utf8_swash_ptrs[classnum]) {
9087             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9088             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9089                                         "utf8",
9090                                         "",
9091                                         &PL_sv_undef, 1, 0,
9092                                         PL_XPosix_ptrs[classnum], &flags);
9093         }
9094
9095         while (hardcount < max && scan < loceol
9096                && to_complement ^ cBOOL(_generic_utf8(
9097                                        classnum,
9098                                        scan,
9099                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9100                                                    (U8 *) scan,
9101                                                    TRUE))))
9102         {
9103             scan += UTF8SKIP(scan);
9104             hardcount++;
9105         }
9106         break;
9107
9108     case LNBREAK:
9109         if (utf8_target) {
9110             while (hardcount < max && scan < loceol &&
9111                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9112                 scan += c;
9113                 hardcount++;
9114             }
9115         } else {
9116             /* LNBREAK can match one or two latin chars, which is ok, but we
9117              * have to use hardcount in this situation, and throw away the
9118              * adjustment to <loceol> done before the switch statement */
9119             loceol = reginfo->strend;
9120             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9121                 scan+=c;
9122                 hardcount++;
9123             }
9124         }
9125         break;
9126
9127     case BOUNDL:
9128     case NBOUNDL:
9129         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9130         /* FALLTHROUGH */
9131     case BOUND:
9132     case BOUNDA:
9133     case BOUNDU:
9134     case EOS:
9135     case GPOS:
9136     case KEEPS:
9137     case NBOUND:
9138     case NBOUNDA:
9139     case NBOUNDU:
9140     case OPFAIL:
9141     case SBOL:
9142     case SEOL:
9143         /* These are all 0 width, so match right here or not at all. */
9144         break;
9145
9146     default:
9147         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9148         NOT_REACHED; /* NOTREACHED */
9149
9150     }
9151
9152     if (hardcount)
9153         c = hardcount;
9154     else
9155         c = scan - *startposp;
9156     *startposp = scan;
9157
9158     DEBUG_r({
9159         GET_RE_DEBUG_FLAGS_DECL;
9160         DEBUG_EXECUTE_r({
9161             SV * const prop = sv_newmortal();
9162             regprop(prog, prop, p, reginfo, NULL);
9163             Perl_re_exec_indentf( aTHX_  "%s can match %"IVdf" times out of %"IVdf"...\n",
9164                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9165         });
9166     });
9167
9168     return(c);
9169 }
9170
9171
9172 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9173 /*
9174 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9175 create a copy so that changes the caller makes won't change the shared one.
9176 If <altsvp> is non-null, will return NULL in it, for back-compat.
9177  */
9178 SV *
9179 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9180 {
9181     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9182
9183     if (altsvp) {
9184         *altsvp = NULL;
9185     }
9186
9187     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9188 }
9189
9190 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9191
9192 /*
9193  - reginclass - determine if a character falls into a character class
9194
9195   n is the ANYOF-type regnode
9196   p is the target string
9197   p_end points to one byte beyond the end of the target string
9198   utf8_target tells whether p is in UTF-8.
9199
9200   Returns true if matched; false otherwise.
9201
9202   Note that this can be a synthetic start class, a combination of various
9203   nodes, so things you think might be mutually exclusive, such as locale,
9204   aren't.  It can match both locale and non-locale
9205
9206  */
9207
9208 STATIC bool
9209 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9210 {
9211     dVAR;
9212     const char flags = ANYOF_FLAGS(n);
9213     bool match = FALSE;
9214     UV c = *p;
9215
9216     PERL_ARGS_ASSERT_REGINCLASS;
9217
9218     /* If c is not already the code point, get it.  Note that
9219      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9220     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9221         STRLEN c_len = 0;
9222         c = utf8n_to_uvchr(p, p_end - p, &c_len,
9223                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
9224                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
9225                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
9226                  * UTF8_ALLOW_FFFF */
9227         if (c_len == (STRLEN)-1)
9228             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
9229         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9230             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9231         }
9232     }
9233
9234     /* If this character is potentially in the bitmap, check it */
9235     if (c < NUM_ANYOF_CODE_POINTS) {
9236         if (ANYOF_BITMAP_TEST(n, c))
9237             match = TRUE;
9238         else if ((flags
9239                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9240                   && OP(n) == ANYOFD
9241                   && ! utf8_target
9242                   && ! isASCII(c))
9243         {
9244             match = TRUE;
9245         }
9246         else if (flags & ANYOF_LOCALE_FLAGS) {
9247             if ((flags & ANYOFL_FOLD)
9248                 && c < 256
9249                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9250             {
9251                 match = TRUE;
9252             }
9253             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9254                      && c < 256
9255             ) {
9256
9257                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9258                  * if the class includes the Posix character class given by
9259                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9260                  * complemented Posix class given by int(bit/2).  So we loop
9261                  * through the bits, each time changing whether we complement
9262                  * the result or not.  Suppose for the sake of illustration
9263                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9264                  * is set, it means there is a match for this ANYOF node if the
9265                  * character is in the class given by the expression (0 / 2 = 0
9266                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9267                  * and since 'to_complement' is 0, the result will stay TRUE,
9268                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9269                  * bit 1 is 1.  That means there is a match if the character
9270                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9271                  * but will on bit 1.  On the second iteration 'to_complement'
9272                  * will be 1, so the exclusive or will reverse things, so we
9273                  * are testing for \W.  On the third iteration, 'to_complement'
9274                  * will be 0, and we would be testing for \s; the fourth
9275                  * iteration would test for \S, etc.
9276                  *
9277                  * Note that this code assumes that all the classes are closed
9278                  * under folding.  For example, if a character matches \w, then
9279                  * its fold does too; and vice versa.  This should be true for
9280                  * any well-behaved locale for all the currently defined Posix
9281                  * classes, except for :lower: and :upper:, which are handled
9282                  * by the pseudo-class :cased: which matches if either of the
9283                  * other two does.  To get rid of this assumption, an outer
9284                  * loop could be used below to iterate over both the source
9285                  * character, and its fold (if different) */
9286
9287                 int count = 0;
9288                 int to_complement = 0;
9289
9290                 while (count < ANYOF_MAX) {
9291                     if (ANYOF_POSIXL_TEST(n, count)
9292                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9293                     {
9294                         match = TRUE;
9295                         break;
9296                     }
9297                     count++;
9298                     to_complement ^= 1;
9299                 }
9300             }
9301         }
9302     }
9303
9304
9305     /* If the bitmap didn't (or couldn't) match, and something outside the
9306      * bitmap could match, try that. */
9307     if (!match) {
9308         if (c >= NUM_ANYOF_CODE_POINTS
9309             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9310         {
9311             match = TRUE;       /* Everything above the bitmap matches */
9312         }
9313             /* Here doesn't match everything above the bitmap.  If there is
9314              * some information available beyond the bitmap, we may find a
9315              * match in it.  If so, this is most likely because the code point
9316              * is outside the bitmap range.  But rarely, it could be because of
9317              * some other reason.  If so, various flags are set to indicate
9318              * this possibility.  On ANYOFD nodes, there may be matches that
9319              * happen only when the target string is UTF-8; or for other node
9320              * types, because runtime lookup is needed, regardless of the
9321              * UTF-8ness of the target string.  Finally, under /il, there may
9322              * be some matches only possible if the locale is a UTF-8 one. */
9323         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9324                  && (   c >= NUM_ANYOF_CODE_POINTS
9325                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9326                          && (   UNLIKELY(OP(n) != ANYOFD)
9327                              || (utf8_target && ! isASCII_uni(c)
9328 #                               if NUM_ANYOF_CODE_POINTS > 256
9329                                                                  && c < 256
9330 #                               endif
9331                                 )))
9332                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9333                          && IN_UTF8_CTYPE_LOCALE)))
9334         {
9335             SV* only_utf8_locale = NULL;
9336             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9337                                                        &only_utf8_locale, NULL);
9338             if (sw) {
9339                 U8 utf8_buffer[2];
9340                 U8 * utf8_p;
9341                 if (utf8_target) {
9342                     utf8_p = (U8 *) p;
9343                 } else { /* Convert to utf8 */
9344                     utf8_p = utf8_buffer;
9345                     append_utf8_from_native_byte(*p, &utf8_p);
9346                     utf8_p = utf8_buffer;
9347                 }
9348
9349                 if (swash_fetch(sw, utf8_p, TRUE)) {
9350                     match = TRUE;
9351                 }
9352             }
9353             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9354                 match = _invlist_contains_cp(only_utf8_locale, c);
9355             }
9356         }
9357
9358         if (UNICODE_IS_SUPER(c)
9359             && (flags
9360                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9361             && OP(n) != ANYOFD
9362             && ckWARN_d(WARN_NON_UNICODE))
9363         {
9364             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9365                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
9366         }
9367     }
9368
9369 #if ANYOF_INVERT != 1
9370     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9371      * use it, don't */
9372 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9373 #endif
9374
9375     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9376     return (flags & ANYOF_INVERT) ^ match;
9377 }
9378
9379 STATIC U8 *
9380 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9381 {
9382     /* return the position 'off' UTF-8 characters away from 's', forward if
9383      * 'off' >= 0, backwards if negative.  But don't go outside of position
9384      * 'lim', which better be < s  if off < 0 */
9385
9386     PERL_ARGS_ASSERT_REGHOP3;
9387
9388     if (off >= 0) {
9389         while (off-- && s < lim) {
9390             /* XXX could check well-formedness here */
9391             s += UTF8SKIP(s);
9392         }
9393     }
9394     else {
9395         while (off++ && s > lim) {
9396             s--;
9397             if (UTF8_IS_CONTINUED(*s)) {
9398                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9399                     s--;
9400                 if (! UTF8_IS_START(*s)) {
9401                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9402                 }
9403             }
9404             /* XXX could check well-formedness here */
9405         }
9406     }
9407     return s;
9408 }
9409
9410 STATIC U8 *
9411 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9412 {
9413     PERL_ARGS_ASSERT_REGHOP4;
9414
9415     if (off >= 0) {
9416         while (off-- && s < rlim) {
9417             /* XXX could check well-formedness here */
9418             s += UTF8SKIP(s);
9419         }
9420     }
9421     else {
9422         while (off++ && s > llim) {
9423             s--;
9424             if (UTF8_IS_CONTINUED(*s)) {
9425                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9426                     s--;
9427                 if (! UTF8_IS_START(*s)) {
9428                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9429                 }
9430             }
9431             /* XXX could check well-formedness here */
9432         }
9433     }
9434     return s;
9435 }
9436
9437 /* like reghop3, but returns NULL on overrun, rather than returning last
9438  * char pos */
9439
9440 STATIC U8 *
9441 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9442 {
9443     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9444
9445     if (off >= 0) {
9446         while (off-- && s < lim) {
9447             /* XXX could check well-formedness here */
9448             s += UTF8SKIP(s);
9449         }
9450         if (off >= 0)
9451             return NULL;
9452     }
9453     else {
9454         while (off++ && s > lim) {
9455             s--;
9456             if (UTF8_IS_CONTINUED(*s)) {
9457                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9458                     s--;
9459                 if (! UTF8_IS_START(*s)) {
9460                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9461                 }
9462             }
9463             /* XXX could check well-formedness here */
9464         }
9465         if (off <= 0)
9466             return NULL;
9467     }
9468     return s;
9469 }
9470
9471
9472 /* when executing a regex that may have (?{}), extra stuff needs setting
9473    up that will be visible to the called code, even before the current
9474    match has finished. In particular:
9475
9476    * $_ is localised to the SV currently being matched;
9477    * pos($_) is created if necessary, ready to be updated on each call-out
9478      to code;
9479    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9480      isn't set until the current pattern is successfully finished), so that
9481      $1 etc of the match-so-far can be seen;
9482    * save the old values of subbeg etc of the current regex, and  set then
9483      to the current string (again, this is normally only done at the end
9484      of execution)
9485 */
9486
9487 static void
9488 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9489 {
9490     MAGIC *mg;
9491     regexp *const rex = ReANY(reginfo->prog);
9492     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9493
9494     eval_state->rex = rex;
9495
9496     if (reginfo->sv) {
9497         /* Make $_ available to executed code. */
9498         if (reginfo->sv != DEFSV) {
9499             SAVE_DEFSV;
9500             DEFSV_set(reginfo->sv);
9501         }
9502
9503         if (!(mg = mg_find_mglob(reginfo->sv))) {
9504             /* prepare for quick setting of pos */
9505             mg = sv_magicext_mglob(reginfo->sv);
9506             mg->mg_len = -1;
9507         }
9508         eval_state->pos_magic = mg;
9509         eval_state->pos       = mg->mg_len;
9510         eval_state->pos_flags = mg->mg_flags;
9511     }
9512     else
9513         eval_state->pos_magic = NULL;
9514
9515     if (!PL_reg_curpm) {
9516         /* PL_reg_curpm is a fake PMOP that we can attach the current
9517          * regex to and point PL_curpm at, so that $1 et al are visible
9518          * within a /(?{})/. It's just allocated once per interpreter the
9519          * first time its needed */
9520         Newxz(PL_reg_curpm, 1, PMOP);
9521 #ifdef USE_ITHREADS
9522         {
9523             SV* const repointer = &PL_sv_undef;
9524             /* this regexp is also owned by the new PL_reg_curpm, which
9525                will try to free it.  */
9526             av_push(PL_regex_padav, repointer);
9527             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9528             PL_regex_pad = AvARRAY(PL_regex_padav);
9529         }
9530 #endif
9531     }
9532     SET_reg_curpm(reginfo->prog);
9533     eval_state->curpm = PL_curpm;
9534     PL_curpm = PL_reg_curpm;
9535     if (RXp_MATCH_COPIED(rex)) {
9536         /*  Here is a serious problem: we cannot rewrite subbeg,
9537             since it may be needed if this match fails.  Thus
9538             $` inside (?{}) could fail... */
9539         eval_state->subbeg     = rex->subbeg;
9540         eval_state->sublen     = rex->sublen;
9541         eval_state->suboffset  = rex->suboffset;
9542         eval_state->subcoffset = rex->subcoffset;
9543 #ifdef PERL_ANY_COW
9544         eval_state->saved_copy = rex->saved_copy;
9545 #endif
9546         RXp_MATCH_COPIED_off(rex);
9547     }
9548     else
9549         eval_state->subbeg = NULL;
9550     rex->subbeg = (char *)reginfo->strbeg;
9551     rex->suboffset = 0;
9552     rex->subcoffset = 0;
9553     rex->sublen = reginfo->strend - reginfo->strbeg;
9554 }
9555
9556
9557 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9558
9559 static void
9560 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9561 {
9562     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9563     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
9564     regmatch_slab *s;
9565
9566     Safefree(aux->poscache);
9567
9568     if (eval_state) {
9569
9570         /* undo the effects of S_setup_eval_state() */
9571
9572         if (eval_state->subbeg) {
9573             regexp * const rex = eval_state->rex;
9574             rex->subbeg     = eval_state->subbeg;
9575             rex->sublen     = eval_state->sublen;
9576             rex->suboffset  = eval_state->suboffset;
9577             rex->subcoffset = eval_state->subcoffset;
9578 #ifdef PERL_ANY_COW
9579             rex->saved_copy = eval_state->saved_copy;
9580 #endif
9581             RXp_MATCH_COPIED_on(rex);
9582         }
9583         if (eval_state->pos_magic)
9584         {
9585             eval_state->pos_magic->mg_len = eval_state->pos;
9586             eval_state->pos_magic->mg_flags =
9587                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9588                | (eval_state->pos_flags & MGf_BYTES);
9589         }
9590
9591         PL_curpm = eval_state->curpm;
9592     }
9593
9594     PL_regmatch_state = aux->old_regmatch_state;
9595     PL_regmatch_slab  = aux->old_regmatch_slab;
9596
9597     /* free all slabs above current one - this must be the last action
9598      * of this function, as aux and eval_state are allocated within
9599      * slabs and may be freed here */
9600
9601     s = PL_regmatch_slab->next;
9602     if (s) {
9603         PL_regmatch_slab->next = NULL;
9604         while (s) {
9605             regmatch_slab * const osl = s;
9606             s = s->next;
9607             Safefree(osl);
9608         }
9609     }
9610 }
9611
9612
9613 STATIC void
9614 S_to_utf8_substr(pTHX_ regexp *prog)
9615 {
9616     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9617      * on the converted value */
9618
9619     int i = 1;
9620
9621     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9622
9623     do {
9624         if (prog->substrs->data[i].substr
9625             && !prog->substrs->data[i].utf8_substr) {
9626             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9627             prog->substrs->data[i].utf8_substr = sv;
9628             sv_utf8_upgrade(sv);
9629             if (SvVALID(prog->substrs->data[i].substr)) {
9630                 if (SvTAIL(prog->substrs->data[i].substr)) {
9631                     /* Trim the trailing \n that fbm_compile added last
9632                        time.  */
9633                     SvCUR_set(sv, SvCUR(sv) - 1);
9634                     /* Whilst this makes the SV technically "invalid" (as its
9635                        buffer is no longer followed by "\0") when fbm_compile()
9636                        adds the "\n" back, a "\0" is restored.  */
9637                     fbm_compile(sv, FBMcf_TAIL);
9638                 } else
9639                     fbm_compile(sv, 0);
9640             }
9641             if (prog->substrs->data[i].substr == prog->check_substr)
9642                 prog->check_utf8 = sv;
9643         }
9644     } while (i--);
9645 }
9646
9647 STATIC bool
9648 S_to_byte_substr(pTHX_ regexp *prog)
9649 {
9650     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9651      * on the converted value; returns FALSE if can't be converted. */
9652
9653     int i = 1;
9654
9655     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9656
9657     do {
9658         if (prog->substrs->data[i].utf8_substr
9659             && !prog->substrs->data[i].substr) {
9660             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9661             if (! sv_utf8_downgrade(sv, TRUE)) {
9662                 return FALSE;
9663             }
9664             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9665                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9666                     /* Trim the trailing \n that fbm_compile added last
9667                         time.  */
9668                     SvCUR_set(sv, SvCUR(sv) - 1);
9669                     fbm_compile(sv, FBMcf_TAIL);
9670                 } else
9671                     fbm_compile(sv, 0);
9672             }
9673             prog->substrs->data[i].substr = sv;
9674             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9675                 prog->check_substr = sv;
9676         }
9677     } while (i--);
9678
9679     return TRUE;
9680 }
9681
9682 /*
9683  * ex: set ts=8 sts=4 sw=4 et:
9684  */