regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only if 'c', the character being looke-up, is an invariant under
 111  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 112  * everything matchable is straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 114                                 ? reginclass(prog,p,c,c+1,u)                \
 115                                 : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 123
 124 #define HOPc(pos,off) \
 125         (char *)(reginfo->is_utf8_target \
 126             ? reghop3((U8*)pos, off, \
 127                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 128             : (U8*)(pos + off))
 129
 130 #define HOPBACKc(pos, off) \
 131         (char*)(reginfo->is_utf8_target \
 132             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
 133             : (pos - off >= reginfo->strbeg)    \
 134                 ? (U8*)pos - off                \
 135                 : NULL)
 136
 137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 139
 140 /* lim must be +ve. Returns NULL on overshoot */
 141 #define HOPMAYBE3(pos,off,lim) \
 142         (reginfo->is_utf8_target                        \
 143             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 144             : ((U8*)pos + off <= lim)                   \
 145                 ? (U8*)pos + off                        \
 146                 : NULL)
 147
 148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 149  * off must be >=0; args should be vars rather than expressions */
 150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 151     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 152     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 153
 154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 155     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 156     : (U8*)(pos + off))
 157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 158
 159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 160 #define NEXTCHR_IS_EOS (nextchr < 0)
 161
 162 #define SET_nextchr \
 163     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 164
 165 #define SET_locinput(p) \
 166     locinput = (p);  \
 167     SET_nextchr
 168
 169
 170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 171         if (!swash_ptr) {                                                     \
 172             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 173             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 174                                          1, 0, invlist, &flags);              \
 175             assert(swash_ptr);                                                \
 176         }                                                                     \
 177     } STMT_END
 178
 179 /* If in debug mode, we test that a known character properly matches */
 180 #ifdef DEBUGGING
 181 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 182                                           property_name,                      \
 183                                           invlist,                            \
 184                                           utf8_char_in_property)              \
 185         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 186         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 187 #else
 188 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 189                                           property_name,                      \
 190                                           invlist,                            \
 191                                           utf8_char_in_property)              \
 192         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 193 #endif
 194
 195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 196                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 197                                         "",                                   \
 198                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 199                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 200
 201 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 203
 204 /* for use after a quantifier and before an EXACT-like node -- japhy */
 205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 206  *
 207  * NOTE that *nothing* that affects backtracking should be in here, specifically
 208  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 209  * node that is in between two EXACT like nodes when ascertaining what the required
 210  * "follow" character is. This should probably be moved to regex compile time
 211  * although it may be done at run time beause of the REF possibility - more
 212  * investigation required. -- demerphq
 213 */
 214 #define JUMPABLE(rn) (                                                             \
 215     OP(rn) == OPEN ||                                                              \
 216     (OP(rn) == CLOSE &&                                                            \
 217      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 218     OP(rn) == EVAL ||                                                              \
 219     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 220     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 221     OP(rn) == KEEPS ||                                                             \
 222     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 223 )
 224 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 225
 226 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 227
 228 #if 0
 229 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 230    we don't need this definition.  XXX These are now out-of-sync*/
 231 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 232 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 233 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 234
 235 #else
 236 /* ... so we use this as its faster. */
 237 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 238 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 239 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 240 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 241
 242 #endif
 243
 244 /*
 245   Search for mandatory following text node; for lookahead, the text must
 246   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 247 */
 248 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 249     while (JUMPABLE(rn)) { \
 250         const OPCODE type = OP(rn); \
 251         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 252             rn = NEXTOPER(NEXTOPER(rn)); \
 253         else if (type == PLUS) \
 254             rn = NEXTOPER(rn); \
 255         else if (type == IFMATCH) \
 256             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 257         else rn += NEXT_OFF(rn); \
 258     } \
 259 } STMT_END
 260
 261 #define SLAB_FIRST(s) (&(s)->states[0])
 262 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 263
 264 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 265 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 266 static regmatch_state * S_push_slab(pTHX);
 267
 268 #define REGCP_PAREN_ELEMS 3
 269 #define REGCP_OTHER_ELEMS 3
 270 #define REGCP_FRAME_ELEMS 1
 271 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 272  * are needed for the regexp context stack bookkeeping. */
 273
 274 STATIC CHECKPOINT
 275 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 276 {
 277     const int retval = PL_savestack_ix;
 278     const int paren_elems_to_push =
 279                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 280     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 281     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 282     I32 p;
 283     GET_RE_DEBUG_FLAGS_DECL;
 284
 285     PERL_ARGS_ASSERT_REGCPPUSH;
 286
 287     if (paren_elems_to_push < 0)
 288         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 289                    (int)paren_elems_to_push, (int)maxopenparen,
 290                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 291
 292     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 293         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 294                    " out of range (%lu-%ld)",
 295                    total_elems,
 296                    (unsigned long)maxopenparen,
 297                    (long)parenfloor);
 298
 299     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 300
 301     DEBUG_BUFFERS_r(
 302         if ((int)maxopenparen > (int)parenfloor)
 303             Perl_re_printf( aTHX_
 304                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 305                 PTR2UV(rex),
 306                 PTR2UV(rex->offs)
 307             );
 308     );
 309     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 310 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 311         SSPUSHIV(rex->offs[p].end);
 312         SSPUSHIV(rex->offs[p].start);
 313         SSPUSHINT(rex->offs[p].start_tmp);
 314         DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
 315             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 316             (UV)p,
 317             (IV)rex->offs[p].start,
 318             (IV)rex->offs[p].start_tmp,
 319             (IV)rex->offs[p].end
 320         ));
 321     }
 322 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 323     SSPUSHINT(maxopenparen);
 324     SSPUSHINT(rex->lastparen);
 325     SSPUSHINT(rex->lastcloseparen);
 326     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 327
 328     return retval;
 329 }
 330
 331 /* These are needed since we do not localize EVAL nodes: */
 332 #define REGCP_SET(cp)                                           \
 333     DEBUG_STATE_r(                                              \
 334         Perl_re_exec_indentf( aTHX_                                         \
 335             "Setting an EVAL scope, savestack=%"IVdf",\n",      \
 336             depth, (IV)PL_savestack_ix                          \
 337         )                                                       \
 338     );                                                          \
 339     cp = PL_savestack_ix
 340
 341 #define REGCP_UNWIND(cp)                                        \
 342     DEBUG_STATE_r(                                              \
 343         if (cp != PL_savestack_ix)                              \
 344             Perl_re_exec_indentf( aTHX_                                     \
 345                 "Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n",\
 346                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 347             )                                                   \
 348     );                                                          \
 349     regcpblow(cp)
 350
 351 #define UNWIND_PAREN(lp, lcp)               \
 352     for (n = rex->lastparen; n > lp; n--)   \
 353         rex->offs[n].end = -1;              \
 354     rex->lastparen = n;                     \
 355     rex->lastcloseparen = lcp;
 356
 357
 358 STATIC void
 359 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 360 {
 361     UV i;
 362     U32 paren;
 363     GET_RE_DEBUG_FLAGS_DECL;
 364
 365     PERL_ARGS_ASSERT_REGCPPOP;
 366
 367     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 368     i = SSPOPUV;
 369     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 370     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 371     rex->lastcloseparen = SSPOPINT;
 372     rex->lastparen = SSPOPINT;
 373     *maxopenparen_p = SSPOPINT;
 374
 375     i -= REGCP_OTHER_ELEMS;
 376     /* Now restore the parentheses context. */
 377     DEBUG_BUFFERS_r(
 378         if (i || rex->lastparen + 1 <= rex->nparens)
 379             Perl_re_printf( aTHX_
 380                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 381                 PTR2UV(rex),
 382                 PTR2UV(rex->offs)
 383             );
 384     );
 385     paren = *maxopenparen_p;
 386     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 387         SSize_t tmps;
 388         rex->offs[paren].start_tmp = SSPOPINT;
 389         rex->offs[paren].start = SSPOPIV;
 390         tmps = SSPOPIV;
 391         if (paren <= rex->lastparen)
 392             rex->offs[paren].end = tmps;
 393         DEBUG_BUFFERS_r( Perl_re_printf( aTHX_
 394             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 395             (UV)paren,
 396             (IV)rex->offs[paren].start,
 397             (IV)rex->offs[paren].start_tmp,
 398             (IV)rex->offs[paren].end,
 399             (paren > rex->lastparen ? "(skipped)" : ""));
 400         );
 401         paren--;
 402     }
 403 #if 1
 404     /* It would seem that the similar code in regtry()
 405      * already takes care of this, and in fact it is in
 406      * a better location to since this code can #if 0-ed out
 407      * but the code in regtry() is needed or otherwise tests
 408      * requiring null fields (pat.t#187 and split.t#{13,14}
 409      * (as of patchlevel 7877)  will fail.  Then again,
 410      * this code seems to be necessary or otherwise
 411      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 412      * --jhi updated by dapm */
 413     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 414         if (i > *maxopenparen_p)
 415             rex->offs[i].start = -1;
 416         rex->offs[i].end = -1;
 417         DEBUG_BUFFERS_r( Perl_re_printf( aTHX_
 418             "    \\%"UVuf": %s   ..-1 undeffing\n",
 419             (UV)i,
 420             (i > *maxopenparen_p) ? "-1" : "  "
 421         ));
 422     }
 423 #endif
 424 }
 425
 426 /* restore the parens and associated vars at savestack position ix,
 427  * but without popping the stack */
 428
 429 STATIC void
 430 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 431 {
 432     I32 tmpix = PL_savestack_ix;
 433     PL_savestack_ix = ix;
 434     regcppop(rex, maxopenparen_p);
 435     PL_savestack_ix = tmpix;
 436 }
 437
 438 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 439
 440 STATIC bool
 441 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 442 {
 443     /* Returns a boolean as to whether or not 'character' is a member of the
 444      * Posix character class given by 'classnum' that should be equivalent to a
 445      * value in the typedef '_char_class_number'.
 446      *
 447      * Ideally this could be replaced by a just an array of function pointers
 448      * to the C library functions that implement the macros this calls.
 449      * However, to compile, the precise function signatures are required, and
 450      * these may vary from platform to to platform.  To avoid having to figure
 451      * out what those all are on each platform, I (khw) am using this method,
 452      * which adds an extra layer of function call overhead (unless the C
 453      * optimizer strips it away).  But we don't particularly care about
 454      * performance with locales anyway. */
 455
 456     switch ((_char_class_number) classnum) {
 457         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 458         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 459         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 460         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 461         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 462                                         || isUPPER_LC(character);
 463         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 464         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 465         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 466         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 467         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 468         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 469         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 470         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 471         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 472         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 473         default:    /* VERTSPACE should never occur in locales */
 474             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 475     }
 476
 477     NOT_REACHED; /* NOTREACHED */
 478     return FALSE;
 479 }
 480
 481 STATIC bool
 482 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 483 {
 484     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 485      * 'character' is a member of the Posix character class given by 'classnum'
 486      * that should be equivalent to a value in the typedef
 487      * '_char_class_number'.
 488      *
 489      * This just calls isFOO_lc on the code point for the character if it is in
 490      * the range 0-255.  Outside that range, all characters use Unicode
 491      * rules, ignoring any locale.  So use the Unicode function if this class
 492      * requires a swash, and use the Unicode macro otherwise. */
 493
 494     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 495
 496     if (UTF8_IS_INVARIANT(*character)) {
 497         return isFOO_lc(classnum, *character);
 498     }
 499     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 500         return isFOO_lc(classnum,
 501                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 502     }
 503
 504     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 505
 506     if (classnum < _FIRST_NON_SWASH_CC) {
 507
 508         /* Initialize the swash unless done already */
 509         if (! PL_utf8_swash_ptrs[classnum]) {
 510             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 511             PL_utf8_swash_ptrs[classnum] =
 512                     _core_swash_init("utf8",
 513                                      "",
 514                                      &PL_sv_undef, 1, 0,
 515                                      PL_XPosix_ptrs[classnum], &flags);
 516         }
 517
 518         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 519                                  character,
 520                                  TRUE /* is UTF */ ));
 521     }
 522
 523     switch ((_char_class_number) classnum) {
 524         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 525         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 526         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 527         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 528         default:                 break;
 529     }
 530
 531     return FALSE; /* Things like CNTRL are always below 256 */
 532 }
 533
 534 /*
 535  * pregexec and friends
 536  */
 537
 538 #ifndef PERL_IN_XSUB_RE
 539 /*
 540  - pregexec - match a regexp against a string
 541  */
 542 I32
 543 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 544          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 545 /* stringarg: the point in the string at which to begin matching */
 546 /* strend:    pointer to null at end of string */
 547 /* strbeg:    real beginning of string */
 548 /* minend:    end of match must be >= minend bytes after stringarg. */
 549 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 550  *            itself is accessed via the pointers above */
 551 /* nosave:    For optimizations. */
 552 {
 553     PERL_ARGS_ASSERT_PREGEXEC;
 554
 555     return
 556         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 557                       nosave ? 0 : REXEC_COPY_STR);
 558 }
 559 #endif
 560
 561
 562
 563 /* re_intuit_start():
 564  *
 565  * Based on some optimiser hints, try to find the earliest position in the
 566  * string where the regex could match.
 567  *
 568  *   rx:     the regex to match against
 569  *   sv:     the SV being matched: only used for utf8 flag; the string
 570  *           itself is accessed via the pointers below. Note that on
 571  *           something like an overloaded SV, SvPOK(sv) may be false
 572  *           and the string pointers may point to something unrelated to
 573  *           the SV itself.
 574  *   strbeg: real beginning of string
 575  *   strpos: the point in the string at which to begin matching
 576  *   strend: pointer to the byte following the last char of the string
 577  *   flags   currently unused; set to 0
 578  *   data:   currently unused; set to NULL
 579  *
 580  * The basic idea of re_intuit_start() is to use some known information
 581  * about the pattern, namely:
 582  *
 583  *   a) the longest known anchored substring (i.e. one that's at a
 584  *      constant offset from the beginning of the pattern; but not
 585  *      necessarily at a fixed offset from the beginning of the
 586  *      string);
 587  *   b) the longest floating substring (i.e. one that's not at a constant
 588  *      offset from the beginning of the pattern);
 589  *   c) Whether the pattern is anchored to the string; either
 590  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 591  *      or anchored to pos(): /\G/;
 592  *   d) A start class: a real or synthetic character class which
 593  *      represents which characters are legal at the start of the pattern;
 594  *
 595  * to either quickly reject the match, or to find the earliest position
 596  * within the string at which the pattern might match, thus avoiding
 597  * running the full NFA engine at those earlier locations, only to
 598  * eventually fail and retry further along.
 599  *
 600  * Returns NULL if the pattern can't match, or returns the address within
 601  * the string which is the earliest place the match could occur.
 602  *
 603  * The longest of the anchored and floating substrings is called 'check'
 604  * and is checked first. The other is called 'other' and is checked
 605  * second. The 'other' substring may not be present.  For example,
 606  *
 607  *    /(abc|xyz)ABC\d{0,3}DEFG/
 608  *
 609  * will have
 610  *
 611  *   check substr (float)    = "DEFG", offset 6..9 chars
 612  *   other substr (anchored) = "ABC",  offset 3..3 chars
 613  *   stclass = [ax]
 614  *
 615  * Be aware that during the course of this function, sometimes 'anchored'
 616  * refers to a substring being anchored relative to the start of the
 617  * pattern, and sometimes to the pattern itself being anchored relative to
 618  * the string. For example:
 619  *
 620  *   /\dabc/:   "abc" is anchored to the pattern;
 621  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 622  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 623  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 624  *                    but the pattern is anchored to the string.
 625  */
 626
 627 char *
 628 Perl_re_intuit_start(pTHX_
 629                     REGEXP * const rx,
 630                     SV *sv,
 631                     const char * const strbeg,
 632                     char *strpos,
 633                     char *strend,
 634                     const U32 flags,
 635                     re_scream_pos_data *data)
 636 {
 637     struct regexp *const prog = ReANY(rx);
 638     SSize_t start_shift = prog->check_offset_min;
 639     /* Should be nonnegative! */
 640     SSize_t end_shift   = 0;
 641     /* current lowest pos in string where the regex can start matching */
 642     char *rx_origin = strpos;
 643     SV *check;
 644     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 645     U8   other_ix = 1 - prog->substrs->check_ix;
 646     bool ml_anch = 0;
 647     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 648     char *check_at = NULL;              /* check substr found at this pos */
 649     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 650     RXi_GET_DECL(prog,progi);
 651     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 652     regmatch_info *const reginfo = &reginfo_buf;
 653     GET_RE_DEBUG_FLAGS_DECL;
 654
 655     PERL_ARGS_ASSERT_RE_INTUIT_START;
 656     PERL_UNUSED_ARG(flags);
 657     PERL_UNUSED_ARG(data);
 658
 659     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 660                 "Intuit: trying to determine minimum start position...\n"));
 661
 662     /* for now, assume that all substr offsets are positive. If at some point
 663      * in the future someone wants to do clever things with lookbehind and
 664      * -ve offsets, they'll need to fix up any code in this function
 665      * which uses these offsets. See the thread beginning
 666      * <20140113145929.GF27210@iabyn.com>
 667      */
 668     assert(prog->substrs->data[0].min_offset >= 0);
 669     assert(prog->substrs->data[0].max_offset >= 0);
 670     assert(prog->substrs->data[1].min_offset >= 0);
 671     assert(prog->substrs->data[1].max_offset >= 0);
 672     assert(prog->substrs->data[2].min_offset >= 0);
 673     assert(prog->substrs->data[2].max_offset >= 0);
 674
 675     /* for now, assume that if both present, that the floating substring
 676      * doesn't start before the anchored substring.
 677      * If you break this assumption (e.g. doing better optimisations
 678      * with lookahead/behind), then you'll need to audit the code in this
 679      * function carefully first
 680      */
 681     assert(
 682             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 683               && (prog->float_utf8    || prog->float_substr))
 684            || (prog->float_min_offset >= prog->anchored_offset));
 685
 686     /* byte rather than char calculation for efficiency. It fails
 687      * to quickly reject some cases that can't match, but will reject
 688      * them later after doing full char arithmetic */
 689     if (prog->minlen > strend - strpos) {
 690         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 691                               "  String too short...\n"));
 692         goto fail;
 693     }
 694
 695     RX_MATCH_UTF8_set(rx,utf8_target);
 696     reginfo->is_utf8_target = cBOOL(utf8_target);
 697     reginfo->info_aux = NULL;
 698     reginfo->strbeg = strbeg;
 699     reginfo->strend = strend;
 700     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 701     reginfo->intuit = 1;
 702     /* not actually used within intuit, but zero for safety anyway */
 703     reginfo->poscache_maxiter = 0;
 704
 705     if (utf8_target) {
 706         if ((!prog->anchored_utf8 && prog->anchored_substr)
 707                 || (!prog->float_utf8 && prog->float_substr))
 708             to_utf8_substr(prog);
 709         check = prog->check_utf8;
 710     } else {
 711         if (!prog->check_substr && prog->check_utf8) {
 712             if (! to_byte_substr(prog)) {
 713                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 714             }
 715         }
 716         check = prog->check_substr;
 717     }
 718
 719     /* dump the various substring data */
 720     DEBUG_OPTIMISE_MORE_r({
 721         int i;
 722         for (i=0; i<=2; i++) {
 723             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 724                                   : prog->substrs->data[i].substr);
 725             if (!sv)
 726                 continue;
 727
 728             Perl_re_printf( aTHX_
 729                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 730                 " useful=%"IVdf" utf8=%d [%s]\n",
 731                 i,
 732                 (IV)prog->substrs->data[i].min_offset,
 733                 (IV)prog->substrs->data[i].max_offset,
 734                 (IV)prog->substrs->data[i].end_shift,
 735                 BmUSEFUL(sv),
 736                 utf8_target ? 1 : 0,
 737                 SvPEEK(sv));
 738         }
 739     });
 740
 741     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 742
 743         /* ml_anch: check after \n?
 744          *
 745          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 746          * with /.*.../, these flags will have been added by the
 747          * compiler:
 748          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 749          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 750          */
 751         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 752                    && !(prog->intflags & PREGf_IMPLICIT);
 753
 754         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 755             /* we are only allowed to match at BOS or \G */
 756
 757             /* trivially reject if there's a BOS anchor and we're not at BOS.
 758              *
 759              * Note that we don't try to do a similar quick reject for
 760              * \G, since generally the caller will have calculated strpos
 761              * based on pos() and gofs, so the string is already correctly
 762              * anchored by definition; and handling the exceptions would
 763              * be too fiddly (e.g. REXEC_IGNOREPOS).
 764              */
 765             if (   strpos != strbeg
 766                 && (prog->intflags & PREGf_ANCH_SBOL))
 767             {
 768                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 769                                 "  Not at start...\n"));
 770                 goto fail;
 771             }
 772
 773             /* in the presence of an anchor, the anchored (relative to the
 774              * start of the regex) substr must also be anchored relative
 775              * to strpos. So quickly reject if substr isn't found there.
 776              * This works for \G too, because the caller will already have
 777              * subtracted gofs from pos, and gofs is the offset from the
 778              * \G to the start of the regex. For example, in /.abc\Gdef/,
 779              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 780              * caller will have set strpos=pos()-4; we look for the substr
 781              * at position pos()-4+1, which lines up with the "a" */
 782
 783             if (prog->check_offset_min == prog->check_offset_max) {
 784                 /* Substring at constant offset from beg-of-str... */
 785                 SSize_t slen = SvCUR(check);
 786                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 787
 788                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 789                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 790                     (IV)prog->check_offset_min));
 791
 792                 if (SvTAIL(check)) {
 793                     /* In this case, the regex is anchored at the end too.
 794                      * Unless it's a multiline match, the lengths must match
 795                      * exactly, give or take a \n.  NB: slen >= 1 since
 796                      * the last char of check is \n */
 797                     if (!multiline
 798                         && (   strend - s > slen
 799                             || strend - s < slen - 1
 800                             || (strend - s == slen && strend[-1] != '\n')))
 801                     {
 802                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 803                                             "  String too long...\n"));
 804                         goto fail_finish;
 805                     }
 806                     /* Now should match s[0..slen-2] */
 807                     slen--;
 808                 }
 809                 if (slen && (*SvPVX_const(check) != *s
 810                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 811                 {
 812                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 813                                     "  String not equal...\n"));
 814                     goto fail_finish;
 815                 }
 816
 817                 check_at = s;
 818                 goto success_at_start;
 819             }
 820         }
 821     }
 822
 823     end_shift = prog->check_end_shift;
 824
 825 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 826     if (end_shift < 0)
 827         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 828                    (IV)end_shift, RX_PRECOMP(prog));
 829 #endif
 830
 831   restart:
 832
 833     /* This is the (re)entry point of the main loop in this function.
 834      * The goal of this loop is to:
 835      * 1) find the "check" substring in the region rx_origin..strend
 836      *    (adjusted by start_shift / end_shift). If not found, reject
 837      *    immediately.
 838      * 2) If it exists, look for the "other" substr too if defined; for
 839      *    example, if the check substr maps to the anchored substr, then
 840      *    check the floating substr, and vice-versa. If not found, go
 841      *    back to (1) with rx_origin suitably incremented.
 842      * 3) If we find an rx_origin position that doesn't contradict
 843      *    either of the substrings, then check the possible additional
 844      *    constraints on rx_origin of /^.../m or a known start class.
 845      *    If these fail, then depending on which constraints fail, jump
 846      *    back to here, or to various other re-entry points further along
 847      *    that skip some of the first steps.
 848      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 849      *    substring. If the start position was determined to be at the
 850      *    beginning of the string  - so, not rejected, but not optimised,
 851      *    since we have to run regmatch from position 0 - decrement the
 852      *    BmUSEFUL() count. Otherwise increment it.
 853      */
 854
 855
 856     /* first, look for the 'check' substring */
 857
 858     {
 859         U8* start_point;
 860         U8* end_point;
 861
 862         DEBUG_OPTIMISE_MORE_r({
 863             Perl_re_printf( aTHX_
 864                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 865                 " Start shift: %"IVdf" End shift %"IVdf
 866                 " Real end Shift: %"IVdf"\n",
 867                 (IV)(rx_origin - strbeg),
 868                 (IV)prog->check_offset_min,
 869                 (IV)start_shift,
 870                 (IV)end_shift,
 871                 (IV)prog->check_end_shift);
 872         });
 873
 874         end_point = HOP3(strend, -end_shift, strbeg);
 875         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 876         if (!start_point)
 877             goto fail_finish;
 878
 879
 880         /* If the regex is absolutely anchored to either the start of the
 881          * string (SBOL) or to pos() (ANCH_GPOS), then
 882          * check_offset_max represents an upper bound on the string where
 883          * the substr could start. For the ANCH_GPOS case, we assume that
 884          * the caller of intuit will have already set strpos to
 885          * pos()-gofs, so in this case strpos + offset_max will still be
 886          * an upper bound on the substr.
 887          */
 888         if (!ml_anch
 889             && prog->intflags & PREGf_ANCH
 890             && prog->check_offset_max != SSize_t_MAX)
 891         {
 892             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 893             const char * const anchor =
 894                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 895
 896             /* do a bytes rather than chars comparison. It's conservative;
 897              * so it skips doing the HOP if the result can't possibly end
 898              * up earlier than the old value of end_point.
 899              */
 900             if ((char*)end_point - anchor > prog->check_offset_max) {
 901                 end_point = HOP3lim((U8*)anchor,
 902                                 prog->check_offset_max,
 903                                 end_point -len)
 904                             + len;
 905             }
 906         }
 907
 908         check_at = fbm_instr( start_point, end_point,
 909                       check, multiline ? FBMrf_MULTILINE : 0);
 910
 911         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 912             "  doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
 913             (IV)((char*)start_point - strbeg),
 914             (IV)((char*)end_point   - strbeg),
 915             (IV)(check_at ? check_at - strbeg : -1)
 916         ));
 917
 918         /* Update the count-of-usability, remove useless subpatterns,
 919             unshift s.  */
 920
 921         DEBUG_EXECUTE_r({
 922             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 923                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 924             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
 925                               (check_at ? "Found" : "Did not find"),
 926                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 927                     ? "anchored" : "floating"),
 928                 quoted,
 929                 RE_SV_TAIL(check),
 930                 (check_at ? " at offset " : "...\n") );
 931         });
 932
 933         if (!check_at)
 934             goto fail_finish;
 935         /* set rx_origin to the minimum position where the regex could start
 936          * matching, given the constraint of the just-matched check substring.
 937          * But don't set it lower than previously.
 938          */
 939
 940         if (check_at - rx_origin > prog->check_offset_max)
 941             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 942         /* Finish the diagnostic message */
 943         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 944             "%ld (rx_origin now %"IVdf")...\n",
 945             (long)(check_at - strbeg),
 946             (IV)(rx_origin - strbeg)
 947         ));
 948     }
 949
 950
 951     /* now look for the 'other' substring if defined */
 952
 953     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 954                     : prog->substrs->data[other_ix].substr)
 955     {
 956         /* Take into account the "other" substring. */
 957         char *last, *last1;
 958         char *s;
 959         SV* must;
 960         struct reg_substr_datum *other;
 961
 962       do_other_substr:
 963         other = &prog->substrs->data[other_ix];
 964
 965         /* if "other" is anchored:
 966          * we've previously found a floating substr starting at check_at.
 967          * This means that the regex origin must lie somewhere
 968          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 969          * and max:                 HOP3(check_at, -check_offset_min)
 970          * (except that min will be >= strpos)
 971          * So the fixed  substr must lie somewhere between
 972          *  HOP3(min, anchored_offset)
 973          *  HOP3(max, anchored_offset) + SvCUR(substr)
 974          */
 975
 976         /* if "other" is floating
 977          * Calculate last1, the absolute latest point where the
 978          * floating substr could start in the string, ignoring any
 979          * constraints from the earlier fixed match. It is calculated
 980          * as follows:
 981          *
 982          * strend - prog->minlen (in chars) is the absolute latest
 983          * position within the string where the origin of the regex
 984          * could appear. The latest start point for the floating
 985          * substr is float_min_offset(*) on from the start of the
 986          * regex.  last1 simply combines thee two offsets.
 987          *
 988          * (*) You might think the latest start point should be
 989          * float_max_offset from the regex origin, and technically
 990          * you'd be correct. However, consider
 991          *    /a\d{2,4}bcd\w/
 992          * Here, float min, max are 3,5 and minlen is 7.
 993          * This can match either
 994          *    /a\d\dbcd\w/
 995          *    /a\d\d\dbcd\w/
 996          *    /a\d\d\d\dbcd\w/
 997          * In the first case, the regex matches minlen chars; in the
 998          * second, minlen+1, in the third, minlen+2.
 999          * In the first case, the floating offset is 3 (which equals
1000          * float_min), in the second, 4, and in the third, 5 (which
1001          * equals float_max). In all cases, the floating string bcd
1002          * can never start more than 4 chars from the end of the
1003          * string, which equals minlen - float_min. As the substring
1004          * starts to match more than float_min from the start of the
1005          * regex, it makes the regex match more than minlen chars,
1006          * and the two cancel each other out. So we can always use
1007          * float_min - minlen, rather than float_max - minlen for the
1008          * latest position in the string.
1009          *
1010          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1011          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1012          */
1013
1014         assert(prog->minlen >= other->min_offset);
1015         last1 = HOP3c(strend,
1016                         other->min_offset - prog->minlen, strbeg);
1017
1018         if (other_ix) {/* i.e. if (other-is-float) */
1019             /* last is the latest point where the floating substr could
1020              * start, *given* any constraints from the earlier fixed
1021              * match. This constraint is that the floating string starts
1022              * <= float_max_offset chars from the regex origin (rx_origin).
1023              * If this value is less than last1, use it instead.
1024              */
1025             assert(rx_origin <= last1);
1026             last =
1027                 /* this condition handles the offset==infinity case, and
1028                  * is a short-cut otherwise. Although it's comparing a
1029                  * byte offset to a char length, it does so in a safe way,
1030                  * since 1 char always occupies 1 or more bytes,
1031                  * so if a string range is  (last1 - rx_origin) bytes,
1032                  * it will be less than or equal to  (last1 - rx_origin)
1033                  * chars; meaning it errs towards doing the accurate HOP3
1034                  * rather than just using last1 as a short-cut */
1035                 (last1 - rx_origin) < other->max_offset
1036                     ? last1
1037                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1038         }
1039         else {
1040             assert(strpos + start_shift <= check_at);
1041             last = HOP4c(check_at, other->min_offset - start_shift,
1042                         strbeg, strend);
1043         }
1044
1045         s = HOP3c(rx_origin, other->min_offset, strend);
1046         if (s < other_last)     /* These positions already checked */
1047             s = other_last;
1048
1049         must = utf8_target ? other->utf8_substr : other->substr;
1050         assert(SvPOK(must));
1051         {
1052             char *from = s;
1053             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1054
1055             if (to > strend)
1056                 to = strend;
1057             if (from > to) {
1058                 s = NULL;
1059                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1060                     "  skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1061                     (IV)(from - strbeg),
1062                     (IV)(to   - strbeg)
1063                 ));
1064             }
1065             else {
1066                 s = fbm_instr(
1067                     (unsigned char*)from,
1068                     (unsigned char*)to,
1069                     must,
1070                     multiline ? FBMrf_MULTILINE : 0
1071                 );
1072                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1073                     "  doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1074                     (IV)(from - strbeg),
1075                     (IV)(to   - strbeg),
1076                     (IV)(s ? s - strbeg : -1)
1077                 ));
1078             }
1079         }
1080
1081         DEBUG_EXECUTE_r({
1082             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1083                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1084             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1085                 s ? "Found" : "Contradicts",
1086                 other_ix ? "floating" : "anchored",
1087                 quoted, RE_SV_TAIL(must));
1088         });
1089
1090
1091         if (!s) {
1092             /* last1 is latest possible substr location. If we didn't
1093              * find it before there, we never will */
1094             if (last >= last1) {
1095                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096                                         "; giving up...\n"));
1097                 goto fail_finish;
1098             }
1099
1100             /* try to find the check substr again at a later
1101              * position. Maybe next time we'll find the "other" substr
1102              * in range too */
1103             other_last = HOP3c(last, 1, strend) /* highest failure */;
1104             rx_origin =
1105                 other_ix /* i.e. if other-is-float */
1106                     ? HOP3c(rx_origin, 1, strend)
1107                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1108             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1109                 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1110                 (other_ix ? "floating" : "anchored"),
1111                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1112                 (IV)(rx_origin - strbeg)
1113             ));
1114             goto restart;
1115         }
1116         else {
1117             if (other_ix) { /* if (other-is-float) */
1118                 /* other_last is set to s, not s+1, since its possible for
1119                  * a floating substr to fail first time, then succeed
1120                  * second time at the same floating position; e.g.:
1121                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1122                  * The first time round, anchored and float match at
1123                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1124                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1125                  */
1126                 other_last = s;
1127             }
1128             else {
1129                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1130                 other_last = HOP3c(s, 1, strend);
1131             }
1132             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1133                 " at offset %ld (rx_origin now %"IVdf")...\n",
1134                   (long)(s - strbeg),
1135                 (IV)(rx_origin - strbeg)
1136               ));
1137
1138         }
1139     }
1140     else {
1141         DEBUG_OPTIMISE_MORE_r(
1142             Perl_re_printf( aTHX_
1143                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1144                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1145                 " strend:%"IVdf"\n",
1146                 (IV)prog->check_offset_min,
1147                 (IV)prog->check_offset_max,
1148                 (IV)(check_at-strbeg),
1149                 (IV)(rx_origin-strbeg),
1150                 (IV)(rx_origin-check_at),
1151                 (IV)(strend-strbeg)
1152             )
1153         );
1154     }
1155
1156   postprocess_substr_matches:
1157
1158     /* handle the extra constraint of /^.../m if present */
1159
1160     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1161         char *s;
1162
1163         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1164                         "  looking for /^/m anchor"));
1165
1166         /* we have failed the constraint of a \n before rx_origin.
1167          * Find the next \n, if any, even if it's beyond the current
1168          * anchored and/or floating substrings. Whether we should be
1169          * scanning ahead for the next \n or the next substr is debatable.
1170          * On the one hand you'd expect rare substrings to appear less
1171          * often than \n's. On the other hand, searching for \n means
1172          * we're effectively flipping between check_substr and "\n" on each
1173          * iteration as the current "rarest" string candidate, which
1174          * means for example that we'll quickly reject the whole string if
1175          * hasn't got a \n, rather than trying every substr position
1176          * first
1177          */
1178
1179         s = HOP3c(strend, - prog->minlen, strpos);
1180         if (s <= rx_origin ||
1181             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1182         {
1183             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1184                             "  Did not find /%s^%s/m...\n",
1185                             PL_colors[0], PL_colors[1]));
1186             goto fail_finish;
1187         }
1188
1189         /* earliest possible origin is 1 char after the \n.
1190          * (since *rx_origin == '\n', it's safe to ++ here rather than
1191          * HOP(rx_origin, 1)) */
1192         rx_origin++;
1193
1194         if (prog->substrs->check_ix == 0  /* check is anchored */
1195             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1196         {
1197             /* Position contradicts check-string; either because
1198              * check was anchored (and thus has no wiggle room),
1199              * or check was float and rx_origin is above the float range */
1200             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1201                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1202                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1203             goto restart;
1204         }
1205
1206         /* if we get here, the check substr must have been float,
1207          * is in range, and we may or may not have had an anchored
1208          * "other" substr which still contradicts */
1209         assert(prog->substrs->check_ix); /* check is float */
1210
1211         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1212             /* whoops, the anchored "other" substr exists, so we still
1213              * contradict. On the other hand, the float "check" substr
1214              * didn't contradict, so just retry the anchored "other"
1215              * substr */
1216             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1217                 "  Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
1218                 PL_colors[0], PL_colors[1],
1219                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1220                 (IV)(rx_origin - strbeg)
1221             ));
1222             goto do_other_substr;
1223         }
1224
1225         /* success: we don't contradict the found floating substring
1226          * (and there's no anchored substr). */
1227         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1228             "  Found /%s^%s/m with rx_origin %ld...\n",
1229             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1230     }
1231     else {
1232         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233             "  (multiline anchor test skipped)\n"));
1234     }
1235
1236   success_at_start:
1237
1238
1239     /* if we have a starting character class, then test that extra constraint.
1240      * (trie stclasses are too expensive to use here, we are better off to
1241      * leave it to regmatch itself) */
1242
1243     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1244         const U8* const str = (U8*)STRING(progi->regstclass);
1245
1246         /* XXX this value could be pre-computed */
1247         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1248                     ?  (reginfo->is_utf8_pat
1249                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1250                         : STR_LEN(progi->regstclass))
1251                     : 1);
1252         char * endpos;
1253         char *s;
1254         /* latest pos that a matching float substr constrains rx start to */
1255         char *rx_max_float = NULL;
1256
1257         /* if the current rx_origin is anchored, either by satisfying an
1258          * anchored substring constraint, or a /^.../m constraint, then we
1259          * can reject the current origin if the start class isn't found
1260          * at the current position. If we have a float-only match, then
1261          * rx_origin is constrained to a range; so look for the start class
1262          * in that range. if neither, then look for the start class in the
1263          * whole rest of the string */
1264
1265         /* XXX DAPM it's not clear what the minlen test is for, and why
1266          * it's not used in the floating case. Nothing in the test suite
1267          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1268          * Here are some old comments, which may or may not be correct:
1269          *
1270          *   minlen == 0 is possible if regstclass is \b or \B,
1271          *   and the fixed substr is ''$.
1272          *   Since minlen is already taken into account, rx_origin+1 is
1273          *   before strend; accidentally, minlen >= 1 guaranties no false
1274          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1275          *   0) below assumes that regstclass does not come from lookahead...
1276          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1277          *   This leaves EXACTF-ish only, which are dealt with in
1278          *   find_byclass().
1279          */
1280
1281         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1282             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1283         else if (prog->float_substr || prog->float_utf8) {
1284             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1285             endpos= HOP3c(rx_max_float, cl_l, strend);
1286         }
1287         else
1288             endpos= strend;
1289
1290         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1291             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1292             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1293               (IV)start_shift, (IV)(check_at - strbeg),
1294               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1295
1296         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1297                             reginfo);
1298         if (!s) {
1299             if (endpos == strend) {
1300                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1301                                 "  Could not match STCLASS...\n") );
1302                 goto fail;
1303             }
1304             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1305                                "  This position contradicts STCLASS...\n") );
1306             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1307                         && !(prog->intflags & PREGf_IMPLICIT))
1308                 goto fail;
1309
1310             /* Contradict one of substrings */
1311             if (prog->anchored_substr || prog->anchored_utf8) {
1312                 if (prog->substrs->check_ix == 1) { /* check is float */
1313                     /* Have both, check_string is floating */
1314                     assert(rx_origin + start_shift <= check_at);
1315                     if (rx_origin + start_shift != check_at) {
1316                         /* not at latest position float substr could match:
1317                          * Recheck anchored substring, but not floating.
1318                          * The condition above is in bytes rather than
1319                          * chars for efficiency. It's conservative, in
1320                          * that it errs on the side of doing 'goto
1321                          * do_other_substr'. In this case, at worst,
1322                          * an extra anchored search may get done, but in
1323                          * practice the extra fbm_instr() is likely to
1324                          * get skipped anyway. */
1325                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1326                             "  about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1327                             (long)(other_last - strbeg),
1328                             (IV)(rx_origin - strbeg)
1329                         ));
1330                         goto do_other_substr;
1331                     }
1332                 }
1333             }
1334             else {
1335                 /* float-only */
1336
1337                 if (ml_anch) {
1338                     /* In the presence of ml_anch, we might be able to
1339                      * find another \n without breaking the current float
1340                      * constraint. */
1341
1342                     /* strictly speaking this should be HOP3c(..., 1, ...),
1343                      * but since we goto a block of code that's going to
1344                      * search for the next \n if any, its safe here */
1345                     rx_origin++;
1346                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1347                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1348                               PL_colors[0], PL_colors[1],
1349                               (long)(rx_origin - strbeg)) );
1350                     goto postprocess_substr_matches;
1351                 }
1352
1353                 /* strictly speaking this can never be true; but might
1354                  * be if we ever allow intuit without substrings */
1355                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1356                     goto fail;
1357
1358                 rx_origin = rx_max_float;
1359             }
1360
1361             /* at this point, any matching substrings have been
1362              * contradicted. Start again... */
1363
1364             rx_origin = HOP3c(rx_origin, 1, strend);
1365
1366             /* uses bytes rather than char calculations for efficiency.
1367              * It's conservative: it errs on the side of doing 'goto restart',
1368              * where there is code that does a proper char-based test */
1369             if (rx_origin + start_shift + end_shift > strend) {
1370                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1371                                        "  Could not match STCLASS...\n") );
1372                 goto fail;
1373             }
1374             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1375                 "  about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1376                 (prog->substrs->check_ix ? "floating" : "anchored"),
1377                 (long)(rx_origin + start_shift - strbeg),
1378                 (IV)(rx_origin - strbeg)
1379             ));
1380             goto restart;
1381         }
1382
1383         /* Success !!! */
1384
1385         if (rx_origin != s) {
1386             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1387                         "  By STCLASS: moving %ld --> %ld\n",
1388                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1389                    );
1390         }
1391         else {
1392             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1393                                   "  Does not contradict STCLASS...\n");
1394                    );
1395         }
1396     }
1397
1398     /* Decide whether using the substrings helped */
1399
1400     if (rx_origin != strpos) {
1401         /* Fixed substring is found far enough so that the match
1402            cannot start at strpos. */
1403
1404         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1405         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1406     }
1407     else {
1408         /* The found rx_origin position does not prohibit matching at
1409          * strpos, so calling intuit didn't gain us anything. Decrement
1410          * the BmUSEFUL() count on the check substring, and if we reach
1411          * zero, free it.  */
1412         if (!(prog->intflags & PREGf_NAUGHTY)
1413             && (utf8_target ? (
1414                 prog->check_utf8                /* Could be deleted already */
1415                 && --BmUSEFUL(prog->check_utf8) < 0
1416                 && (prog->check_utf8 == prog->float_utf8)
1417             ) : (
1418                 prog->check_substr              /* Could be deleted already */
1419                 && --BmUSEFUL(prog->check_substr) < 0
1420                 && (prog->check_substr == prog->float_substr)
1421             )))
1422         {
1423             /* If flags & SOMETHING - do not do it many times on the same match */
1424             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1425             /* XXX Does the destruction order has to change with utf8_target? */
1426             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1427             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1428             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1429             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1430             check = NULL;                       /* abort */
1431             /* XXXX This is a remnant of the old implementation.  It
1432                     looks wasteful, since now INTUIT can use many
1433                     other heuristics. */
1434             prog->extflags &= ~RXf_USE_INTUIT;
1435         }
1436     }
1437
1438     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1440              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1441
1442     return rx_origin;
1443
1444   fail_finish:                          /* Substring not found */
1445     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1446         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1447   fail:
1448     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1449                           PL_colors[4], PL_colors[5]));
1450     return NULL;
1451 }
1452
1453
1454 #define DECL_TRIE_TYPE(scan) \
1455     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1456                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1457                  trie_utf8l, trie_flu8 }                                            \
1458                     trie_type = ((scan->flags == EXACT)                             \
1459                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1460                                  : (scan->flags == EXACTL)                          \
1461                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1462                                     : (scan->flags == EXACTFA)                      \
1463                                       ? (utf8_target                                \
1464                                          ? trie_utf8_exactfa_fold                   \
1465                                          : trie_latin_utf8_exactfa_fold)            \
1466                                       : (scan->flags == EXACTFLU8                   \
1467                                          ? trie_flu8                                \
1468                                          : (utf8_target                             \
1469                                            ? trie_utf8_fold                         \
1470                                            :   trie_latin_utf8_fold)))
1471
1472 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1473 STMT_START {                                                                        \
1474     STRLEN skiplen;                                                                 \
1475     U8 flags = FOLD_FLAGS_FULL;                                                     \
1476     switch (trie_type) {                                                            \
1477     case trie_flu8:                                                                 \
1478         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1479         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1480             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1481         }                                                                           \
1482         goto do_trie_utf8_fold;                                                     \
1483     case trie_utf8_exactfa_fold:                                                    \
1484         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1485         /* FALLTHROUGH */                                                           \
1486     case trie_utf8_fold:                                                            \
1487       do_trie_utf8_fold:                                                            \
1488         if ( foldlen>0 ) {                                                          \
1489             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1490             foldlen -= len;                                                         \
1491             uscan += len;                                                           \
1492             len=0;                                                                  \
1493         } else {                                                                    \
1494             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1495             len = UTF8SKIP(uc);                                                     \
1496             skiplen = UVCHR_SKIP( uvc );                                            \
1497             foldlen -= skiplen;                                                     \
1498             uscan = foldbuf + skiplen;                                              \
1499         }                                                                           \
1500         break;                                                                      \
1501     case trie_latin_utf8_exactfa_fold:                                              \
1502         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1503         /* FALLTHROUGH */                                                           \
1504     case trie_latin_utf8_fold:                                                      \
1505         if ( foldlen>0 ) {                                                          \
1506             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1507             foldlen -= len;                                                         \
1508             uscan += len;                                                           \
1509             len=0;                                                                  \
1510         } else {                                                                    \
1511             len = 1;                                                                \
1512             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1513             skiplen = UVCHR_SKIP( uvc );                                            \
1514             foldlen -= skiplen;                                                     \
1515             uscan = foldbuf + skiplen;                                              \
1516         }                                                                           \
1517         break;                                                                      \
1518     case trie_utf8l:                                                                \
1519         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1520         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1521             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1522         }                                                                           \
1523         /* FALLTHROUGH */                                                           \
1524     case trie_utf8:                                                                 \
1525         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1526         break;                                                                      \
1527     case trie_plain:                                                                \
1528         uvc = (UV)*uc;                                                              \
1529         len = 1;                                                                    \
1530     }                                                                               \
1531     if (uvc < 256) {                                                                \
1532         charid = trie->charmap[ uvc ];                                              \
1533     }                                                                               \
1534     else {                                                                          \
1535         charid = 0;                                                                 \
1536         if (widecharmap) {                                                          \
1537             SV** const svpp = hv_fetch(widecharmap,                                 \
1538                         (char*)&uvc, sizeof(UV), 0);                                \
1539             if (svpp)                                                               \
1540                 charid = (U16)SvIV(*svpp);                                          \
1541         }                                                                           \
1542     }                                                                               \
1543 } STMT_END
1544
1545 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1546     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1547                 startpos, doutf8, depth)
1548
1549 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1550 STMT_START {                                              \
1551     while (s <= e) {                                      \
1552         if ( (COND)                                       \
1553              && (ln == 1 || folder(s, pat_string, ln))    \
1554              && (reginfo->intuit || regtry(reginfo, &s)) )\
1555             goto got_it;                                  \
1556         s++;                                              \
1557     }                                                     \
1558 } STMT_END
1559
1560 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1561 STMT_START {                                          \
1562     while (s < strend) {                              \
1563         CODE                                          \
1564         s += UTF8SKIP(s);                             \
1565     }                                                 \
1566 } STMT_END
1567
1568 #define REXEC_FBC_SCAN(CODE)                          \
1569 STMT_START {                                          \
1570     while (s < strend) {                              \
1571         CODE                                          \
1572         s++;                                          \
1573     }                                                 \
1574 } STMT_END
1575
1576 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1577 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1578     if (COND) {                                                \
1579         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1580             goto got_it;                                       \
1581         else                                                   \
1582             tmp = doevery;                                     \
1583     }                                                          \
1584     else                                                       \
1585         tmp = 1;                                               \
1586 )
1587
1588 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1589 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1590     if (COND) {                                                \
1591         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1592             goto got_it;                                       \
1593         else                                                   \
1594             tmp = doevery;                                     \
1595     }                                                          \
1596     else                                                       \
1597         tmp = 1;                                               \
1598 )
1599
1600 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1601     if (utf8_target) {                                         \
1602         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1603     }                                                          \
1604     else {                                                     \
1605         REXEC_FBC_CLASS_SCAN(COND);                            \
1606     }
1607
1608 /* The three macros below are slightly different versions of the same logic.
1609  *
1610  * The first is for /a and /aa when the target string is UTF-8.  This can only
1611  * match ascii, but it must advance based on UTF-8.   The other two handle the
1612  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1613  * for the boundary (or non-boundary) between a word and non-word character.
1614  * The utf8 and non-utf8 cases have the same logic, but the details must be
1615  * different.  Find the "wordness" of the character just prior to this one, and
1616  * compare it with the wordness of this one.  If they differ, we have a
1617  * boundary.  At the beginning of the string, pretend that the previous
1618  * character was a new-line.
1619  *
1620  * All these macros uncleanly have side-effects with each other and outside
1621  * variables.  So far it's been too much trouble to clean-up
1622  *
1623  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1624  *               a word character or not.
1625  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1626  *               word/non-word
1627  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1628  *
1629  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1630  * are looking for a boundary or for a non-boundary.  If we are looking for a
1631  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1632  * see if this tentative match actually works, and if so, to quit the loop
1633  * here.  And vice-versa if we are looking for a non-boundary.
1634  *
1635  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1636  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1637  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1638  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1639  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1640  * complement.  But in that branch we complement tmp, meaning that at the
1641  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1642  * which means at the top of the loop in the next iteration, it is
1643  * TEST_NON_UTF8(s-1) */
1644 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1645     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1646     tmp = TEST_NON_UTF8(tmp);                                                  \
1647     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1648         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1649             tmp = !tmp;                                                        \
1650             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1651         }                                                                      \
1652         else {                                                                 \
1653             IF_FAIL;                                                           \
1654         }                                                                      \
1655     );                                                                         \
1656
1657 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1658  * TEST_UTF8 is a macro that for the same input code points returns identically
1659  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1660 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1661     if (s == reginfo->strbeg) {                                                \
1662         tmp = '\n';                                                            \
1663     }                                                                          \
1664     else { /* Back-up to the start of the previous character */                \
1665         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1666         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1667                                                        0, UTF8_ALLOW_DEFAULT); \
1668     }                                                                          \
1669     tmp = TEST_UV(tmp);                                                        \
1670     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1671     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1672         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1673             tmp = !tmp;                                                        \
1674             IF_SUCCESS;                                                        \
1675         }                                                                      \
1676         else {                                                                 \
1677             IF_FAIL;                                                           \
1678         }                                                                      \
1679     );
1680
1681 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1682  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1683  * macros below */
1684 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1685     if (utf8_target) {                                                         \
1686         UTF8_CODE                                                              \
1687     }                                                                          \
1688     else {  /* Not utf8 */                                                     \
1689         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1690         tmp = TEST_NON_UTF8(tmp);                                              \
1691         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1692             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1693                 IF_SUCCESS;                                                    \
1694                 tmp = !tmp;                                                    \
1695             }                                                                  \
1696             else {                                                             \
1697                 IF_FAIL;                                                       \
1698             }                                                                  \
1699         );                                                                     \
1700     }                                                                          \
1701     /* Here, things have been set up by the previous code so that tmp is the   \
1702      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1703      * utf8ness of the target).  We also have to check if this matches against \
1704      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1705      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1706      * string */                                                               \
1707     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1708         IF_SUCCESS;                                                            \
1709     }                                                                          \
1710     else {                                                                     \
1711         IF_FAIL;                                                               \
1712     }
1713
1714 /* This is the macro to use when we want to see if something that looks like it
1715  * could match, actually does, and if so exits the loop */
1716 #define REXEC_FBC_TRYIT                            \
1717     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1718         goto got_it
1719
1720 /* The only difference between the BOUND and NBOUND cases is that
1721  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1722  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1723  * with the other one being empty (PLACEHOLDER is defined as empty).
1724  *
1725  * The TEST_FOO parameters are for operating on different forms of input, but
1726  * all should be ones that return identically for the same underlying code
1727  * points */
1728 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1729     FBC_BOUND_COMMON(                                                          \
1730           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1731           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1732
1733 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1734     FBC_BOUND_COMMON(                                                          \
1735             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1736             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1737
1738 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1739     FBC_BOUND_COMMON(                                                          \
1740           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1741           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1742
1743 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1744     FBC_BOUND_COMMON(                                                          \
1745             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1746             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1747
1748 #ifdef DEBUGGING
1749 static IV
1750 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1751   IV cp_out = Perl__invlist_search(invlist, cp_in);
1752   assert(cp_out >= 0);
1753   return cp_out;
1754 }
1755 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1756         invmap[S_get_break_val_cp_checked(invlist, cp)]
1757 #else
1758 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1759         invmap[_invlist_search(invlist, cp)]
1760 #endif
1761
1762 /* Takes a pointer to an inversion list, a pointer to its corresponding
1763  * inversion map, and a code point, and returns the code point's value
1764  * according to the two arrays.  It assumes that all code points have a value.
1765  * This is used as the base macro for macros for particular properties */
1766 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1767         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1768
1769 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1770  * of a code point, returning the value for the first code point in the string.
1771  * And it takes the particular macro name that finds the desired value given a
1772  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1773 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1774              (__ASSERT_(pos < strend)                                          \
1775                  /* Note assumes is valid UTF-8 */                             \
1776              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1777
1778 /* Returns the GCB value for the input code point */
1779 #define getGCB_VAL_CP(cp)                                                      \
1780           _generic_GET_BREAK_VAL_CP(                                           \
1781                                     PL_GCB_invlist,                            \
1782                                     _Perl_GCB_invmap,                          \
1783                                     (cp))
1784
1785 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1786  * bounded by pos and strend */
1787 #define getGCB_VAL_UTF8(pos, strend)                                           \
1788     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1789
1790 /* Returns the LB value for the input code point */
1791 #define getLB_VAL_CP(cp)                                                       \
1792           _generic_GET_BREAK_VAL_CP(                                           \
1793                                     PL_LB_invlist,                             \
1794                                     _Perl_LB_invmap,                           \
1795                                     (cp))
1796
1797 /* Returns the LB value for the first code point in the UTF-8 encoded string
1798  * bounded by pos and strend */
1799 #define getLB_VAL_UTF8(pos, strend)                                            \
1800     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1801
1802
1803 /* Returns the SB value for the input code point */
1804 #define getSB_VAL_CP(cp)                                                       \
1805           _generic_GET_BREAK_VAL_CP(                                           \
1806                                     PL_SB_invlist,                             \
1807                                     _Perl_SB_invmap,                     \
1808                                     (cp))
1809
1810 /* Returns the SB value for the first code point in the UTF-8 encoded string
1811  * bounded by pos and strend */
1812 #define getSB_VAL_UTF8(pos, strend)                                            \
1813     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1814
1815 /* Returns the WB value for the input code point */
1816 #define getWB_VAL_CP(cp)                                                       \
1817           _generic_GET_BREAK_VAL_CP(                                           \
1818                                     PL_WB_invlist,                             \
1819                                     _Perl_WB_invmap,                         \
1820                                     (cp))
1821
1822 /* Returns the WB value for the first code point in the UTF-8 encoded string
1823  * bounded by pos and strend */
1824 #define getWB_VAL_UTF8(pos, strend)                                            \
1825     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1826
1827 /* We know what class REx starts with.  Try to find this position... */
1828 /* if reginfo->intuit, its a dryrun */
1829 /* annoyingly all the vars in this routine have different names from their counterparts
1830    in regmatch. /grrr */
1831 STATIC char *
1832 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1833     const char *strend, regmatch_info *reginfo)
1834 {
1835     dVAR;
1836     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1837     char *pat_string;   /* The pattern's exactish string */
1838     char *pat_end;          /* ptr to end char of pat_string */
1839     re_fold_t folder;   /* Function for computing non-utf8 folds */
1840     const U8 *fold_array;   /* array for folding ords < 256 */
1841     STRLEN ln;
1842     STRLEN lnc;
1843     U8 c1;
1844     U8 c2;
1845     char *e;
1846     I32 tmp = 1;        /* Scratch variable? */
1847     const bool utf8_target = reginfo->is_utf8_target;
1848     UV utf8_fold_flags = 0;
1849     const bool is_utf8_pat = reginfo->is_utf8_pat;
1850     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1851                                    with a result inverts that result, as 0^1 =
1852                                    1 and 1^1 = 0 */
1853     _char_class_number classnum;
1854
1855     RXi_GET_DECL(prog,progi);
1856
1857     PERL_ARGS_ASSERT_FIND_BYCLASS;
1858
1859     /* We know what class it must start with. */
1860     switch (OP(c)) {
1861     case ANYOFL:
1862         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1863
1864         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1865             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1866         }
1867
1868         /* FALLTHROUGH */
1869     case ANYOFD:
1870     case ANYOF:
1871         if (utf8_target) {
1872             REXEC_FBC_UTF8_CLASS_SCAN(
1873                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1874         }
1875         else {
1876             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s, 0));
1877         }
1878         break;
1879
1880     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1881         assert(! is_utf8_pat);
1882         /* FALLTHROUGH */
1883     case EXACTFA:
1884         if (is_utf8_pat || utf8_target) {
1885             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1886             goto do_exactf_utf8;
1887         }
1888         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1889         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1890         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1891
1892     case EXACTF:   /* This node only generated for non-utf8 patterns */
1893         assert(! is_utf8_pat);
1894         if (utf8_target) {
1895             utf8_fold_flags = 0;
1896             goto do_exactf_utf8;
1897         }
1898         fold_array = PL_fold;
1899         folder = foldEQ;
1900         goto do_exactf_non_utf8;
1901
1902     case EXACTFL:
1903         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1904         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1905             utf8_fold_flags = FOLDEQ_LOCALE;
1906             goto do_exactf_utf8;
1907         }
1908         fold_array = PL_fold_locale;
1909         folder = foldEQ_locale;
1910         goto do_exactf_non_utf8;
1911
1912     case EXACTFU_SS:
1913         if (is_utf8_pat) {
1914             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1915         }
1916         goto do_exactf_utf8;
1917
1918     case EXACTFLU8:
1919             if (! utf8_target) {    /* All code points in this node require
1920                                        UTF-8 to express.  */
1921                 break;
1922             }
1923             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1924                                              | FOLDEQ_S2_FOLDS_SANE;
1925             goto do_exactf_utf8;
1926
1927     case EXACTFU:
1928         if (is_utf8_pat || utf8_target) {
1929             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1930             goto do_exactf_utf8;
1931         }
1932
1933         /* Any 'ss' in the pattern should have been replaced by regcomp,
1934          * so we don't have to worry here about this single special case
1935          * in the Latin1 range */
1936         fold_array = PL_fold_latin1;
1937         folder = foldEQ_latin1;
1938
1939         /* FALLTHROUGH */
1940
1941       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1942                            are no glitches with fold-length differences
1943                            between the target string and pattern */
1944
1945         /* The idea in the non-utf8 EXACTF* cases is to first find the
1946          * first character of the EXACTF* node and then, if necessary,
1947          * case-insensitively compare the full text of the node.  c1 is the
1948          * first character.  c2 is its fold.  This logic will not work for
1949          * Unicode semantics and the german sharp ss, which hence should
1950          * not be compiled into a node that gets here. */
1951         pat_string = STRING(c);
1952         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1953
1954         /* We know that we have to match at least 'ln' bytes (which is the
1955          * same as characters, since not utf8).  If we have to match 3
1956          * characters, and there are only 2 availabe, we know without
1957          * trying that it will fail; so don't start a match past the
1958          * required minimum number from the far end */
1959         e = HOP3c(strend, -((SSize_t)ln), s);
1960
1961         if (reginfo->intuit && e < s) {
1962             e = s;                      /* Due to minlen logic of intuit() */
1963         }
1964
1965         c1 = *pat_string;
1966         c2 = fold_array[c1];
1967         if (c1 == c2) { /* If char and fold are the same */
1968             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1969         }
1970         else {
1971             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1972         }
1973         break;
1974
1975       do_exactf_utf8:
1976       {
1977         unsigned expansion;
1978
1979         /* If one of the operands is in utf8, we can't use the simpler folding
1980          * above, due to the fact that many different characters can have the
1981          * same fold, or portion of a fold, or different- length fold */
1982         pat_string = STRING(c);
1983         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1984         pat_end = pat_string + ln;
1985         lnc = is_utf8_pat       /* length to match in characters */
1986                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1987                 : ln;
1988
1989         /* We have 'lnc' characters to match in the pattern, but because of
1990          * multi-character folding, each character in the target can match
1991          * up to 3 characters (Unicode guarantees it will never exceed
1992          * this) if it is utf8-encoded; and up to 2 if not (based on the
1993          * fact that the Latin 1 folds are already determined, and the
1994          * only multi-char fold in that range is the sharp-s folding to
1995          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1996          * string character.  Adjust lnc accordingly, rounding up, so that
1997          * if we need to match at least 4+1/3 chars, that really is 5. */
1998         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1999         lnc = (lnc + expansion - 1) / expansion;
2000
2001         /* As in the non-UTF8 case, if we have to match 3 characters, and
2002          * only 2 are left, it's guaranteed to fail, so don't start a
2003          * match that would require us to go beyond the end of the string
2004          */
2005         e = HOP3c(strend, -((SSize_t)lnc), s);
2006
2007         if (reginfo->intuit && e < s) {
2008             e = s;                      /* Due to minlen logic of intuit() */
2009         }
2010
2011         /* XXX Note that we could recalculate e to stop the loop earlier,
2012          * as the worst case expansion above will rarely be met, and as we
2013          * go along we would usually find that e moves further to the left.
2014          * This would happen only after we reached the point in the loop
2015          * where if there were no expansion we should fail.  Unclear if
2016          * worth the expense */
2017
2018         while (s <= e) {
2019             char *my_strend= (char *)strend;
2020             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2021                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2022                 && (reginfo->intuit || regtry(reginfo, &s)) )
2023             {
2024                 goto got_it;
2025             }
2026             s += (utf8_target) ? UTF8SKIP(s) : 1;
2027         }
2028         break;
2029     }
2030
2031     case BOUNDL:
2032         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2033         if (FLAGS(c) != TRADITIONAL_BOUND) {
2034             if (! IN_UTF8_CTYPE_LOCALE) {
2035                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2036                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2037             }
2038             goto do_boundu;
2039         }
2040
2041         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2042         break;
2043
2044     case NBOUNDL:
2045         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2046         if (FLAGS(c) != TRADITIONAL_BOUND) {
2047             if (! IN_UTF8_CTYPE_LOCALE) {
2048                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2049                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2050             }
2051             goto do_nboundu;
2052         }
2053
2054         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2055         break;
2056
2057     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2058                    meaning */
2059         assert(FLAGS(c) == TRADITIONAL_BOUND);
2060
2061         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2062         break;
2063
2064     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2065                    meaning */
2066         assert(FLAGS(c) == TRADITIONAL_BOUND);
2067
2068         FBC_BOUND_A(isWORDCHAR_A);
2069         break;
2070
2071     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2072                    meaning */
2073         assert(FLAGS(c) == TRADITIONAL_BOUND);
2074
2075         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2076         break;
2077
2078     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2079                    meaning */
2080         assert(FLAGS(c) == TRADITIONAL_BOUND);
2081
2082         FBC_NBOUND_A(isWORDCHAR_A);
2083         break;
2084
2085     case NBOUNDU:
2086         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2087             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2088             break;
2089         }
2090
2091       do_nboundu:
2092
2093         to_complement = 1;
2094         /* FALLTHROUGH */
2095
2096     case BOUNDU:
2097       do_boundu:
2098         switch((bound_type) FLAGS(c)) {
2099             case TRADITIONAL_BOUND:
2100                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2101                 break;
2102             case GCB_BOUND:
2103                 if (s == reginfo->strbeg) {
2104                     if (reginfo->intuit || regtry(reginfo, &s))
2105                     {
2106                         goto got_it;
2107                     }
2108
2109                     /* Didn't match.  Try at the next position (if there is one) */
2110                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2111                     if (UNLIKELY(s >= reginfo->strend)) {
2112                         break;
2113                     }
2114                 }
2115
2116                 if (utf8_target) {
2117                     GCB_enum before = getGCB_VAL_UTF8(
2118                                                reghop3((U8*)s, -1,
2119                                                        (U8*)(reginfo->strbeg)),
2120                                                (U8*) reginfo->strend);
2121                     while (s < strend) {
2122                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2123                                                         (U8*) reginfo->strend);
2124                         if (   (to_complement ^ isGCB(before,
2125                                                       after,
2126                                                       (U8*) reginfo->strbeg,
2127                                                       (U8*) s,
2128                                                       utf8_target))
2129                             && (reginfo->intuit || regtry(reginfo, &s)))
2130                         {
2131                             goto got_it;
2132                         }
2133                         before = after;
2134                         s += UTF8SKIP(s);
2135                     }
2136                 }
2137                 else {  /* Not utf8.  Everything is a GCB except between CR and
2138                            LF */
2139                     while (s < strend) {
2140                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2141                                               || UCHARAT(s) != '\n'))
2142                             && (reginfo->intuit || regtry(reginfo, &s)))
2143                         {
2144                             goto got_it;
2145                         }
2146                         s++;
2147                     }
2148                 }
2149
2150                 /* And, since this is a bound, it can match after the final
2151                  * character in the string */
2152                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2153                     goto got_it;
2154                 }
2155                 break;
2156
2157             case LB_BOUND:
2158                 if (s == reginfo->strbeg) {
2159                     if (reginfo->intuit || regtry(reginfo, &s)) {
2160                         goto got_it;
2161                     }
2162                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2163                     if (UNLIKELY(s >= reginfo->strend)) {
2164                         break;
2165                     }
2166                 }
2167
2168                 if (utf8_target) {
2169                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2170                                                                -1,
2171                                                                (U8*)(reginfo->strbeg)),
2172                                                        (U8*) reginfo->strend);
2173                     while (s < strend) {
2174                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2175                         if (to_complement ^ isLB(before,
2176                                                  after,
2177                                                  (U8*) reginfo->strbeg,
2178                                                  (U8*) s,
2179                                                  (U8*) reginfo->strend,
2180                                                  utf8_target)
2181                             && (reginfo->intuit || regtry(reginfo, &s)))
2182                         {
2183                             goto got_it;
2184                         }
2185                         before = after;
2186                         s += UTF8SKIP(s);
2187                     }
2188                 }
2189                 else {  /* Not utf8. */
2190                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2191                     while (s < strend) {
2192                         LB_enum after = getLB_VAL_CP((U8) *s);
2193                         if (to_complement ^ isLB(before,
2194                                                  after,
2195                                                  (U8*) reginfo->strbeg,
2196                                                  (U8*) s,
2197                                                  (U8*) reginfo->strend,
2198                                                  utf8_target)
2199                             && (reginfo->intuit || regtry(reginfo, &s)))
2200                         {
2201                             goto got_it;
2202                         }
2203                         before = after;
2204                         s++;
2205                     }
2206                 }
2207
2208                 if (reginfo->intuit || regtry(reginfo, &s)) {
2209                     goto got_it;
2210                 }
2211
2212                 break;
2213
2214             case SB_BOUND:
2215                 if (s == reginfo->strbeg) {
2216                     if (reginfo->intuit || regtry(reginfo, &s)) {
2217                         goto got_it;
2218                     }
2219                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2220                     if (UNLIKELY(s >= reginfo->strend)) {
2221                         break;
2222                     }
2223                 }
2224
2225                 if (utf8_target) {
2226                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2227                                                         -1,
2228                                                         (U8*)(reginfo->strbeg)),
2229                                                       (U8*) reginfo->strend);
2230                     while (s < strend) {
2231                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2232                                                          (U8*) reginfo->strend);
2233                         if ((to_complement ^ isSB(before,
2234                                                   after,
2235                                                   (U8*) reginfo->strbeg,
2236                                                   (U8*) s,
2237                                                   (U8*) reginfo->strend,
2238                                                   utf8_target))
2239                             && (reginfo->intuit || regtry(reginfo, &s)))
2240                         {
2241                             goto got_it;
2242                         }
2243                         before = after;
2244                         s += UTF8SKIP(s);
2245                     }
2246                 }
2247                 else {  /* Not utf8. */
2248                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2249                     while (s < strend) {
2250                         SB_enum after = getSB_VAL_CP((U8) *s);
2251                         if ((to_complement ^ isSB(before,
2252                                                   after,
2253                                                   (U8*) reginfo->strbeg,
2254                                                   (U8*) s,
2255                                                   (U8*) reginfo->strend,
2256                                                   utf8_target))
2257                             && (reginfo->intuit || regtry(reginfo, &s)))
2258                         {
2259                             goto got_it;
2260                         }
2261                         before = after;
2262                         s++;
2263                     }
2264                 }
2265
2266                 /* Here are at the final position in the target string.  The SB
2267                  * value is always true here, so matches, depending on other
2268                  * constraints */
2269                 if (reginfo->intuit || regtry(reginfo, &s)) {
2270                     goto got_it;
2271                 }
2272
2273                 break;
2274
2275             case WB_BOUND:
2276                 if (s == reginfo->strbeg) {
2277                     if (reginfo->intuit || regtry(reginfo, &s)) {
2278                         goto got_it;
2279                     }
2280                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2281                     if (UNLIKELY(s >= reginfo->strend)) {
2282                         break;
2283                     }
2284                 }
2285
2286                 if (utf8_target) {
2287                     /* We are at a boundary between char_sub_0 and char_sub_1.
2288                      * We also keep track of the value for char_sub_-1 as we
2289                      * loop through the line.   Context may be needed to make a
2290                      * determination, and if so, this can save having to
2291                      * recalculate it */
2292                     WB_enum previous = WB_UNKNOWN;
2293                     WB_enum before = getWB_VAL_UTF8(
2294                                               reghop3((U8*)s,
2295                                                       -1,
2296                                                       (U8*)(reginfo->strbeg)),
2297                                               (U8*) reginfo->strend);
2298                     while (s < strend) {
2299                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2300                                                         (U8*) reginfo->strend);
2301                         if ((to_complement ^ isWB(previous,
2302                                                   before,
2303                                                   after,
2304                                                   (U8*) reginfo->strbeg,
2305                                                   (U8*) s,
2306                                                   (U8*) reginfo->strend,
2307                                                   utf8_target))
2308                             && (reginfo->intuit || regtry(reginfo, &s)))
2309                         {
2310                             goto got_it;
2311                         }
2312                         previous = before;
2313                         before = after;
2314                         s += UTF8SKIP(s);
2315                     }
2316                 }
2317                 else {  /* Not utf8. */
2318                     WB_enum previous = WB_UNKNOWN;
2319                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2320                     while (s < strend) {
2321                         WB_enum after = getWB_VAL_CP((U8) *s);
2322                         if ((to_complement ^ isWB(previous,
2323                                                   before,
2324                                                   after,
2325                                                   (U8*) reginfo->strbeg,
2326                                                   (U8*) s,
2327                                                   (U8*) reginfo->strend,
2328                                                   utf8_target))
2329                             && (reginfo->intuit || regtry(reginfo, &s)))
2330                         {
2331                             goto got_it;
2332                         }
2333                         previous = before;
2334                         before = after;
2335                         s++;
2336                     }
2337                 }
2338
2339                 if (reginfo->intuit || regtry(reginfo, &s)) {
2340                     goto got_it;
2341                 }
2342         }
2343         break;
2344
2345     case LNBREAK:
2346         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2347                         is_LNBREAK_latin1_safe(s, strend)
2348         );
2349         break;
2350
2351     /* The argument to all the POSIX node types is the class number to pass to
2352      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2353
2354     case NPOSIXL:
2355         to_complement = 1;
2356         /* FALLTHROUGH */
2357
2358     case POSIXL:
2359         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2360         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2361                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2362         break;
2363
2364     case NPOSIXD:
2365         to_complement = 1;
2366         /* FALLTHROUGH */
2367
2368     case POSIXD:
2369         if (utf8_target) {
2370             goto posix_utf8;
2371         }
2372         goto posixa;
2373
2374     case NPOSIXA:
2375         if (utf8_target) {
2376             /* The complement of something that matches only ASCII matches all
2377              * non-ASCII, plus everything in ASCII that isn't in the class. */
2378             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2379                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2380             break;
2381         }
2382
2383         to_complement = 1;
2384         /* FALLTHROUGH */
2385
2386     case POSIXA:
2387       posixa:
2388         /* Don't need to worry about utf8, as it can match only a single
2389          * byte invariant character. */
2390         REXEC_FBC_CLASS_SCAN(
2391                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2392         break;
2393
2394     case NPOSIXU:
2395         to_complement = 1;
2396         /* FALLTHROUGH */
2397
2398     case POSIXU:
2399         if (! utf8_target) {
2400             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2401                                                                     FLAGS(c))));
2402         }
2403         else {
2404
2405           posix_utf8:
2406             classnum = (_char_class_number) FLAGS(c);
2407             if (classnum < _FIRST_NON_SWASH_CC) {
2408                 while (s < strend) {
2409
2410                     /* We avoid loading in the swash as long as possible, but
2411                      * should we have to, we jump to a separate loop.  This
2412                      * extra 'if' statement is what keeps this code from being
2413                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2414                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2415                         goto found_above_latin1;
2416                     }
2417                     if ((UTF8_IS_INVARIANT(*s)
2418                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2419                                                                 classnum)))
2420                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2421                             && to_complement ^ cBOOL(
2422                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2423                                                                       *(s + 1)),
2424                                               classnum))))
2425                     {
2426                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2427                             goto got_it;
2428                         else {
2429                             tmp = doevery;
2430                         }
2431                     }
2432                     else {
2433                         tmp = 1;
2434                     }
2435                     s += UTF8SKIP(s);
2436                 }
2437             }
2438             else switch (classnum) {    /* These classes are implemented as
2439                                            macros */
2440                 case _CC_ENUM_SPACE:
2441                     REXEC_FBC_UTF8_CLASS_SCAN(
2442                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2443                     break;
2444
2445                 case _CC_ENUM_BLANK:
2446                     REXEC_FBC_UTF8_CLASS_SCAN(
2447                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2448                     break;
2449
2450                 case _CC_ENUM_XDIGIT:
2451                     REXEC_FBC_UTF8_CLASS_SCAN(
2452                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2453                     break;
2454
2455                 case _CC_ENUM_VERTSPACE:
2456                     REXEC_FBC_UTF8_CLASS_SCAN(
2457                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2458                     break;
2459
2460                 case _CC_ENUM_CNTRL:
2461                     REXEC_FBC_UTF8_CLASS_SCAN(
2462                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2463                     break;
2464
2465                 default:
2466                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2467                     NOT_REACHED; /* NOTREACHED */
2468             }
2469         }
2470         break;
2471
2472       found_above_latin1:   /* Here we have to load a swash to get the result
2473                                for the current code point */
2474         if (! PL_utf8_swash_ptrs[classnum]) {
2475             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2476             PL_utf8_swash_ptrs[classnum] =
2477                     _core_swash_init("utf8",
2478                                      "",
2479                                      &PL_sv_undef, 1, 0,
2480                                      PL_XPosix_ptrs[classnum], &flags);
2481         }
2482
2483         /* This is a copy of the loop above for swash classes, though using the
2484          * FBC macro instead of being expanded out.  Since we've loaded the
2485          * swash, we don't have to check for that each time through the loop */
2486         REXEC_FBC_UTF8_CLASS_SCAN(
2487                 to_complement ^ cBOOL(_generic_utf8(
2488                                       classnum,
2489                                       s,
2490                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2491                                                   (U8 *) s, TRUE))));
2492         break;
2493
2494     case AHOCORASICKC:
2495     case AHOCORASICK:
2496         {
2497             DECL_TRIE_TYPE(c);
2498             /* what trie are we using right now */
2499             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2500             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2501             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2502
2503             const char *last_start = strend - trie->minlen;
2504 #ifdef DEBUGGING
2505             const char *real_start = s;
2506 #endif
2507             STRLEN maxlen = trie->maxlen;
2508             SV *sv_points;
2509             U8 **points; /* map of where we were in the input string
2510                             when reading a given char. For ASCII this
2511                             is unnecessary overhead as the relationship
2512                             is always 1:1, but for Unicode, especially
2513                             case folded Unicode this is not true. */
2514             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2515             U8 *bitmap=NULL;
2516
2517
2518             GET_RE_DEBUG_FLAGS_DECL;
2519
2520             /* We can't just allocate points here. We need to wrap it in
2521              * an SV so it gets freed properly if there is a croak while
2522              * running the match */
2523             ENTER;
2524             SAVETMPS;
2525             sv_points=newSV(maxlen * sizeof(U8 *));
2526             SvCUR_set(sv_points,
2527                 maxlen * sizeof(U8 *));
2528             SvPOK_on(sv_points);
2529             sv_2mortal(sv_points);
2530             points=(U8**)SvPV_nolen(sv_points );
2531             if ( trie_type != trie_utf8_fold
2532                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2533             {
2534                 if (trie->bitmap)
2535                     bitmap=(U8*)trie->bitmap;
2536                 else
2537                     bitmap=(U8*)ANYOF_BITMAP(c);
2538             }
2539             /* this is the Aho-Corasick algorithm modified a touch
2540                to include special handling for long "unknown char" sequences.
2541                The basic idea being that we use AC as long as we are dealing
2542                with a possible matching char, when we encounter an unknown char
2543                (and we have not encountered an accepting state) we scan forward
2544                until we find a legal starting char.
2545                AC matching is basically that of trie matching, except that when
2546                we encounter a failing transition, we fall back to the current
2547                states "fail state", and try the current char again, a process
2548                we repeat until we reach the root state, state 1, or a legal
2549                transition. If we fail on the root state then we can either
2550                terminate if we have reached an accepting state previously, or
2551                restart the entire process from the beginning if we have not.
2552
2553              */
2554             while (s <= last_start) {
2555                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2556                 U8 *uc = (U8*)s;
2557                 U16 charid = 0;
2558                 U32 base = 1;
2559                 U32 state = 1;
2560                 UV uvc = 0;
2561                 STRLEN len = 0;
2562                 STRLEN foldlen = 0;
2563                 U8 *uscan = (U8*)NULL;
2564                 U8 *leftmost = NULL;
2565 #ifdef DEBUGGING
2566                 U32 accepted_word= 0;
2567 #endif
2568                 U32 pointpos = 0;
2569
2570                 while ( state && uc <= (U8*)strend ) {
2571                     int failed=0;
2572                     U32 word = aho->states[ state ].wordnum;
2573
2574                     if( state==1 ) {
2575                         if ( bitmap ) {
2576                             DEBUG_TRIE_EXECUTE_r(
2577                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2578                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2579                                         (char *)uc, utf8_target, 0 );
2580                                     Perl_re_printf( aTHX_
2581                                         " Scanning for legal start char...\n");
2582                                 }
2583                             );
2584                             if (utf8_target) {
2585                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2586                                     uc += UTF8SKIP(uc);
2587                                 }
2588                             } else {
2589                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2590                                     uc++;
2591                                 }
2592                             }
2593                             s= (char *)uc;
2594                         }
2595                         if (uc >(U8*)last_start) break;
2596                     }
2597
2598                     if ( word ) {
2599                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2600                         if (!leftmost || lpos < leftmost) {
2601                             DEBUG_r(accepted_word=word);
2602                             leftmost= lpos;
2603                         }
2604                         if (base==0) break;
2605
2606                     }
2607                     points[pointpos++ % maxlen]= uc;
2608                     if (foldlen || uc < (U8*)strend) {
2609                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2610                                          widecharmap, uc,
2611                                          uscan, len, uvc, charid, foldlen,
2612                                          foldbuf, uniflags);
2613                         DEBUG_TRIE_EXECUTE_r({
2614                             dump_exec_pos( (char *)uc, c, strend,
2615                                         real_start, s, utf8_target, 0);
2616                             Perl_re_printf( aTHX_
2617                                 " Charid:%3u CP:%4"UVxf" ",
2618                                  charid, uvc);
2619                         });
2620                     }
2621                     else {
2622                         len = 0;
2623                         charid = 0;
2624                     }
2625
2626
2627                     do {
2628 #ifdef DEBUGGING
2629                         word = aho->states[ state ].wordnum;
2630 #endif
2631                         base = aho->states[ state ].trans.base;
2632
2633                         DEBUG_TRIE_EXECUTE_r({
2634                             if (failed)
2635                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2636                                     s,   utf8_target, 0 );
2637                             Perl_re_printf( aTHX_
2638                                 "%sState: %4"UVxf", word=%"UVxf,
2639                                 failed ? " Fail transition to " : "",
2640                                 (UV)state, (UV)word);
2641                         });
2642                         if ( base ) {
2643                             U32 tmp;
2644                             I32 offset;
2645                             if (charid &&
2646                                  ( ((offset = base + charid
2647                                     - 1 - trie->uniquecharcount)) >= 0)
2648                                  && ((U32)offset < trie->lasttrans)
2649                                  && trie->trans[offset].check == state
2650                                  && (tmp=trie->trans[offset].next))
2651                             {
2652                                 DEBUG_TRIE_EXECUTE_r(
2653                                     Perl_re_printf( aTHX_ " - legal\n"));
2654                                 state = tmp;
2655                                 break;
2656                             }
2657                             else {
2658                                 DEBUG_TRIE_EXECUTE_r(
2659                                     Perl_re_printf( aTHX_ " - fail\n"));
2660                                 failed = 1;
2661                                 state = aho->fail[state];
2662                             }
2663                         }
2664                         else {
2665                             /* we must be accepting here */
2666                             DEBUG_TRIE_EXECUTE_r(
2667                                     Perl_re_printf( aTHX_ " - accepting\n"));
2668                             failed = 1;
2669                             break;
2670                         }
2671                     } while(state);
2672                     uc += len;
2673                     if (failed) {
2674                         if (leftmost)
2675                             break;
2676                         if (!state) state = 1;
2677                     }
2678                 }
2679                 if ( aho->states[ state ].wordnum ) {
2680                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2681                     if (!leftmost || lpos < leftmost) {
2682                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2683                         leftmost = lpos;
2684                     }
2685                 }
2686                 if (leftmost) {
2687                     s = (char*)leftmost;
2688                     DEBUG_TRIE_EXECUTE_r({
2689                         Perl_re_printf( aTHX_  "Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2690                             (UV)accepted_word, (IV)(s - real_start)
2691                         );
2692                     });
2693                     if (reginfo->intuit || regtry(reginfo, &s)) {
2694                         FREETMPS;
2695                         LEAVE;
2696                         goto got_it;
2697                     }
2698                     s = HOPc(s,1);
2699                     DEBUG_TRIE_EXECUTE_r({
2700                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2701                     });
2702                 } else {
2703                     DEBUG_TRIE_EXECUTE_r(
2704                         Perl_re_printf( aTHX_ "No match.\n"));
2705                     break;
2706                 }
2707             }
2708             FREETMPS;
2709             LEAVE;
2710         }
2711         break;
2712     default:
2713         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2714     }
2715     return 0;
2716   got_it:
2717     return s;
2718 }
2719
2720 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2721  * flags have same meanings as with regexec_flags() */
2722
2723 static void
2724 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2725                             char *strbeg,
2726                             char *strend,
2727                             SV *sv,
2728                             U32 flags,
2729                             bool utf8_target)
2730 {
2731     struct regexp *const prog = ReANY(rx);
2732
2733     if (flags & REXEC_COPY_STR) {
2734 #ifdef PERL_ANY_COW
2735         if (SvCANCOW(sv)) {
2736             DEBUG_C(Perl_re_printf( aTHX_
2737                               "Copy on write: regexp capture, type %d\n",
2738                                     (int) SvTYPE(sv)));
2739             /* Create a new COW SV to share the match string and store
2740              * in saved_copy, unless the current COW SV in saved_copy
2741              * is valid and suitable for our purpose */
2742             if ((   prog->saved_copy
2743                  && SvIsCOW(prog->saved_copy)
2744                  && SvPOKp(prog->saved_copy)
2745                  && SvIsCOW(sv)
2746                  && SvPOKp(sv)
2747                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2748             {
2749                 /* just reuse saved_copy SV */
2750                 if (RXp_MATCH_COPIED(prog)) {
2751                     Safefree(prog->subbeg);
2752                     RXp_MATCH_COPIED_off(prog);
2753                 }
2754             }
2755             else {
2756                 /* create new COW SV to share string */
2757                 RX_MATCH_COPY_FREE(rx);
2758                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2759             }
2760             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2761             assert (SvPOKp(prog->saved_copy));
2762             prog->sublen  = strend - strbeg;
2763             prog->suboffset = 0;
2764             prog->subcoffset = 0;
2765         } else
2766 #endif
2767         {
2768             SSize_t min = 0;
2769             SSize_t max = strend - strbeg;
2770             SSize_t sublen;
2771
2772             if (    (flags & REXEC_COPY_SKIP_POST)
2773                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2774                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2775             ) { /* don't copy $' part of string */
2776                 U32 n = 0;
2777                 max = -1;
2778                 /* calculate the right-most part of the string covered
2779                  * by a capture. Due to lookahead, this may be to
2780                  * the right of $&, so we have to scan all captures */
2781                 while (n <= prog->lastparen) {
2782                     if (prog->offs[n].end > max)
2783                         max = prog->offs[n].end;
2784                     n++;
2785                 }
2786                 if (max == -1)
2787                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2788                             ? prog->offs[0].start
2789                             : 0;
2790                 assert(max >= 0 && max <= strend - strbeg);
2791             }
2792
2793             if (    (flags & REXEC_COPY_SKIP_PRE)
2794                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2795                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2796             ) { /* don't copy $` part of string */
2797                 U32 n = 0;
2798                 min = max;
2799                 /* calculate the left-most part of the string covered
2800                  * by a capture. Due to lookbehind, this may be to
2801                  * the left of $&, so we have to scan all captures */
2802                 while (min && n <= prog->lastparen) {
2803                     if (   prog->offs[n].start != -1
2804                         && prog->offs[n].start < min)
2805                     {
2806                         min = prog->offs[n].start;
2807                     }
2808                     n++;
2809                 }
2810                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2811                     && min >  prog->offs[0].end
2812                 )
2813                     min = prog->offs[0].end;
2814
2815             }
2816
2817             assert(min >= 0 && min <= max && min <= strend - strbeg);
2818             sublen = max - min;
2819
2820             if (RX_MATCH_COPIED(rx)) {
2821                 if (sublen > prog->sublen)
2822                     prog->subbeg =
2823                             (char*)saferealloc(prog->subbeg, sublen+1);
2824             }
2825             else
2826                 prog->subbeg = (char*)safemalloc(sublen+1);
2827             Copy(strbeg + min, prog->subbeg, sublen, char);
2828             prog->subbeg[sublen] = '\0';
2829             prog->suboffset = min;
2830             prog->sublen = sublen;
2831             RX_MATCH_COPIED_on(rx);
2832         }
2833         prog->subcoffset = prog->suboffset;
2834         if (prog->suboffset && utf8_target) {
2835             /* Convert byte offset to chars.
2836              * XXX ideally should only compute this if @-/@+
2837              * has been seen, a la PL_sawampersand ??? */
2838
2839             /* If there's a direct correspondence between the
2840              * string which we're matching and the original SV,
2841              * then we can use the utf8 len cache associated with
2842              * the SV. In particular, it means that under //g,
2843              * sv_pos_b2u() will use the previously cached
2844              * position to speed up working out the new length of
2845              * subcoffset, rather than counting from the start of
2846              * the string each time. This stops
2847              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2848              * from going quadratic */
2849             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2850                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2851                                                 SV_GMAGIC|SV_CONST_RETURN);
2852             else
2853                 prog->subcoffset = utf8_length((U8*)strbeg,
2854                                     (U8*)(strbeg+prog->suboffset));
2855         }
2856     }
2857     else {
2858         RX_MATCH_COPY_FREE(rx);
2859         prog->subbeg = strbeg;
2860         prog->suboffset = 0;
2861         prog->subcoffset = 0;
2862         prog->sublen = strend - strbeg;
2863     }
2864 }
2865
2866
2867
2868
2869 /*
2870  - regexec_flags - match a regexp against a string
2871  */
2872 I32
2873 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2874               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2875 /* stringarg: the point in the string at which to begin matching */
2876 /* strend:    pointer to null at end of string */
2877 /* strbeg:    real beginning of string */
2878 /* minend:    end of match must be >= minend bytes after stringarg. */
2879 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2880  *            itself is accessed via the pointers above */
2881 /* data:      May be used for some additional optimizations.
2882               Currently unused. */
2883 /* flags:     For optimizations. See REXEC_* in regexp.h */
2884
2885 {
2886     struct regexp *const prog = ReANY(rx);
2887     char *s;
2888     regnode *c;
2889     char *startpos;
2890     SSize_t minlen;             /* must match at least this many chars */
2891     SSize_t dontbother = 0;     /* how many characters not to try at end */
2892     const bool utf8_target = cBOOL(DO_UTF8(sv));
2893     I32 multiline;
2894     RXi_GET_DECL(prog,progi);
2895     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2896     regmatch_info *const reginfo = &reginfo_buf;
2897     regexp_paren_pair *swap = NULL;
2898     I32 oldsave;
2899     GET_RE_DEBUG_FLAGS_DECL;
2900
2901     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2902     PERL_UNUSED_ARG(data);
2903
2904     /* Be paranoid... */
2905     if (prog == NULL) {
2906         Perl_croak(aTHX_ "NULL regexp parameter");
2907     }
2908
2909     DEBUG_EXECUTE_r(
2910         debug_start_match(rx, utf8_target, stringarg, strend,
2911         "Matching");
2912     );
2913
2914     startpos = stringarg;
2915
2916     /* set these early as they may be used by the HOP macros below */
2917     reginfo->strbeg = strbeg;
2918     reginfo->strend = strend;
2919     reginfo->is_utf8_target = cBOOL(utf8_target);
2920
2921     if (prog->intflags & PREGf_GPOS_SEEN) {
2922         MAGIC *mg;
2923
2924         /* set reginfo->ganch, the position where \G can match */
2925
2926         reginfo->ganch =
2927             (flags & REXEC_IGNOREPOS)
2928             ? stringarg /* use start pos rather than pos() */
2929             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2930               /* Defined pos(): */
2931             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2932             : strbeg; /* pos() not defined; use start of string */
2933
2934         DEBUG_GPOS_r(Perl_re_printf( aTHX_
2935             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2936
2937         /* in the presence of \G, we may need to start looking earlier in
2938          * the string than the suggested start point of stringarg:
2939          * if prog->gofs is set, then that's a known, fixed minimum
2940          * offset, such as
2941          * /..\G/:   gofs = 2
2942          * /ab|c\G/: gofs = 1
2943          * or if the minimum offset isn't known, then we have to go back
2944          * to the start of the string, e.g. /w+\G/
2945          */
2946
2947         if (prog->intflags & PREGf_ANCH_GPOS) {
2948             if (prog->gofs) {
2949                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2950                 if (!startpos ||
2951                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2952                 {
2953                     DEBUG_r(Perl_re_printf( aTHX_
2954                             "fail: ganch-gofs before earliest possible start\n"));
2955                     return 0;
2956                 }
2957             }
2958             else
2959                 startpos = reginfo->ganch;
2960         }
2961         else if (prog->gofs) {
2962             startpos = HOPBACKc(startpos, prog->gofs);
2963             if (!startpos)
2964                 startpos = strbeg;
2965         }
2966         else if (prog->intflags & PREGf_GPOS_FLOAT)
2967             startpos = strbeg;
2968     }
2969
2970     minlen = prog->minlen;
2971     if ((startpos + minlen) > strend || startpos < strbeg) {
2972         DEBUG_r(Perl_re_printf( aTHX_
2973                     "Regex match can't succeed, so not even tried\n"));
2974         return 0;
2975     }
2976
2977     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2978      * which will call destuctors to reset PL_regmatch_state, free higher
2979      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2980      * regmatch_info_aux_eval */
2981
2982     oldsave = PL_savestack_ix;
2983
2984     s = startpos;
2985
2986     if ((prog->extflags & RXf_USE_INTUIT)
2987         && !(flags & REXEC_CHECKED))
2988     {
2989         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2990                                     flags, NULL);
2991         if (!s)
2992             return 0;
2993
2994         if (prog->extflags & RXf_CHECK_ALL) {
2995             /* we can match based purely on the result of INTUIT.
2996              * Set up captures etc just for $& and $-[0]
2997              * (an intuit-only match wont have $1,$2,..) */
2998             assert(!prog->nparens);
2999
3000             /* s/// doesn't like it if $& is earlier than where we asked it to
3001              * start searching (which can happen on something like /.\G/) */
3002             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3003                     && (s < stringarg))
3004             {
3005                 /* this should only be possible under \G */
3006                 assert(prog->intflags & PREGf_GPOS_SEEN);
3007                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3008                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3009                 goto phooey;
3010             }
3011
3012             /* match via INTUIT shouldn't have any captures.
3013              * Let @-, @+, $^N know */
3014             prog->lastparen = prog->lastcloseparen = 0;
3015             RX_MATCH_UTF8_set(rx, utf8_target);
3016             prog->offs[0].start = s - strbeg;
3017             prog->offs[0].end = utf8_target
3018                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3019                 : s - strbeg + prog->minlenret;
3020             if ( !(flags & REXEC_NOT_FIRST) )
3021                 S_reg_set_capture_string(aTHX_ rx,
3022                                         strbeg, strend,
3023                                         sv, flags, utf8_target);
3024
3025             return 1;
3026         }
3027     }
3028
3029     multiline = prog->extflags & RXf_PMf_MULTILINE;
3030
3031     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3032         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3033                               "String too short [regexec_flags]...\n"));
3034         goto phooey;
3035     }
3036
3037     /* Check validity of program. */
3038     if (UCHARAT(progi->program) != REG_MAGIC) {
3039         Perl_croak(aTHX_ "corrupted regexp program");
3040     }
3041
3042     RX_MATCH_TAINTED_off(rx);
3043     RX_MATCH_UTF8_set(rx, utf8_target);
3044
3045     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3046     reginfo->intuit = 0;
3047     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3048     reginfo->warned = FALSE;
3049     reginfo->sv = sv;
3050     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3051     /* see how far we have to get to not match where we matched before */
3052     reginfo->till = stringarg + minend;
3053
3054     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3055         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3056            S_cleanup_regmatch_info_aux has executed (registered by
3057            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3058            magic belonging to this SV.
3059            Not newSVsv, either, as it does not COW.
3060         */
3061         reginfo->sv = newSV(0);
3062         SvSetSV_nosteal(reginfo->sv, sv);
3063         SAVEFREESV(reginfo->sv);
3064     }
3065
3066     /* reserve next 2 or 3 slots in PL_regmatch_state:
3067      * slot N+0: may currently be in use: skip it
3068      * slot N+1: use for regmatch_info_aux struct
3069      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3070      * slot N+3: ready for use by regmatch()
3071      */
3072
3073     {
3074         regmatch_state *old_regmatch_state;
3075         regmatch_slab  *old_regmatch_slab;
3076         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3077
3078         /* on first ever match, allocate first slab */
3079         if (!PL_regmatch_slab) {
3080             Newx(PL_regmatch_slab, 1, regmatch_slab);
3081             PL_regmatch_slab->prev = NULL;
3082             PL_regmatch_slab->next = NULL;
3083             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3084         }
3085
3086         old_regmatch_state = PL_regmatch_state;
3087         old_regmatch_slab  = PL_regmatch_slab;
3088
3089         for (i=0; i <= max; i++) {
3090             if (i == 1)
3091                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3092             else if (i ==2)
3093                 reginfo->info_aux_eval =
3094                 reginfo->info_aux->info_aux_eval =
3095                             &(PL_regmatch_state->u.info_aux_eval);
3096
3097             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3098                 PL_regmatch_state = S_push_slab(aTHX);
3099         }
3100
3101         /* note initial PL_regmatch_state position; at end of match we'll
3102          * pop back to there and free any higher slabs */
3103
3104         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3105         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3106         reginfo->info_aux->poscache = NULL;
3107
3108         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3109
3110         if ((prog->extflags & RXf_EVAL_SEEN))
3111             S_setup_eval_state(aTHX_ reginfo);
3112         else
3113             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3114     }
3115
3116     /* If there is a "must appear" string, look for it. */
3117
3118     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3119         /* We have to be careful. If the previous successful match
3120            was from this regex we don't want a subsequent partially
3121            successful match to clobber the old results.
3122            So when we detect this possibility we add a swap buffer
3123            to the re, and switch the buffer each match. If we fail,
3124            we switch it back; otherwise we leave it swapped.
3125         */
3126         swap = prog->offs;
3127         /* do we need a save destructor here for eval dies? */
3128         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3129         DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
3130             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3131             PTR2UV(prog),
3132             PTR2UV(swap),
3133             PTR2UV(prog->offs)
3134         ));
3135     }
3136
3137     if (prog->recurse_locinput)
3138         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3139
3140     /* Simplest case: anchored match need be tried only once, or with
3141      * MBOL, only at the beginning of each line.
3142      *
3143      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3144      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3145      * match at the start of the string then it won't match anywhere else
3146      * either; while with /.*.../, if it doesn't match at the beginning,
3147      * the earliest it could match is at the start of the next line */
3148
3149     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3150         char *end;
3151
3152         if (regtry(reginfo, &s))
3153             goto got_it;
3154
3155         if (!(prog->intflags & PREGf_ANCH_MBOL))
3156             goto phooey;
3157
3158         /* didn't match at start, try at other newline positions */
3159
3160         if (minlen)
3161             dontbother = minlen - 1;
3162         end = HOP3c(strend, -dontbother, strbeg) - 1;
3163
3164         /* skip to next newline */
3165
3166         while (s <= end) { /* note it could be possible to match at the end of the string */
3167             /* NB: newlines are the same in unicode as they are in latin */
3168             if (*s++ != '\n')
3169                 continue;
3170             if (prog->check_substr || prog->check_utf8) {
3171             /* note that with PREGf_IMPLICIT, intuit can only fail
3172              * or return the start position, so it's of limited utility.
3173              * Nevertheless, I made the decision that the potential for
3174              * quick fail was still worth it - DAPM */
3175                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3176                 if (!s)
3177                     goto phooey;
3178             }
3179             if (regtry(reginfo, &s))
3180                 goto got_it;
3181         }
3182         goto phooey;
3183     } /* end anchored search */
3184
3185     if (prog->intflags & PREGf_ANCH_GPOS)
3186     {
3187         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3188         assert(prog->intflags & PREGf_GPOS_SEEN);
3189         /* For anchored \G, the only position it can match from is
3190          * (ganch-gofs); we already set startpos to this above; if intuit
3191          * moved us on from there, we can't possibly succeed */
3192         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3193         if (s == startpos && regtry(reginfo, &s))
3194             goto got_it;
3195         goto phooey;
3196     }
3197
3198     /* Messy cases:  unanchored match. */
3199     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3200         /* we have /x+whatever/ */
3201         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3202         char ch;
3203 #ifdef DEBUGGING
3204         int did_match = 0;
3205 #endif
3206         if (utf8_target) {
3207             if (! prog->anchored_utf8) {
3208                 to_utf8_substr(prog);
3209             }
3210             ch = SvPVX_const(prog->anchored_utf8)[0];
3211             REXEC_FBC_SCAN(
3212                 if (*s == ch) {
3213                     DEBUG_EXECUTE_r( did_match = 1 );
3214                     if (regtry(reginfo, &s)) goto got_it;
3215                     s += UTF8SKIP(s);
3216                     while (s < strend && *s == ch)
3217                         s += UTF8SKIP(s);
3218                 }
3219             );
3220
3221         }
3222         else {
3223             if (! prog->anchored_substr) {
3224                 if (! to_byte_substr(prog)) {
3225                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3226                 }
3227             }
3228             ch = SvPVX_const(prog->anchored_substr)[0];
3229             REXEC_FBC_SCAN(
3230                 if (*s == ch) {
3231                     DEBUG_EXECUTE_r( did_match = 1 );
3232                     if (regtry(reginfo, &s)) goto got_it;
3233                     s++;
3234                     while (s < strend && *s == ch)
3235                         s++;
3236                 }
3237             );
3238         }
3239         DEBUG_EXECUTE_r(if (!did_match)
3240                 Perl_re_printf( aTHX_
3241                                   "Did not find anchored character...\n")
3242                );
3243     }
3244     else if (prog->anchored_substr != NULL
3245               || prog->anchored_utf8 != NULL
3246               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3247                   && prog->float_max_offset < strend - s)) {
3248         SV *must;
3249         SSize_t back_max;
3250         SSize_t back_min;
3251         char *last;
3252         char *last1;            /* Last position checked before */
3253 #ifdef DEBUGGING
3254         int did_match = 0;
3255 #endif
3256         if (prog->anchored_substr || prog->anchored_utf8) {
3257             if (utf8_target) {
3258                 if (! prog->anchored_utf8) {
3259                     to_utf8_substr(prog);
3260                 }
3261                 must = prog->anchored_utf8;
3262             }
3263             else {
3264                 if (! prog->anchored_substr) {
3265                     if (! to_byte_substr(prog)) {
3266                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3267                     }
3268                 }
3269                 must = prog->anchored_substr;
3270             }
3271             back_max = back_min = prog->anchored_offset;
3272         } else {
3273             if (utf8_target) {
3274                 if (! prog->float_utf8) {
3275                     to_utf8_substr(prog);
3276                 }
3277                 must = prog->float_utf8;
3278             }
3279             else {
3280                 if (! prog->float_substr) {
3281                     if (! to_byte_substr(prog)) {
3282                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3283                     }
3284                 }
3285                 must = prog->float_substr;
3286             }
3287             back_max = prog->float_max_offset;
3288             back_min = prog->float_min_offset;
3289         }
3290
3291         if (back_min<0) {
3292             last = strend;
3293         } else {
3294             last = HOP3c(strend,        /* Cannot start after this */
3295                   -(SSize_t)(CHR_SVLEN(must)
3296                          - (SvTAIL(must) != 0) + back_min), strbeg);
3297         }
3298         if (s > reginfo->strbeg)
3299             last1 = HOPc(s, -1);
3300         else
3301             last1 = s - 1;      /* bogus */
3302
3303         /* XXXX check_substr already used to find "s", can optimize if
3304            check_substr==must. */
3305         dontbother = 0;
3306         strend = HOPc(strend, -dontbother);
3307         while ( (s <= last) &&
3308                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3309                                   (unsigned char*)strend, must,
3310                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3311             DEBUG_EXECUTE_r( did_match = 1 );
3312             if (HOPc(s, -back_max) > last1) {
3313                 last1 = HOPc(s, -back_min);
3314                 s = HOPc(s, -back_max);
3315             }
3316             else {
3317                 char * const t = (last1 >= reginfo->strbeg)
3318                                     ? HOPc(last1, 1) : last1 + 1;
3319
3320                 last1 = HOPc(s, -back_min);
3321                 s = t;
3322             }
3323             if (utf8_target) {
3324                 while (s <= last1) {
3325                     if (regtry(reginfo, &s))
3326                         goto got_it;
3327                     if (s >= last1) {
3328                         s++; /* to break out of outer loop */
3329                         break;
3330                     }
3331                     s += UTF8SKIP(s);
3332                 }
3333             }
3334             else {
3335                 while (s <= last1) {
3336                     if (regtry(reginfo, &s))
3337                         goto got_it;
3338                     s++;
3339                 }
3340             }
3341         }
3342         DEBUG_EXECUTE_r(if (!did_match) {
3343             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3344                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3345             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3346                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3347                                ? "anchored" : "floating"),
3348                 quoted, RE_SV_TAIL(must));
3349         });
3350         goto phooey;
3351     }
3352     else if ( (c = progi->regstclass) ) {
3353         if (minlen) {
3354             const OPCODE op = OP(progi->regstclass);
3355             /* don't bother with what can't match */
3356             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3357                 strend = HOPc(strend, -(minlen - 1));
3358         }
3359         DEBUG_EXECUTE_r({
3360             SV * const prop = sv_newmortal();
3361             regprop(prog, prop, c, reginfo, NULL);
3362             {
3363                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3364                     s,strend-s,60);
3365                 Perl_re_printf( aTHX_
3366                     "Matching stclass %.*s against %s (%d bytes)\n",
3367                     (int)SvCUR(prop), SvPVX_const(prop),
3368                      quoted, (int)(strend - s));
3369             }
3370         });
3371         if (find_byclass(prog, c, s, strend, reginfo))
3372             goto got_it;
3373         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3374     }
3375     else {
3376         dontbother = 0;
3377         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3378             /* Trim the end. */
3379             char *last= NULL;
3380             SV* float_real;
3381             STRLEN len;
3382             const char *little;
3383
3384             if (utf8_target) {
3385                 if (! prog->float_utf8) {
3386                     to_utf8_substr(prog);
3387                 }
3388                 float_real = prog->float_utf8;
3389             }
3390             else {
3391                 if (! prog->float_substr) {
3392                     if (! to_byte_substr(prog)) {
3393                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3394                     }
3395                 }
3396                 float_real = prog->float_substr;
3397             }
3398
3399             little = SvPV_const(float_real, len);
3400             if (SvTAIL(float_real)) {
3401                     /* This means that float_real contains an artificial \n on
3402                      * the end due to the presence of something like this:
3403                      * /foo$/ where we can match both "foo" and "foo\n" at the
3404                      * end of the string.  So we have to compare the end of the
3405                      * string first against the float_real without the \n and
3406                      * then against the full float_real with the string.  We
3407                      * have to watch out for cases where the string might be
3408                      * smaller than the float_real or the float_real without
3409                      * the \n. */
3410                     char *checkpos= strend - len;
3411                     DEBUG_OPTIMISE_r(
3412                         Perl_re_printf( aTHX_
3413                             "%sChecking for float_real.%s\n",
3414                             PL_colors[4], PL_colors[5]));
3415                     if (checkpos + 1 < strbeg) {
3416                         /* can't match, even if we remove the trailing \n
3417                          * string is too short to match */
3418                         DEBUG_EXECUTE_r(
3419                             Perl_re_printf( aTHX_
3420                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3421                                 PL_colors[4], PL_colors[5]));
3422                         goto phooey;
3423                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3424                         /* can match, the end of the string matches without the
3425                          * "\n" */
3426                         last = checkpos + 1;
3427                     } else if (checkpos < strbeg) {
3428                         /* cant match, string is too short when the "\n" is
3429                          * included */
3430                         DEBUG_EXECUTE_r(
3431                             Perl_re_printf( aTHX_
3432                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3433                                 PL_colors[4], PL_colors[5]));
3434                         goto phooey;
3435                     } else if (!multiline) {
3436                         /* non multiline match, so compare with the "\n" at the
3437                          * end of the string */
3438                         if (memEQ(checkpos, little, len)) {
3439                             last= checkpos;
3440                         } else {
3441                             DEBUG_EXECUTE_r(
3442                                 Perl_re_printf( aTHX_
3443                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3444                                     PL_colors[4], PL_colors[5]));
3445                             goto phooey;
3446                         }
3447                     } else {
3448                         /* multiline match, so we have to search for a place
3449                          * where the full string is located */
3450                         goto find_last;
3451                     }
3452             } else {
3453                   find_last:
3454                     if (len)
3455                         last = rninstr(s, strend, little, little + len);
3456                     else
3457                         last = strend;  /* matching "$" */
3458             }
3459             if (!last) {
3460                 /* at one point this block contained a comment which was
3461                  * probably incorrect, which said that this was a "should not
3462                  * happen" case.  Even if it was true when it was written I am
3463                  * pretty sure it is not anymore, so I have removed the comment
3464                  * and replaced it with this one. Yves */
3465                 DEBUG_EXECUTE_r(
3466                     Perl_re_printf( aTHX_
3467                         "%sString does not contain required substring, cannot match.%s\n",
3468                         PL_colors[4], PL_colors[5]
3469                     ));
3470                 goto phooey;
3471             }
3472             dontbother = strend - last + prog->float_min_offset;
3473         }
3474         if (minlen && (dontbother < minlen))
3475             dontbother = minlen - 1;
3476         strend -= dontbother;              /* this one's always in bytes! */
3477         /* We don't know much -- general case. */
3478         if (utf8_target) {
3479             for (;;) {
3480                 if (regtry(reginfo, &s))
3481                     goto got_it;
3482                 if (s >= strend)
3483                     break;
3484                 s += UTF8SKIP(s);
3485             };
3486         }
3487         else {
3488             do {
3489                 if (regtry(reginfo, &s))
3490                     goto got_it;
3491             } while (s++ < strend);
3492         }
3493     }
3494
3495     /* Failure. */
3496     goto phooey;
3497
3498   got_it:
3499     /* s/// doesn't like it if $& is earlier than where we asked it to
3500      * start searching (which can happen on something like /.\G/) */
3501     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3502             && (prog->offs[0].start < stringarg - strbeg))
3503     {
3504         /* this should only be possible under \G */
3505         assert(prog->intflags & PREGf_GPOS_SEEN);
3506         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3507             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3508         goto phooey;
3509     }
3510
3511     DEBUG_BUFFERS_r(
3512         if (swap)
3513             Perl_re_printf( aTHX_
3514                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3515                 PTR2UV(prog),
3516                 PTR2UV(swap)
3517             );
3518     );
3519     Safefree(swap);
3520
3521     /* clean up; this will trigger destructors that will free all slabs
3522      * above the current one, and cleanup the regmatch_info_aux
3523      * and regmatch_info_aux_eval sructs */
3524
3525     LEAVE_SCOPE(oldsave);
3526
3527     if (RXp_PAREN_NAMES(prog))
3528         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3529
3530     /* make sure $`, $&, $', and $digit will work later */
3531     if ( !(flags & REXEC_NOT_FIRST) )
3532         S_reg_set_capture_string(aTHX_ rx,
3533                                     strbeg, reginfo->strend,
3534                                     sv, flags, utf8_target);
3535
3536     return 1;
3537
3538   phooey:
3539     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3540                           PL_colors[4], PL_colors[5]));
3541
3542     /* clean up; this will trigger destructors that will free all slabs
3543      * above the current one, and cleanup the regmatch_info_aux
3544      * and regmatch_info_aux_eval sructs */
3545
3546     LEAVE_SCOPE(oldsave);
3547
3548     if (swap) {
3549         /* we failed :-( roll it back */
3550         DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
3551             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3552             PTR2UV(prog),
3553             PTR2UV(prog->offs),
3554             PTR2UV(swap)
3555         ));
3556         Safefree(prog->offs);
3557         prog->offs = swap;
3558     }
3559     return 0;
3560 }
3561
3562
3563 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3564  * Do inc before dec, in case old and new rex are the same */
3565 #define SET_reg_curpm(Re2)                          \
3566     if (reginfo->info_aux_eval) {                   \
3567         (void)ReREFCNT_inc(Re2);                    \
3568         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3569         PM_SETRE((PL_reg_curpm), (Re2));            \
3570     }
3571
3572
3573 /*
3574  - regtry - try match at specific point
3575  */
3576 STATIC bool                     /* 0 failure, 1 success */
3577 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3578 {
3579     CHECKPOINT lastcp;
3580     REGEXP *const rx = reginfo->prog;
3581     regexp *const prog = ReANY(rx);
3582     SSize_t result;
3583 #ifdef DEBUGGING
3584     U32 depth = 0; /* used by REGCP_SET */
3585 #endif
3586     RXi_GET_DECL(prog,progi);
3587     GET_RE_DEBUG_FLAGS_DECL;
3588
3589     PERL_ARGS_ASSERT_REGTRY;
3590
3591     reginfo->cutpoint=NULL;
3592
3593     prog->offs[0].start = *startposp - reginfo->strbeg;
3594     prog->lastparen = 0;
3595     prog->lastcloseparen = 0;
3596
3597     /* XXXX What this code is doing here?!!!  There should be no need
3598        to do this again and again, prog->lastparen should take care of
3599        this!  --ilya*/
3600
3601     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3602      * Actually, the code in regcppop() (which Ilya may be meaning by
3603      * prog->lastparen), is not needed at all by the test suite
3604      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3605      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3606      * Meanwhile, this code *is* needed for the
3607      * above-mentioned test suite tests to succeed.  The common theme
3608      * on those tests seems to be returning null fields from matches.
3609      * --jhi updated by dapm */
3610 #if 1
3611     if (prog->nparens) {
3612         regexp_paren_pair *pp = prog->offs;
3613         I32 i;
3614         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3615             ++pp;
3616             pp->start = -1;
3617             pp->end = -1;
3618         }
3619     }
3620 #endif
3621     REGCP_SET(lastcp);
3622     result = regmatch(reginfo, *startposp, progi->program + 1);
3623     if (result != -1) {
3624         prog->offs[0].end = result;
3625         return 1;
3626     }
3627     if (reginfo->cutpoint)
3628         *startposp= reginfo->cutpoint;
3629     REGCP_UNWIND(lastcp);
3630     return 0;
3631 }
3632
3633
3634 #define sayYES goto yes
3635 #define sayNO goto no
3636 #define sayNO_SILENT goto no_silent
3637
3638 /* we dont use STMT_START/END here because it leads to
3639    "unreachable code" warnings, which are bogus, but distracting. */
3640 #define CACHEsayNO \
3641     if (ST.cache_mask) \
3642        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3643     sayNO
3644
3645 /* this is used to determine how far from the left messages like
3646    'failed...' are printed in regexec.c. It should be set such that
3647    messages are inline with the regop output that created them.
3648 */
3649 #define REPORT_CODE_OFF 29
3650 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3651 #ifdef DEBUGGING
3652 int
3653 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3654 {
3655     va_list ap;
3656     int result;
3657     PerlIO *f= Perl_debug_log;
3658     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3659     va_start(ap, depth);
3660     PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3661     result = PerlIO_vprintf(f, fmt, ap);
3662     va_end(ap);
3663     return result;
3664 }
3665 #endif /* DEBUGGING */
3666
3667
3668 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3669 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3670 #define CHRTEST_NOT_A_CP_1 -999
3671 #define CHRTEST_NOT_A_CP_2 -998
3672
3673 /* grab a new slab and return the first slot in it */
3674
3675 STATIC regmatch_state *
3676 S_push_slab(pTHX)
3677 {
3678 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3679     dMY_CXT;
3680 #endif
3681     regmatch_slab *s = PL_regmatch_slab->next;
3682     if (!s) {
3683         Newx(s, 1, regmatch_slab);
3684         s->prev = PL_regmatch_slab;
3685         s->next = NULL;
3686         PL_regmatch_slab->next = s;
3687     }
3688     PL_regmatch_slab = s;
3689     return SLAB_FIRST(s);
3690 }
3691
3692
3693 /* push a new state then goto it */
3694
3695 #define PUSH_STATE_GOTO(state, node, input) \
3696     pushinput = input; \
3697     scan = node; \
3698     st->resume_state = state; \
3699     goto push_state;
3700
3701 /* push a new state with success backtracking, then goto it */
3702
3703 #define PUSH_YES_STATE_GOTO(state, node, input) \
3704     pushinput = input; \
3705     scan = node; \
3706     st->resume_state = state; \
3707     goto push_yes_state;
3708
3709
3710
3711
3712 /*
3713
3714 regmatch() - main matching routine
3715
3716 This is basically one big switch statement in a loop. We execute an op,
3717 set 'next' to point the next op, and continue. If we come to a point which
3718 we may need to backtrack to on failure such as (A|B|C), we push a
3719 backtrack state onto the backtrack stack. On failure, we pop the top
3720 state, and re-enter the loop at the state indicated. If there are no more
3721 states to pop, we return failure.
3722
3723 Sometimes we also need to backtrack on success; for example /A+/, where
3724 after successfully matching one A, we need to go back and try to
3725 match another one; similarly for lookahead assertions: if the assertion
3726 completes successfully, we backtrack to the state just before the assertion
3727 and then carry on.  In these cases, the pushed state is marked as
3728 'backtrack on success too'. This marking is in fact done by a chain of
3729 pointers, each pointing to the previous 'yes' state. On success, we pop to
3730 the nearest yes state, discarding any intermediate failure-only states.
3731 Sometimes a yes state is pushed just to force some cleanup code to be
3732 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3733 it to free the inner regex.
3734
3735 Note that failure backtracking rewinds the cursor position, while
3736 success backtracking leaves it alone.
3737
3738 A pattern is complete when the END op is executed, while a subpattern
3739 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3740 ops trigger the "pop to last yes state if any, otherwise return true"
3741 behaviour.
3742
3743 A common convention in this function is to use A and B to refer to the two
3744 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3745 the subpattern to be matched possibly multiple times, while B is the entire
3746 rest of the pattern. Variable and state names reflect this convention.
3747
3748 The states in the main switch are the union of ops and failure/success of
3749 substates associated with with that op.  For example, IFMATCH is the op
3750 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3751 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3752 successfully matched A and IFMATCH_A_fail is a state saying that we have
3753 just failed to match A. Resume states always come in pairs. The backtrack
3754 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3755 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3756 on success or failure.
3757
3758 The struct that holds a backtracking state is actually a big union, with
3759 one variant for each major type of op. The variable st points to the
3760 top-most backtrack struct. To make the code clearer, within each
3761 block of code we #define ST to alias the relevant union.
3762
3763 Here's a concrete example of a (vastly oversimplified) IFMATCH
3764 implementation:
3765
3766     switch (state) {
3767     ....
3768
3769 #define ST st->u.ifmatch
3770
3771     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3772         ST.foo = ...; // some state we wish to save
3773         ...
3774         // push a yes backtrack state with a resume value of
3775         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3776         // first node of A:
3777         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3778         // NOTREACHED
3779
3780     case IFMATCH_A: // we have successfully executed A; now continue with B
3781         next = B;
3782         bar = ST.foo; // do something with the preserved value
3783         break;
3784
3785     case IFMATCH_A_fail: // A failed, so the assertion failed
3786         ...;   // do some housekeeping, then ...
3787         sayNO; // propagate the failure
3788
3789 #undef ST
3790
3791     ...
3792     }
3793
3794 For any old-timers reading this who are familiar with the old recursive
3795 approach, the code above is equivalent to:
3796
3797     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3798     {
3799         int foo = ...
3800         ...
3801         if (regmatch(A)) {
3802             next = B;
3803             bar = foo;
3804             break;
3805         }
3806         ...;   // do some housekeeping, then ...
3807         sayNO; // propagate the failure
3808     }
3809
3810 The topmost backtrack state, pointed to by st, is usually free. If you
3811 want to claim it, populate any ST.foo fields in it with values you wish to
3812 save, then do one of
3813
3814         PUSH_STATE_GOTO(resume_state, node, newinput);
3815         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3816
3817 which sets that backtrack state's resume value to 'resume_state', pushes a
3818 new free entry to the top of the backtrack stack, then goes to 'node'.
3819 On backtracking, the free slot is popped, and the saved state becomes the
3820 new free state. An ST.foo field in this new top state can be temporarily
3821 accessed to retrieve values, but once the main loop is re-entered, it
3822 becomes available for reuse.
3823
3824 Note that the depth of the backtrack stack constantly increases during the
3825 left-to-right execution of the pattern, rather than going up and down with
3826 the pattern nesting. For example the stack is at its maximum at Z at the
3827 end of the pattern, rather than at X in the following:
3828
3829     /(((X)+)+)+....(Y)+....Z/
3830
3831 The only exceptions to this are lookahead/behind assertions and the cut,
3832 (?>A), which pop all the backtrack states associated with A before
3833 continuing.
3834
3835 Backtrack state structs are allocated in slabs of about 4K in size.
3836 PL_regmatch_state and st always point to the currently active state,
3837 and PL_regmatch_slab points to the slab currently containing
3838 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3839 allocated, and is never freed until interpreter destruction. When the slab
3840 is full, a new one is allocated and chained to the end. At exit from
3841 regmatch(), slabs allocated since entry are freed.
3842
3843 */
3844
3845
3846 #define DEBUG_STATE_pp(pp)                                  \
3847     DEBUG_STATE_r({                                         \
3848         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
3849         Perl_re_printf( aTHX_                                           \
3850             "%*s" pp " %s%s%s%s%s\n",                       \
3851             INDENT_CHARS(depth), "",                        \
3852             PL_reg_name[st->resume_state],                  \
3853             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3854             ((st==yes_state) ? "Y" : ""),                   \
3855             ((st==mark_state) ? "M" : ""),                  \
3856             ((st==yes_state||st==mark_state) ? "]" : "")    \
3857         );                                                  \
3858     });
3859
3860
3861 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3862
3863 #ifdef DEBUGGING
3864
3865 STATIC void
3866 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3867     const char *start, const char *end, const char *blurb)
3868 {
3869     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3870
3871     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3872
3873     if (!PL_colorset)
3874             reginitcolors();
3875     {
3876         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3877             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3878
3879         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3880             start, end - start, 60);
3881
3882         Perl_re_printf( aTHX_
3883             "%s%s REx%s %s against %s\n",
3884                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3885
3886         if (utf8_target||utf8_pat)
3887             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
3888                 utf8_pat ? "pattern" : "",
3889                 utf8_pat && utf8_target ? " and " : "",
3890                 utf8_target ? "string" : ""
3891             );
3892     }
3893 }
3894
3895 STATIC void
3896 S_dump_exec_pos(pTHX_ const char *locinput,
3897                       const regnode *scan,
3898                       const char *loc_regeol,
3899                       const char *loc_bostr,
3900                       const char *loc_reg_starttry,
3901                       const bool utf8_target,
3902                       const U32 depth
3903                 )
3904 {
3905     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3906     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3907     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3908     /* The part of the string before starttry has one color
3909        (pref0_len chars), between starttry and current
3910        position another one (pref_len - pref0_len chars),
3911        after the current position the third one.
3912        We assume that pref0_len <= pref_len, otherwise we
3913        decrease pref0_len.  */
3914     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3915         ? (5 + taill) - l : locinput - loc_bostr;
3916     int pref0_len;
3917
3918     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3919
3920     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3921         pref_len++;
3922     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3923     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3924         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3925               ? (5 + taill) - pref_len : loc_regeol - locinput);
3926     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3927         l--;
3928     if (pref0_len < 0)
3929         pref0_len = 0;
3930     if (pref0_len > pref_len)
3931         pref0_len = pref_len;
3932     {
3933         const int is_uni = utf8_target ? 1 : 0;
3934
3935         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3936             (locinput - pref_len),pref0_len, 60, 4, 5);
3937
3938         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3939                     (locinput - pref_len + pref0_len),
3940                     pref_len - pref0_len, 60, 2, 3);
3941
3942         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3943                     locinput, loc_regeol - locinput, 10, 0, 1);
3944
3945         const STRLEN tlen=len0+len1+len2;
3946         Perl_re_printf( aTHX_
3947                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|%4u| ",
3948                     (IV)(locinput - loc_bostr),
3949                     len0, s0,
3950                     len1, s1,
3951                     (docolor ? "" : "> <"),
3952                     len2, s2,
3953                     (int)(tlen > 19 ? 0 :  19 - tlen),
3954                     "",
3955                     depth);
3956     }
3957 }
3958
3959 #endif
3960
3961 /* reg_check_named_buff_matched()
3962  * Checks to see if a named buffer has matched. The data array of
3963  * buffer numbers corresponding to the buffer is expected to reside
3964  * in the regexp->data->data array in the slot stored in the ARG() of
3965  * node involved. Note that this routine doesn't actually care about the
3966  * name, that information is not preserved from compilation to execution.
3967  * Returns the index of the leftmost defined buffer with the given name
3968  * or 0 if non of the buffers matched.
3969  */
3970 STATIC I32
3971 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3972 {
3973     I32 n;
3974     RXi_GET_DECL(rex,rexi);
3975     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3976     I32 *nums=(I32*)SvPVX(sv_dat);
3977
3978     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3979
3980     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3981         if ((I32)rex->lastparen >= nums[n] &&
3982             rex->offs[nums[n]].end != -1)
3983         {
3984             return nums[n];
3985         }
3986     }
3987     return 0;
3988 }
3989
3990
3991 static bool
3992 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3993         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3994 {
3995     /* This function determines if there are one or two characters that match
3996      * the first character of the passed-in EXACTish node <text_node>, and if
3997      * so, returns them in the passed-in pointers.
3998      *
3999      * If it determines that no possible character in the target string can
4000      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4001      * the first character in <text_node> requires UTF-8 to represent, and the
4002      * target string isn't in UTF-8.)
4003      *
4004      * If there are more than two characters that could match the beginning of
4005      * <text_node>, or if more context is required to determine a match or not,
4006      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4007      *
4008      * The motiviation behind this function is to allow the caller to set up
4009      * tight loops for matching.  If <text_node> is of type EXACT, there is
4010      * only one possible character that can match its first character, and so
4011      * the situation is quite simple.  But things get much more complicated if
4012      * folding is involved.  It may be that the first character of an EXACTFish
4013      * node doesn't participate in any possible fold, e.g., punctuation, so it
4014      * can be matched only by itself.  The vast majority of characters that are
4015      * in folds match just two things, their lower and upper-case equivalents.
4016      * But not all are like that; some have multiple possible matches, or match
4017      * sequences of more than one character.  This function sorts all that out.
4018      *
4019      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4020      * loop of trying to match A*, we know we can't exit where the thing
4021      * following it isn't a B.  And something can't be a B unless it is the
4022      * beginning of B.  By putting a quick test for that beginning in a tight
4023      * loop, we can rule out things that can't possibly be B without having to
4024      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4025      * character, we can make a tight loop matching A*, using the outputs of
4026      * this function.
4027      *
4028      * If the target string to match isn't in UTF-8, and there aren't
4029      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4030      * the one or two possible octets (which are characters in this situation)
4031      * that can match.  In all cases, if there is only one character that can
4032      * match, *<c1p> and *<c2p> will be identical.
4033      *
4034      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4035      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4036      * can match the beginning of <text_node>.  They should be declared with at
4037      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4038      * undefined what these contain.)  If one or both of the buffers are
4039      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4040      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4041      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4042      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4043      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4044
4045     const bool utf8_target = reginfo->is_utf8_target;
4046
4047     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4048     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4049     bool use_chrtest_void = FALSE;
4050     const bool is_utf8_pat = reginfo->is_utf8_pat;
4051
4052     /* Used when we have both utf8 input and utf8 output, to avoid converting
4053      * to/from code points */
4054     bool utf8_has_been_setup = FALSE;
4055
4056     dVAR;
4057
4058     U8 *pat = (U8*)STRING(text_node);
4059     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4060
4061     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4062
4063         /* In an exact node, only one thing can be matched, that first
4064          * character.  If both the pat and the target are UTF-8, we can just
4065          * copy the input to the output, avoiding finding the code point of
4066          * that character */
4067         if (!is_utf8_pat) {
4068             c2 = c1 = *pat;
4069         }
4070         else if (utf8_target) {
4071             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4072             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4073             utf8_has_been_setup = TRUE;
4074         }
4075         else {
4076             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4077         }
4078     }
4079     else { /* an EXACTFish node */
4080         U8 *pat_end = pat + STR_LEN(text_node);
4081
4082         /* An EXACTFL node has at least some characters unfolded, because what
4083          * they match is not known until now.  So, now is the time to fold
4084          * the first few of them, as many as are needed to determine 'c1' and
4085          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4086          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4087          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4088          * need to fold as many characters as a single character can fold to,
4089          * so that later we can check if the first ones are such a multi-char
4090          * fold.  But, in such a pattern only locale-problematic characters
4091          * aren't folded, so we can skip this completely if the first character
4092          * in the node isn't one of the tricky ones */
4093         if (OP(text_node) == EXACTFL) {
4094
4095             if (! is_utf8_pat) {
4096                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4097                 {
4098                     folded[0] = folded[1] = 's';
4099                     pat = folded;
4100                     pat_end = folded + 2;
4101                 }
4102             }
4103             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4104                 U8 *s = pat;
4105                 U8 *d = folded;
4106                 int i;
4107
4108                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4109                     if (isASCII(*s)) {
4110                         *(d++) = (U8) toFOLD_LC(*s);
4111                         s++;
4112                     }
4113                     else {
4114                         STRLEN len;
4115                         _to_utf8_fold_flags(s,
4116                                             d,
4117                                             &len,
4118                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4119                         d += len;
4120                         s += UTF8SKIP(s);
4121                     }
4122                 }
4123
4124                 pat = folded;
4125                 pat_end = d;
4126             }
4127         }
4128
4129         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4130              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4131         {
4132             /* Multi-character folds require more context to sort out.  Also
4133              * PL_utf8_foldclosures used below doesn't handle them, so have to
4134              * be handled outside this routine */
4135             use_chrtest_void = TRUE;
4136         }
4137         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4138             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4139             if (c1 > 255) {
4140                 /* Load the folds hash, if not already done */
4141                 SV** listp;
4142                 if (! PL_utf8_foldclosures) {
4143                     _load_PL_utf8_foldclosures();
4144                 }
4145
4146                 /* The fold closures data structure is a hash with the keys
4147                  * being the UTF-8 of every character that is folded to, like
4148                  * 'k', and the values each an array of all code points that
4149                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4150                  * Multi-character folds are not included */
4151                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4152                                         (char *) pat,
4153                                         UTF8SKIP(pat),
4154                                         FALSE))))
4155                 {
4156                     /* Not found in the hash, therefore there are no folds
4157                     * containing it, so there is only a single character that
4158                     * could match */
4159                     c2 = c1;
4160                 }
4161                 else {  /* Does participate in folds */
4162                     AV* list = (AV*) *listp;
4163                     if (av_tindex_nomg(list) != 1) {
4164
4165                         /* If there aren't exactly two folds to this, it is
4166                          * outside the scope of this function */
4167                         use_chrtest_void = TRUE;
4168                     }
4169                     else {  /* There are two.  Get them */
4170                         SV** c_p = av_fetch(list, 0, FALSE);
4171                         if (c_p == NULL) {
4172                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4173                         }
4174                         c1 = SvUV(*c_p);
4175
4176                         c_p = av_fetch(list, 1, FALSE);
4177                         if (c_p == NULL) {
4178                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4179                         }
4180                         c2 = SvUV(*c_p);
4181
4182                         /* Folds that cross the 255/256 boundary are forbidden
4183                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4184                          * one is ASCIII.  Since the pattern character is above
4185                          * 255, and its only other match is below 256, the only
4186                          * legal match will be to itself.  We have thrown away
4187                          * the original, so have to compute which is the one
4188                          * above 255. */
4189                         if ((c1 < 256) != (c2 < 256)) {
4190                             if ((OP(text_node) == EXACTFL
4191                                  && ! IN_UTF8_CTYPE_LOCALE)
4192                                 || ((OP(text_node) == EXACTFA
4193                                     || OP(text_node) == EXACTFA_NO_TRIE)
4194                                     && (isASCII(c1) || isASCII(c2))))
4195                             {
4196                                 if (c1 < 256) {
4197                                     c1 = c2;
4198                                 }
4199                                 else {
4200                                     c2 = c1;
4201                                 }
4202                             }
4203                         }
4204                     }
4205                 }
4206             }
4207             else /* Here, c1 is <= 255 */
4208                 if (utf8_target
4209                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4210                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4211                     && ((OP(text_node) != EXACTFA
4212                         && OP(text_node) != EXACTFA_NO_TRIE)
4213                         || ! isASCII(c1)))
4214             {
4215                 /* Here, there could be something above Latin1 in the target
4216                  * which folds to this character in the pattern.  All such
4217                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4218                  * than two characters involved in their folds, so are outside
4219                  * the scope of this function */
4220                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4221                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4222                 }
4223                 else {
4224                     use_chrtest_void = TRUE;
4225                 }
4226             }
4227             else { /* Here nothing above Latin1 can fold to the pattern
4228                       character */
4229                 switch (OP(text_node)) {
4230
4231                     case EXACTFL:   /* /l rules */
4232                         c2 = PL_fold_locale[c1];
4233                         break;
4234
4235                     case EXACTF:   /* This node only generated for non-utf8
4236                                     patterns */
4237                         assert(! is_utf8_pat);
4238                         if (! utf8_target) {    /* /d rules */
4239                             c2 = PL_fold[c1];
4240                             break;
4241                         }
4242                         /* FALLTHROUGH */
4243                         /* /u rules for all these.  This happens to work for
4244                         * EXACTFA as nothing in Latin1 folds to ASCII */
4245                     case EXACTFA_NO_TRIE:   /* This node only generated for
4246                                             non-utf8 patterns */
4247                         assert(! is_utf8_pat);
4248                         /* FALLTHROUGH */
4249                     case EXACTFA:
4250                     case EXACTFU_SS:
4251                     case EXACTFU:
4252                         c2 = PL_fold_latin1[c1];
4253                         break;
4254
4255                     default:
4256                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4257                         NOT_REACHED; /* NOTREACHED */
4258                 }
4259             }
4260         }
4261     }
4262
4263     /* Here have figured things out.  Set up the returns */
4264     if (use_chrtest_void) {
4265         *c2p = *c1p = CHRTEST_VOID;
4266     }
4267     else if (utf8_target) {
4268         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4269             uvchr_to_utf8(c1_utf8, c1);
4270             uvchr_to_utf8(c2_utf8, c2);
4271         }
4272
4273         /* Invariants are stored in both the utf8 and byte outputs; Use
4274          * negative numbers otherwise for the byte ones.  Make sure that the
4275          * byte ones are the same iff the utf8 ones are the same */
4276         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4277         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4278                 ? *c2_utf8
4279                 : (c1 == c2)
4280                   ? CHRTEST_NOT_A_CP_1
4281                   : CHRTEST_NOT_A_CP_2;
4282     }
4283     else if (c1 > 255) {
4284        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4285                            can represent */
4286            return FALSE;
4287        }
4288
4289        *c1p = *c2p = c2;    /* c2 is the only representable value */
4290     }
4291     else {  /* c1 is representable; see about c2 */
4292        *c1p = c1;
4293        *c2p = (c2 < 256) ? c2 : c1;
4294     }
4295
4296     return TRUE;
4297 }
4298
4299 STATIC bool
4300 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4301 {
4302     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4303      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4304
4305     PERL_ARGS_ASSERT_ISGCB;
4306
4307     switch (GCB_table[before][after]) {
4308         case GCB_BREAKABLE:
4309             return TRUE;
4310
4311         case GCB_NOBREAK:
4312             return FALSE;
4313
4314         case GCB_RI_then_RI:
4315             {
4316                 int RI_count = 1;
4317                 U8 * temp_pos = (U8 *) curpos;
4318
4319                 /* Do not break within emoji flag sequences. That is, do not
4320                  * break between regional indicator (RI) symbols if there is an
4321                  * odd number of RI characters before the break point.
4322                  *  GB12     ^ (RI RI)* RI × RI
4323                  *  GB13 [^RI] (RI RI)* RI × RI */
4324
4325                 while (backup_one_GCB(strbeg,
4326                                     &temp_pos,
4327                                     utf8_target) == GCB_Regional_Indicator)
4328                 {
4329                     RI_count++;
4330                 }
4331
4332                 return RI_count % 2 != 1;
4333             }
4334
4335         case GCB_EX_then_EM:
4336
4337             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4338             {
4339                 U8 * temp_pos = (U8 *) curpos;
4340                 GCB_enum prev;
4341
4342                 do {
4343                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4344                 }
4345                 while (prev == GCB_Extend);
4346
4347                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4348             }
4349
4350         default:
4351             break;
4352     }
4353
4354 #ifdef DEBUGGING
4355     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4356                                   before, after, GCB_table[before][after]);
4357     assert(0);
4358 #endif
4359     return TRUE;
4360 }
4361
4362 STATIC GCB_enum
4363 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4364 {
4365     GCB_enum gcb;
4366
4367     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4368
4369     if (*curpos < strbeg) {
4370         return GCB_EDGE;
4371     }
4372
4373     if (utf8_target) {
4374         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4375         U8 * prev_prev_char_pos;
4376
4377         if (! prev_char_pos) {
4378             return GCB_EDGE;
4379         }
4380
4381         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4382             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4383             *curpos = prev_char_pos;
4384             prev_char_pos = prev_prev_char_pos;
4385         }
4386         else {
4387             *curpos = (U8 *) strbeg;
4388             return GCB_EDGE;
4389         }
4390     }
4391     else {
4392         if (*curpos - 2 < strbeg) {
4393             *curpos = (U8 *) strbeg;
4394             return GCB_EDGE;
4395         }
4396         (*curpos)--;
4397         gcb = getGCB_VAL_CP(*(*curpos - 1));
4398     }
4399
4400     return gcb;
4401 }
4402
4403 /* Combining marks attach to most classes that precede them, but this defines
4404  * the exceptions (from TR14) */
4405 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4406                                      || prev == LB_Mandatory_Break      \
4407                                      || prev == LB_Carriage_Return      \
4408                                      || prev == LB_Line_Feed            \
4409                                      || prev == LB_Next_Line            \
4410                                      || prev == LB_Space                \
4411                                      || prev == LB_ZWSpace))
4412
4413 STATIC bool
4414 S_isLB(pTHX_ LB_enum before,
4415              LB_enum after,
4416              const U8 * const strbeg,
4417              const U8 * const curpos,
4418              const U8 * const strend,
4419              const bool utf8_target)
4420 {
4421     U8 * temp_pos = (U8 *) curpos;
4422     LB_enum prev = before;
4423
4424     /* Is the boundary between 'before' and 'after' line-breakable?
4425      * Most of this is just a table lookup of a generated table from Unicode
4426      * rules.  But some rules require context to decide, and so have to be
4427      * implemented in code */
4428
4429     PERL_ARGS_ASSERT_ISLB;
4430
4431     /* Rule numbers in the comments below are as of Unicode 9.0 */
4432
4433   redo:
4434     before = prev;
4435     switch (LB_table[before][after]) {
4436         case LB_BREAKABLE:
4437             return TRUE;
4438
4439         case LB_NOBREAK:
4440         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4441             return FALSE;
4442
4443         case LB_SP_foo + LB_BREAKABLE:
4444         case LB_SP_foo + LB_NOBREAK:
4445         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4446
4447             /* When we have something following a SP, we have to look at the
4448              * context in order to know what to do.
4449              *
4450              * SP SP should not reach here because LB7: Do not break before
4451              * spaces.  (For two spaces in a row there is nothing that
4452              * overrides that) */
4453             assert(after != LB_Space);
4454
4455             /* Here we have a space followed by a non-space.  Mostly this is a
4456              * case of LB18: "Break after spaces".  But there are complications
4457              * as the handling of spaces is somewhat tricky.  They are in a
4458              * number of rules, which have to be applied in priority order, but
4459              * something earlier in the string can cause a rule to be skipped
4460              * and a lower priority rule invoked.  A prime example is LB7 which
4461              * says don't break before a space.  But rule LB8 (lower priority)
4462              * says that the first break opportunity after a ZW is after any
4463              * span of spaces immediately after it.  If a ZW comes before a SP
4464              * in the input, rule LB8 applies, and not LB7.  Other such rules
4465              * involve combining marks which are rules 9 and 10, but they may
4466              * override higher priority rules if they come earlier in the
4467              * string.  Since we're doing random access into the middle of the
4468              * string, we have to look for rules that should get applied based
4469              * on both string position and priority.  Combining marks do not
4470              * attach to either ZW nor SP, so we don't have to consider them
4471              * until later.
4472              *
4473              * To check for LB8, we have to find the first non-space character
4474              * before this span of spaces */
4475             do {
4476                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4477             }
4478             while (prev == LB_Space);
4479
4480             /* LB8 Break before any character following a zero-width space,
4481              * even if one or more spaces intervene.
4482              *      ZW SP* ÷
4483              * So if we have a ZW just before this span, and to get here this
4484              * is the final space in the span. */
4485             if (prev == LB_ZWSpace) {
4486                 return TRUE;
4487             }
4488
4489             /* Here, not ZW SP+.  There are several rules that have higher
4490              * priority than LB18 and can be resolved now, as they don't depend
4491              * on anything earlier in the string (except ZW, which we have
4492              * already handled).  One of these rules is LB11 Do not break
4493              * before Word joiner, but we have specially encoded that in the
4494              * lookup table so it is caught by the single test below which
4495              * catches the other ones. */
4496             if (LB_table[LB_Space][after] - LB_SP_foo
4497                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4498             {
4499                 return FALSE;
4500             }
4501
4502             /* If we get here, we have to XXX consider combining marks. */
4503             if (prev == LB_Combining_Mark) {
4504
4505                 /* What happens with these depends on the character they
4506                  * follow.  */
4507                 do {
4508                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4509                 }
4510                 while (prev == LB_Combining_Mark);
4511
4512                 /* Most times these attach to and inherit the characteristics
4513                  * of that character, but not always, and when not, they are to
4514                  * be treated as AL by rule LB10. */
4515                 if (! LB_CM_ATTACHES_TO(prev)) {
4516                     prev = LB_Alphabetic;
4517                 }
4518             }
4519
4520             /* Here, we have the character preceding the span of spaces all set
4521              * up.  We follow LB18: "Break after spaces" unless the table shows
4522              * that is overriden */
4523             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4524
4525         case LB_CM_ZWJ_foo:
4526
4527             /* We don't know how to treat the CM except by looking at the first
4528              * non-CM character preceding it.  ZWJ is treated as CM */
4529             do {
4530                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4531             }
4532             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4533
4534             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4535              * attatches to it, then it inherits the behavior of 'prev'.  If it
4536              * doesn't attach, it is to be treated as an AL */
4537             if (! LB_CM_ATTACHES_TO(prev)) {
4538                 prev = LB_Alphabetic;
4539             }
4540
4541             goto redo;
4542
4543         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4544         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4545
4546             /* LB21a Don't break after Hebrew + Hyphen.
4547              * HL (HY | BA) × */
4548
4549             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4550                                                           == LB_Hebrew_Letter)
4551             {
4552                 return FALSE;
4553             }
4554
4555             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4556
4557         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4558         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4559
4560             /* LB25a (PR | PO) × ( OP | HY )? NU */
4561             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4562                 return FALSE;
4563             }
4564
4565             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4566                                                                 == LB_BREAKABLE;
4567
4568         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4569         case LB_SY_or_IS_then_various + LB_NOBREAK:
4570         {
4571             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4572
4573             LB_enum temp = prev;
4574             do {
4575                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4576             }
4577             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4578             if (temp == LB_Numeric) {
4579                 return FALSE;
4580             }
4581
4582             return LB_table[prev][after] - LB_SY_or_IS_then_various
4583                                                                == LB_BREAKABLE;
4584         }
4585
4586         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4587         case LB_various_then_PO_or_PR + LB_NOBREAK:
4588         {
4589             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4590
4591             LB_enum temp = prev;
4592             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4593             {
4594                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4595             }
4596             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4597                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4598             }
4599             if (temp == LB_Numeric) {
4600                 return FALSE;
4601             }
4602             return LB_various_then_PO_or_PR;
4603         }
4604
4605         case LB_RI_then_RI + LB_NOBREAK:
4606         case LB_RI_then_RI + LB_BREAKABLE:
4607             {
4608                 int RI_count = 1;
4609
4610                 /* LB30a Break between two regional indicator symbols if and
4611                  * only if there are an even number of regional indicators
4612                  * preceding the position of the break.
4613                  *
4614                  *  sot (RI RI)* RI × RI
4615                  *  [^RI] (RI RI)* RI × RI */
4616
4617                 while (backup_one_LB(strbeg,
4618                                      &temp_pos,
4619                                      utf8_target) == LB_Regional_Indicator)
4620                 {
4621                     RI_count++;
4622                 }
4623
4624                 return RI_count % 2 == 0;
4625             }
4626
4627         default:
4628             break;
4629     }
4630
4631 #ifdef DEBUGGING
4632     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4633                                   before, after, LB_table[before][after]);
4634     assert(0);
4635 #endif
4636     return TRUE;
4637 }
4638
4639 STATIC LB_enum
4640 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4641 {
4642     LB_enum lb;
4643
4644     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4645
4646     if (*curpos >= strend) {
4647         return LB_EDGE;
4648     }
4649
4650     if (utf8_target) {
4651         *curpos += UTF8SKIP(*curpos);
4652         if (*curpos >= strend) {
4653             return LB_EDGE;
4654         }
4655         lb = getLB_VAL_UTF8(*curpos, strend);
4656     }
4657     else {
4658         (*curpos)++;
4659         if (*curpos >= strend) {
4660             return LB_EDGE;
4661         }
4662         lb = getLB_VAL_CP(**curpos);
4663     }
4664
4665     return lb;
4666 }
4667
4668 STATIC LB_enum
4669 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4670 {
4671     LB_enum lb;
4672
4673     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4674
4675     if (*curpos < strbeg) {
4676         return LB_EDGE;
4677     }
4678
4679     if (utf8_target) {
4680         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4681         U8 * prev_prev_char_pos;
4682
4683         if (! prev_char_pos) {
4684             return LB_EDGE;
4685         }
4686
4687         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4688             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4689             *curpos = prev_char_pos;
4690             prev_char_pos = prev_prev_char_pos;
4691         }
4692         else {
4693             *curpos = (U8 *) strbeg;
4694             return LB_EDGE;
4695         }
4696     }
4697     else {
4698         if (*curpos - 2 < strbeg) {
4699             *curpos = (U8 *) strbeg;
4700             return LB_EDGE;
4701         }
4702         (*curpos)--;
4703         lb = getLB_VAL_CP(*(*curpos - 1));
4704     }
4705
4706     return lb;
4707 }
4708
4709 STATIC bool
4710 S_isSB(pTHX_ SB_enum before,
4711              SB_enum after,
4712              const U8 * const strbeg,
4713              const U8 * const curpos,
4714              const U8 * const strend,
4715              const bool utf8_target)
4716 {
4717     /* returns a boolean indicating if there is a Sentence Boundary Break
4718      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4719
4720     U8 * lpos = (U8 *) curpos;
4721     bool has_para_sep = FALSE;
4722     bool has_sp = FALSE;
4723
4724     PERL_ARGS_ASSERT_ISSB;
4725
4726     /* Break at the start and end of text.
4727         SB1.  sot  ÷
4728         SB2.  ÷  eot
4729       But unstated in Unicode is don't break if the text is empty */
4730     if (before == SB_EDGE || after == SB_EDGE) {
4731         return before != after;
4732     }
4733
4734     /* SB 3: Do not break within CRLF. */
4735     if (before == SB_CR && after == SB_LF) {
4736         return FALSE;
4737     }
4738
4739     /* Break after paragraph separators.  CR and LF are considered
4740      * so because Unicode views text as like word processing text where there
4741      * are no newlines except between paragraphs, and the word processor takes
4742      * care of wrapping without there being hard line-breaks in the text *./
4743        SB4.  Sep | CR | LF  ÷ */
4744     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4745         return TRUE;
4746     }
4747
4748     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4749      * (See Section 6.2, Replacing Ignore Rules.)
4750         SB5.  X (Extend | Format)*  →  X */
4751     if (after == SB_Extend || after == SB_Format) {
4752
4753         /* Implied is that the these characters attach to everything
4754          * immediately prior to them except for those separator-type
4755          * characters.  And the rules earlier have already handled the case
4756          * when one of those immediately precedes the extend char */
4757         return FALSE;
4758     }
4759
4760     if (before == SB_Extend || before == SB_Format) {
4761         U8 * temp_pos = lpos;
4762         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4763         if (   backup != SB_EDGE
4764             && backup != SB_Sep
4765             && backup != SB_CR
4766             && backup != SB_LF)
4767         {
4768             before = backup;
4769             lpos = temp_pos;
4770         }
4771
4772         /* Here, both 'before' and 'backup' are these types; implied is that we
4773          * don't break between them */
4774         if (backup == SB_Extend || backup == SB_Format) {
4775             return FALSE;
4776         }
4777     }
4778
4779     /* Do not break after ambiguous terminators like period, if they are
4780      * immediately followed by a number or lowercase letter, if they are
4781      * between uppercase letters, if the first following letter (optionally
4782      * after certain punctuation) is lowercase, or if they are followed by
4783      * "continuation" punctuation such as comma, colon, or semicolon. For
4784      * example, a period may be an abbreviation or numeric period, and thus may
4785      * not mark the end of a sentence.
4786
4787      * SB6. ATerm  ×  Numeric */
4788     if (before == SB_ATerm && after == SB_Numeric) {
4789         return FALSE;
4790     }
4791
4792     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4793     if (before == SB_ATerm && after == SB_Upper) {
4794         U8 * temp_pos = lpos;
4795         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4796         if (backup == SB_Upper || backup == SB_Lower) {
4797             return FALSE;
4798         }
4799     }
4800
4801     /* The remaining rules that aren't the final one, all require an STerm or
4802      * an ATerm after having backed up over some Close* Sp*, and in one case an
4803      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4804      * So do that backup now, setting flags if either Sp or a paragraph
4805      * separator are found */
4806
4807     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4808         has_para_sep = TRUE;
4809         before = backup_one_SB(strbeg, &lpos, utf8_target);
4810     }
4811
4812     if (before == SB_Sp) {
4813         has_sp = TRUE;
4814         do {
4815             before = backup_one_SB(strbeg, &lpos, utf8_target);
4816         }
4817         while (before == SB_Sp);
4818     }
4819
4820     while (before == SB_Close) {
4821         before = backup_one_SB(strbeg, &lpos, utf8_target);
4822     }
4823
4824     /* The next few rules apply only when the backed-up-to is an ATerm, and in
4825      * most cases an STerm */
4826     if (before == SB_STerm || before == SB_ATerm) {
4827
4828         /* So, here the lhs matches
4829          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4830          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4831          * The rules that apply here are:
4832          *
4833          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
4834                                            | LF | STerm | ATerm) )* Lower
4835            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4836            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
4837            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
4838            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
4839          */
4840
4841         /* And all but SB11 forbid having seen a paragraph separator */
4842         if (! has_para_sep) {
4843             if (before == SB_ATerm) {          /* SB8 */
4844                 U8 * rpos = (U8 *) curpos;
4845                 SB_enum later = after;
4846
4847                 while (    later != SB_OLetter
4848                         && later != SB_Upper
4849                         && later != SB_Lower
4850                         && later != SB_Sep
4851                         && later != SB_CR
4852                         && later != SB_LF
4853                         && later != SB_STerm
4854                         && later != SB_ATerm
4855                         && later != SB_EDGE)
4856                 {
4857                     later = advance_one_SB(&rpos, strend, utf8_target);
4858                 }
4859                 if (later == SB_Lower) {
4860                     return FALSE;
4861                 }
4862             }
4863
4864             if (   after == SB_SContinue    /* SB8a */
4865                 || after == SB_STerm
4866                 || after == SB_ATerm)
4867             {
4868                 return FALSE;
4869             }
4870
4871             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
4872                 if (   after == SB_Close
4873                     || after == SB_Sp
4874                     || after == SB_Sep
4875                     || after == SB_CR
4876                     || after == SB_LF)
4877                 {
4878                     return FALSE;
4879                 }
4880             }
4881
4882             /* SB10.  This and SB9 could probably be combined some way, but khw
4883              * has decided to follow the Unicode rule book precisely for
4884              * simplified maintenance */
4885             if (   after == SB_Sp
4886                 || after == SB_Sep
4887                 || after == SB_CR
4888                 || after == SB_LF)
4889             {
4890                 return FALSE;
4891             }
4892         }
4893
4894         /* SB11.  */
4895         return TRUE;
4896     }
4897
4898     /* Otherwise, do not break.
4899     SB12.  Any  ×  Any */
4900
4901     return FALSE;
4902 }
4903
4904 STATIC SB_enum
4905 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4906 {
4907     SB_enum sb;
4908
4909     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4910
4911     if (*curpos >= strend) {
4912         return SB_EDGE;
4913     }
4914
4915     if (utf8_target) {
4916         do {
4917             *curpos += UTF8SKIP(*curpos);
4918             if (*curpos >= strend) {
4919                 return SB_EDGE;
4920             }
4921             sb = getSB_VAL_UTF8(*curpos, strend);
4922         } while (sb == SB_Extend || sb == SB_Format);
4923     }
4924     else {
4925         do {
4926             (*curpos)++;
4927             if (*curpos >= strend) {
4928                 return SB_EDGE;
4929             }
4930             sb = getSB_VAL_CP(**curpos);
4931         } while (sb == SB_Extend || sb == SB_Format);
4932     }
4933
4934     return sb;
4935 }
4936
4937 STATIC SB_enum
4938 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4939 {
4940     SB_enum sb;
4941
4942     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4943
4944     if (*curpos < strbeg) {
4945         return SB_EDGE;
4946     }
4947
4948     if (utf8_target) {
4949         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4950         if (! prev_char_pos) {
4951             return SB_EDGE;
4952         }
4953
4954         /* Back up over Extend and Format.  curpos is always just to the right
4955          * of the characater whose value we are getting */
4956         do {
4957             U8 * prev_prev_char_pos;
4958             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4959                                                                       strbeg)))
4960             {
4961                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4962                 *curpos = prev_char_pos;
4963                 prev_char_pos = prev_prev_char_pos;
4964             }
4965             else {
4966                 *curpos = (U8 *) strbeg;
4967                 return SB_EDGE;
4968             }
4969         } while (sb == SB_Extend || sb == SB_Format);
4970     }
4971     else {
4972         do {
4973             if (*curpos - 2 < strbeg) {
4974                 *curpos = (U8 *) strbeg;
4975                 return SB_EDGE;
4976             }
4977             (*curpos)--;
4978             sb = getSB_VAL_CP(*(*curpos - 1));
4979         } while (sb == SB_Extend || sb == SB_Format);
4980     }
4981
4982     return sb;
4983 }
4984
4985 STATIC bool
4986 S_isWB(pTHX_ WB_enum previous,
4987              WB_enum before,
4988              WB_enum after,
4989              const U8 * const strbeg,
4990              const U8 * const curpos,
4991              const U8 * const strend,
4992              const bool utf8_target)
4993 {
4994     /*  Return a boolean as to if the boundary between 'before' and 'after' is
4995      *  a Unicode word break, using their published algorithm, but tailored for
4996      *  Perl by treating spans of white space as one unit.  Context may be
4997      *  needed to make this determination.  If the value for the character
4998      *  before 'before' is known, it is passed as 'previous'; otherwise that
4999      *  should be set to WB_UNKNOWN.  The other input parameters give the
5000      *  boundaries and current position in the matching of the string.  That
5001      *  is, 'curpos' marks the position where the character whose wb value is
5002      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5003
5004     U8 * before_pos = (U8 *) curpos;
5005     U8 * after_pos = (U8 *) curpos;
5006     WB_enum prev = before;
5007     WB_enum next;
5008
5009     PERL_ARGS_ASSERT_ISWB;
5010
5011     /* Rule numbers in the comments below are as of Unicode 9.0 */
5012
5013   redo:
5014     before = prev;
5015     switch (WB_table[before][after]) {
5016         case WB_BREAKABLE:
5017             return TRUE;
5018
5019         case WB_NOBREAK:
5020             return FALSE;
5021
5022         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5023             next = advance_one_WB(&after_pos, strend, utf8_target,
5024                                  FALSE /* Don't skip Extend nor Format */ );
5025             /* A space immediately preceeding an Extend or Format is attached
5026              * to by them, and hence gets separated from previous spaces.
5027              * Otherwise don't break between horizontal white space */
5028             return next == WB_Extend || next == WB_Format;
5029
5030         /* WB4 Ignore Format and Extend characters, except when they appear at
5031          * the beginning of a region of text.  This code currently isn't
5032          * general purpose, but it works as the rules are currently and likely
5033          * to be laid out.  The reason it works is that when 'they appear at
5034          * the beginning of a region of text', the rule is to break before
5035          * them, just like any other character.  Therefore, the default rule
5036          * applies and we don't have to look in more depth.  Should this ever
5037          * change, we would have to have 2 'case' statements, like in the rules
5038          * below, and backup a single character (not spacing over the extend
5039          * ones) and then see if that is one of the region-end characters and
5040          * go from there */
5041         case WB_Ex_or_FO_or_ZWJ_then_foo:
5042             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5043             goto redo;
5044
5045         case WB_DQ_then_HL + WB_BREAKABLE:
5046         case WB_DQ_then_HL + WB_NOBREAK:
5047
5048             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5049
5050             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5051                                                             == WB_Hebrew_Letter)
5052             {
5053                 return FALSE;
5054             }
5055
5056              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5057
5058         case WB_HL_then_DQ + WB_BREAKABLE:
5059         case WB_HL_then_DQ + WB_NOBREAK:
5060
5061             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5062
5063             if (advance_one_WB(&after_pos, strend, utf8_target,
5064                                        TRUE /* Do skip Extend and Format */ )
5065                                                             == WB_Hebrew_Letter)
5066             {
5067                 return FALSE;
5068             }
5069
5070             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5071
5072         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5073         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5074
5075             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5076              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5077
5078             next = advance_one_WB(&after_pos, strend, utf8_target,
5079                                        TRUE /* Do skip Extend and Format */ );
5080
5081             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5082             {
5083                 return FALSE;
5084             }
5085
5086             return WB_table[before][after]
5087                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5088
5089         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5090         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5091
5092             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5093              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5094
5095             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5096             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5097             {
5098                 return FALSE;
5099             }
5100
5101             return WB_table[before][after]
5102                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5103
5104         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5105         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5106
5107             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5108              * */
5109
5110             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5111                                                             == WB_Numeric)
5112             {
5113                 return FALSE;
5114             }
5115
5116             return WB_table[before][after]
5117                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5118
5119         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5120         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5121
5122             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5123
5124             if (advance_one_WB(&after_pos, strend, utf8_target,
5125                                        TRUE /* Do skip Extend and Format */ )
5126                                                             == WB_Numeric)
5127             {
5128                 return FALSE;
5129             }
5130
5131             return WB_table[before][after]
5132                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5133
5134         case WB_RI_then_RI + WB_NOBREAK:
5135         case WB_RI_then_RI + WB_BREAKABLE:
5136             {
5137                 int RI_count = 1;
5138
5139                 /* Do not break within emoji flag sequences. That is, do not
5140                  * break between regional indicator (RI) symbols if there is an
5141                  * odd number of RI characters before the potential break
5142                  * point.
5143                  *
5144                  * WB15     ^ (RI RI)* RI × RI
5145                  * WB16 [^RI] (RI RI)* RI × RI */
5146
5147                 while (backup_one_WB(&previous,
5148                                      strbeg,
5149                                      &before_pos,
5150                                      utf8_target) == WB_Regional_Indicator)
5151                 {
5152                     RI_count++;
5153                 }
5154
5155                 return RI_count % 2 != 1;
5156             }
5157
5158         default:
5159             break;
5160     }
5161
5162 #ifdef DEBUGGING
5163     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5164                                   before, after, WB_table[before][after]);
5165     assert(0);
5166 #endif
5167     return TRUE;
5168 }
5169
5170 STATIC WB_enum
5171 S_advance_one_WB(pTHX_ U8 ** curpos,
5172                        const U8 * const strend,
5173                        const bool utf8_target,
5174                        const bool skip_Extend_Format)
5175 {
5176     WB_enum wb;
5177
5178     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5179
5180     if (*curpos >= strend) {
5181         return WB_EDGE;
5182     }
5183
5184     if (utf8_target) {
5185
5186         /* Advance over Extend and Format */
5187         do {
5188             *curpos += UTF8SKIP(*curpos);
5189             if (*curpos >= strend) {
5190                 return WB_EDGE;
5191             }
5192             wb = getWB_VAL_UTF8(*curpos, strend);
5193         } while (    skip_Extend_Format
5194                  && (wb == WB_Extend || wb == WB_Format));
5195     }
5196     else {
5197         do {
5198             (*curpos)++;
5199             if (*curpos >= strend) {
5200                 return WB_EDGE;
5201             }
5202             wb = getWB_VAL_CP(**curpos);
5203         } while (    skip_Extend_Format
5204                  && (wb == WB_Extend || wb == WB_Format));
5205     }
5206
5207     return wb;
5208 }
5209
5210 STATIC WB_enum
5211 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5212 {
5213     WB_enum wb;
5214
5215     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5216
5217     /* If we know what the previous character's break value is, don't have
5218         * to look it up */
5219     if (*previous != WB_UNKNOWN) {
5220         wb = *previous;
5221
5222         /* But we need to move backwards by one */
5223         if (utf8_target) {
5224             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5225             if (! *curpos) {
5226                 *previous = WB_EDGE;
5227                 *curpos = (U8 *) strbeg;
5228             }
5229             else {
5230                 *previous = WB_UNKNOWN;
5231             }
5232         }
5233         else {
5234             (*curpos)--;
5235             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5236         }
5237
5238         /* And we always back up over these three types */
5239         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5240             return wb;
5241         }
5242     }
5243
5244     if (*curpos < strbeg) {
5245         return WB_EDGE;
5246     }
5247
5248     if (utf8_target) {
5249         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5250         if (! prev_char_pos) {
5251             return WB_EDGE;
5252         }
5253
5254         /* Back up over Extend and Format.  curpos is always just to the right
5255          * of the characater whose value we are getting */
5256         do {
5257             U8 * prev_prev_char_pos;
5258             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5259                                                    -1,
5260                                                    strbeg)))
5261             {
5262                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5263                 *curpos = prev_char_pos;
5264                 prev_char_pos = prev_prev_char_pos;
5265             }
5266             else {
5267                 *curpos = (U8 *) strbeg;
5268                 return WB_EDGE;
5269             }
5270         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5271     }
5272     else {
5273         do {
5274             if (*curpos - 2 < strbeg) {
5275                 *curpos = (U8 *) strbeg;
5276                 return WB_EDGE;
5277             }
5278             (*curpos)--;
5279             wb = getWB_VAL_CP(*(*curpos - 1));
5280         } while (wb == WB_Extend || wb == WB_Format);
5281     }
5282
5283     return wb;
5284 }
5285
5286 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5287 (                                                           \
5288     (   ( st )                                         ) && \
5289     (   ( st )->u.eval.close_paren                     ) && \
5290     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5291 )
5292
5293 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5294 (                                                           \
5295     (   ( st )                                         ) && \
5296     (   ( st )->u.eval.close_paren                     ) && \
5297     (   ( expr )                                       ) && \
5298     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5299 )
5300
5301
5302 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5303     (st)->u.eval.close_paren = ( (expr) + 1 )
5304
5305 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5306     (st)->u.eval.close_paren = 0
5307
5308 /* returns -1 on failure, $+[0] on success */
5309 STATIC SSize_t
5310 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5311 {
5312
5313 #if PERL_VERSION < 9 && !defined(PERL_CORE)
5314     dMY_CXT;
5315 #endif
5316     dVAR;
5317     const bool utf8_target = reginfo->is_utf8_target;
5318     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5319     REGEXP *rex_sv = reginfo->prog;
5320     regexp *rex = ReANY(rex_sv);
5321     RXi_GET_DECL(rex,rexi);
5322     /* the current state. This is a cached copy of PL_regmatch_state */
5323     regmatch_state *st;
5324     /* cache heavy used fields of st in registers */
5325     regnode *scan;
5326     regnode *next;
5327     U32 n = 0;  /* general value; init to avoid compiler warning */
5328     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5329     char *locinput = startpos;
5330     char *pushinput; /* where to continue after a PUSH */
5331     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5332
5333     bool result = 0;        /* return value of S_regmatch */
5334     int depth = 0;          /* depth of backtrack stack */
5335     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5336     const U32 max_nochange_depth =
5337         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5338         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5339     regmatch_state *yes_state = NULL; /* state to pop to on success of
5340                                                             subpattern */
5341     /* mark_state piggy backs on the yes_state logic so that when we unwind
5342        the stack on success we can update the mark_state as we go */
5343     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5344     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5345     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5346     U32 state_num;
5347     bool no_final = 0;      /* prevent failure from backtracking? */
5348     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5349     char *startpoint = locinput;
5350     SV *popmark = NULL;     /* are we looking for a mark? */
5351     SV *sv_commit = NULL;   /* last mark name seen in failure */
5352     SV *sv_yes_mark = NULL; /* last mark name we have seen
5353                                during a successful match */
5354     U32 lastopen = 0;       /* last open we saw */
5355     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
5356     SV* const oreplsv = GvSVn(PL_replgv);
5357     /* these three flags are set by various ops to signal information to
5358      * the very next op. They have a useful lifetime of exactly one loop
5359      * iteration, and are not preserved or restored by state pushes/pops
5360      */
5361     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5362     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5363     int logical = 0;        /* the following EVAL is:
5364                                 0: (?{...})
5365                                 1: (?(?{...})X|Y)
5366                                 2: (??{...})
5367                                or the following IFMATCH/UNLESSM is:
5368                                 false: plain (?=foo)
5369                                 true:  used as a condition: (?(?=foo))
5370                             */
5371     PAD* last_pad = NULL;
5372     dMULTICALL;
5373     U8 gimme = G_SCALAR;
5374     CV *caller_cv = NULL;       /* who called us */
5375     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5376     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
5377     U32 maxopenparen = 0;       /* max '(' index seen so far */
5378     int to_complement;  /* Invert the result? */
5379     _char_class_number classnum;
5380     bool is_utf8_pat = reginfo->is_utf8_pat;
5381     bool match = FALSE;
5382
5383 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5384 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5385 #  define SOLARIS_BAD_OPTIMIZER
5386     const U32 *pl_charclass_dup = PL_charclass;
5387 #  define PL_charclass pl_charclass_dup
5388 #endif
5389
5390 #ifdef DEBUGGING
5391     GET_RE_DEBUG_FLAGS_DECL;
5392 #endif
5393
5394     /* protect against undef(*^R) */
5395     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5396
5397     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5398     multicall_oldcatch = 0;
5399     PERL_UNUSED_VAR(multicall_cop);
5400
5401     PERL_ARGS_ASSERT_REGMATCH;
5402
5403     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5404             Perl_re_printf( aTHX_ "regmatch start\n");
5405     }));
5406
5407     st = PL_regmatch_state;
5408
5409     /* Note that nextchr is a byte even in UTF */
5410     SET_nextchr;
5411     scan = prog;
5412     while (scan != NULL) {
5413
5414
5415         next = scan + NEXT_OFF(scan);
5416         if (next == scan)
5417             next = NULL;
5418         state_num = OP(scan);
5419
5420       reenter_switch:
5421         DEBUG_EXECUTE_r(
5422             if (state_num <= REGNODE_MAX) {
5423                 SV * const prop = sv_newmortal();
5424                 regnode *rnext = regnext(scan);
5425
5426                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5427                 regprop(rex, prop, scan, reginfo, NULL);
5428                 Perl_re_printf( aTHX_
5429                     "%*s%"IVdf":%s(%"IVdf")\n",
5430                     INDENT_CHARS(depth), "",
5431                     (IV)(scan - rexi->program),
5432                     SvPVX_const(prop),
5433                     (PL_regkind[OP(scan)] == END || !rnext) ?
5434                         0 : (IV)(rnext - rexi->program));
5435             }
5436         );
5437
5438         to_complement = 0;
5439
5440         SET_nextchr;
5441         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5442
5443         switch (state_num) {
5444         case SBOL: /*  /^../ and /\A../  */
5445             if (locinput == reginfo->strbeg)
5446                 break;
5447             sayNO;
5448
5449         case MBOL: /*  /^../m  */
5450             if (locinput == reginfo->strbeg ||
5451                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5452             {
5453                 break;
5454             }
5455             sayNO;
5456
5457         case GPOS: /*  \G  */
5458             if (locinput == reginfo->ganch)
5459                 break;
5460             sayNO;
5461
5462         case KEEPS: /*   \K  */
5463             /* update the startpoint */
5464             st->u.keeper.val = rex->offs[0].start;
5465             rex->offs[0].start = locinput - reginfo->strbeg;
5466             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5467             NOT_REACHED; /* NOTREACHED */
5468
5469         case KEEPS_next_fail:
5470             /* rollback the start point change */
5471             rex->offs[0].start = st->u.keeper.val;
5472             sayNO_SILENT;
5473             NOT_REACHED; /* NOTREACHED */
5474
5475         case MEOL: /* /..$/m  */
5476             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5477                 sayNO;
5478             break;
5479
5480         case SEOL: /* /..$/  */
5481             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5482                 sayNO;
5483             if (reginfo->strend - locinput > 1)
5484                 sayNO;
5485             break;
5486
5487         case EOS: /*  \z  */
5488             if (!NEXTCHR_IS_EOS)
5489                 sayNO;
5490             break;
5491
5492         case SANY: /*  /./s  */
5493             if (NEXTCHR_IS_EOS)
5494                 sayNO;
5495             goto increment_locinput;
5496
5497         case REG_ANY: /*  /./  */
5498             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5499                 sayNO;
5500             goto increment_locinput;
5501
5502
5503 #undef  ST
5504 #define ST st->u.trie
5505         case TRIEC: /* (ab|cd) with known charclass */
5506             /* In this case the charclass data is available inline so
5507                we can fail fast without a lot of extra overhead.
5508              */
5509             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5510                 DEBUG_EXECUTE_r(
5511                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5512                               depth, PL_colors[4], PL_colors[5])
5513                 );
5514                 sayNO_SILENT;
5515                 NOT_REACHED; /* NOTREACHED */
5516             }
5517             /* FALLTHROUGH */
5518         case TRIE:  /* (ab|cd)  */
5519             /* the basic plan of execution of the trie is:
5520              * At the beginning, run though all the states, and
5521              * find the longest-matching word. Also remember the position
5522              * of the shortest matching word. For example, this pattern:
5523              *    1  2 3 4    5
5524              *    ab|a|x|abcd|abc
5525              * when matched against the string "abcde", will generate
5526              * accept states for all words except 3, with the longest
5527              * matching word being 4, and the shortest being 2 (with
5528              * the position being after char 1 of the string).
5529              *
5530              * Then for each matching word, in word order (i.e. 1,2,4,5),
5531              * we run the remainder of the pattern; on each try setting
5532              * the current position to the character following the word,
5533              * returning to try the next word on failure.
5534              *
5535              * We avoid having to build a list of words at runtime by
5536              * using a compile-time structure, wordinfo[].prev, which
5537              * gives, for each word, the previous accepting word (if any).
5538              * In the case above it would contain the mappings 1->2, 2->0,
5539              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5540              * the longest word (4 above), a list of all words, by
5541              * following the list of prev pointers; this gives us the
5542              * unordered list 4,5,1,2. Then given the current word we have
5543              * just tried, we can go through the list and find the
5544              * next-biggest word to try (so if we just failed on word 2,
5545              * the next in the list is 4).
5546              *
5547              * Since at runtime we don't record the matching position in
5548              * the string for each word, we have to work that out for
5549              * each word we're about to process. The wordinfo table holds
5550              * the character length of each word; given that we recorded
5551              * at the start: the position of the shortest word and its
5552              * length in chars, we just need to move the pointer the
5553              * difference between the two char lengths. Depending on
5554              * Unicode status and folding, that's cheap or expensive.
5555              *
5556              * This algorithm is optimised for the case where are only a
5557              * small number of accept states, i.e. 0,1, or maybe 2.
5558              * With lots of accepts states, and having to try all of them,
5559              * it becomes quadratic on number of accept states to find all
5560              * the next words.
5561              */
5562
5563             {
5564                 /* what type of TRIE am I? (utf8 makes this contextual) */
5565                 DECL_TRIE_TYPE(scan);
5566
5567                 /* what trie are we using right now */
5568                 reg_trie_data * const trie
5569                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5570                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5571                 U32 state = trie->startstate;
5572
5573                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5574                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5575                     if (utf8_target
5576                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5577                         && scan->flags == EXACTL)
5578                     {
5579                         /* We only output for EXACTL, as we let the folder
5580                          * output this message for EXACTFLU8 to avoid
5581                          * duplication */
5582                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5583                                                                reginfo->strend);
5584                     }
5585                 }
5586                 if (   trie->bitmap
5587                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5588                 {
5589                     if (trie->states[ state ].wordnum) {
5590                          DEBUG_EXECUTE_r(
5591                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5592                                           depth, PL_colors[4], PL_colors[5])
5593                         );
5594                         if (!trie->jump)
5595                             break;
5596                     } else {
5597                         DEBUG_EXECUTE_r(
5598                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5599                                           depth, PL_colors[4], PL_colors[5])
5600                         );
5601                         sayNO_SILENT;
5602                    }
5603                 }
5604
5605             {
5606                 U8 *uc = ( U8* )locinput;
5607
5608                 STRLEN len = 0;
5609                 STRLEN foldlen = 0;
5610                 U8 *uscan = (U8*)NULL;
5611                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5612                 U32 charcount = 0; /* how many input chars we have matched */
5613                 U32 accepted = 0; /* have we seen any accepting states? */
5614
5615                 ST.jump = trie->jump;
5616                 ST.me = scan;
5617                 ST.firstpos = NULL;
5618                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5619                 ST.nextword = 0;
5620
5621                 /* fully traverse the TRIE; note the position of the
5622                    shortest accept state and the wordnum of the longest
5623                    accept state */
5624
5625                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5626                     U32 base = trie->states[ state ].trans.base;
5627                     UV uvc = 0;
5628                     U16 charid = 0;
5629                     U16 wordnum;
5630                     wordnum = trie->states[ state ].wordnum;
5631
5632                     if (wordnum) { /* it's an accept state */
5633                         if (!accepted) {
5634                             accepted = 1;
5635                             /* record first match position */
5636                             if (ST.longfold) {
5637                                 ST.firstpos = (U8*)locinput;
5638                                 ST.firstchars = 0;
5639                             }
5640                             else {
5641                                 ST.firstpos = uc;
5642                                 ST.firstchars = charcount;
5643                             }
5644                         }
5645                         if (!ST.nextword || wordnum < ST.nextword)
5646                             ST.nextword = wordnum;
5647                         ST.topword = wordnum;
5648                     }
5649
5650                     DEBUG_TRIE_EXECUTE_r({
5651                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5652                                 Perl_re_exec_indentf( aTHX_
5653                                     "%sState: %4"UVxf" Accepted: %c ",
5654                                     depth, PL_colors[4],
5655                                     (UV)state, (accepted ? 'Y' : 'N'));
5656                     });
5657
5658                     /* read a char and goto next state */
5659                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5660                         I32 offset;
5661                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5662                                              uscan, len, uvc, charid, foldlen,
5663                                              foldbuf, uniflags);
5664                         charcount++;
5665                         if (foldlen>0)
5666                             ST.longfold = TRUE;
5667                         if (charid &&
5668                              ( ((offset =
5669                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5670
5671                              && ((U32)offset < trie->lasttrans)
5672                              && trie->trans[offset].check == state)
5673                         {
5674                             state = trie->trans[offset].next;
5675                         }
5676                         else {
5677                             state = 0;
5678                         }
5679                         uc += len;
5680
5681                     }
5682                     else {
5683                         state = 0;
5684                     }
5685                     DEBUG_TRIE_EXECUTE_r(
5686                         Perl_re_printf( aTHX_
5687                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5688                             charid, uvc, (UV)state, PL_colors[5] );
5689                     );
5690                 }
5691                 if (!accepted)
5692                    sayNO;
5693
5694                 /* calculate total number of accept states */
5695                 {
5696                     U16 w = ST.topword;
5697                     accepted = 0;
5698                     while (w) {
5699                         w = trie->wordinfo[w].prev;
5700                         accepted++;
5701                     }
5702                     ST.accepted = accepted;
5703                 }
5704
5705                 DEBUG_EXECUTE_r(
5706                     Perl_re_exec_indentf( aTHX_  "%sgot %"IVdf" possible matches%s\n",
5707                         depth,
5708                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5709                 );
5710                 goto trie_first_try; /* jump into the fail handler */
5711             }}
5712             NOT_REACHED; /* NOTREACHED */
5713
5714         case TRIE_next_fail: /* we failed - try next alternative */
5715         {
5716             U8 *uc;
5717             if ( ST.jump) {
5718                 REGCP_UNWIND(ST.cp);
5719                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5720             }
5721             if (!--ST.accepted) {
5722                 DEBUG_EXECUTE_r({
5723                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5724                         depth,
5725                         PL_colors[4],
5726                         PL_colors[5] );
5727                 });
5728                 sayNO_SILENT;
5729             }
5730             {
5731                 /* Find next-highest word to process.  Note that this code
5732                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5733                 U16 min = 0;
5734                 U16 word;
5735                 U16 const nextword = ST.nextword;
5736                 reg_trie_wordinfo * const wordinfo
5737                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5738                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5739                     if (word > nextword && (!min || word < min))
5740                         min = word;
5741                 }
5742                 ST.nextword = min;
5743             }
5744
5745           trie_first_try:
5746             if (do_cutgroup) {
5747                 do_cutgroup = 0;
5748                 no_final = 0;
5749             }
5750
5751             if ( ST.jump) {
5752                 ST.lastparen = rex->lastparen;
5753                 ST.lastcloseparen = rex->lastcloseparen;
5754                 REGCP_SET(ST.cp);
5755             }
5756
5757             /* find start char of end of current word */
5758             {
5759                 U32 chars; /* how many chars to skip */
5760                 reg_trie_data * const trie
5761                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5762
5763                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5764                             >=  ST.firstchars);
5765                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5766                             - ST.firstchars;
5767                 uc = ST.firstpos;
5768
5769                 if (ST.longfold) {
5770                     /* the hard option - fold each char in turn and find
5771                      * its folded length (which may be different */
5772                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5773                     STRLEN foldlen;
5774                     STRLEN len;
5775                     UV uvc;
5776                     U8 *uscan;
5777
5778                     while (chars) {
5779                         if (utf8_target) {
5780                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5781                                                     uniflags);
5782                             uc += len;
5783                         }
5784                         else {
5785                             uvc = *uc;
5786                             uc++;
5787                         }
5788                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5789                         uscan = foldbuf;
5790                         while (foldlen) {
5791                             if (!--chars)
5792                                 break;
5793                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5794                                             uniflags);
5795                             uscan += len;
5796                             foldlen -= len;
5797                         }
5798                     }
5799                 }
5800                 else {
5801                     if (utf8_target)
5802                         while (chars--)
5803                             uc += UTF8SKIP(uc);
5804                     else
5805                         uc += chars;
5806                 }
5807             }
5808
5809             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5810                             ? ST.jump[ST.nextword]
5811                             : NEXT_OFF(ST.me));
5812
5813             DEBUG_EXECUTE_r({
5814                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
5815                     depth,
5816                     PL_colors[4],
5817                     ST.nextword,
5818                     PL_colors[5]
5819                     );
5820             });
5821
5822             if (ST.accepted > 1 || has_cutgroup) {
5823                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5824                 NOT_REACHED; /* NOTREACHED */
5825             }
5826             /* only one choice left - just continue */
5827             DEBUG_EXECUTE_r({
5828                 AV *const trie_words
5829                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5830                 SV ** const tmp = trie_words
5831                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5832                 SV *sv= tmp ? sv_newmortal() : NULL;
5833
5834                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5835                     depth, PL_colors[4],
5836                     ST.nextword,
5837                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5838                             PL_colors[0], PL_colors[1],
5839                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5840                         )
5841                     : "not compiled under -Dr",
5842                     PL_colors[5] );
5843             });
5844
5845             locinput = (char*)uc;
5846             continue; /* execute rest of RE */
5847             /* NOTREACHED */
5848         }
5849 #undef  ST
5850
5851         case EXACTL:             /*  /abc/l       */
5852             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5853
5854             /* Complete checking would involve going through every character
5855              * matched by the string to see if any is above latin1.  But the
5856              * comparision otherwise might very well be a fast assembly
5857              * language routine, and I (khw) don't think slowing things down
5858              * just to check for this warning is worth it.  So this just checks
5859              * the first character */
5860             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5861                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5862             }
5863             /* FALLTHROUGH */
5864         case EXACT: {            /*  /abc/        */
5865             char *s = STRING(scan);
5866             ln = STR_LEN(scan);
5867             if (utf8_target != is_utf8_pat) {
5868                 /* The target and the pattern have differing utf8ness. */
5869                 char *l = locinput;
5870                 const char * const e = s + ln;
5871
5872                 if (utf8_target) {
5873                     /* The target is utf8, the pattern is not utf8.
5874                      * Above-Latin1 code points can't match the pattern;
5875                      * invariants match exactly, and the other Latin1 ones need
5876                      * to be downgraded to a single byte in order to do the
5877                      * comparison.  (If we could be confident that the target
5878                      * is not malformed, this could be refactored to have fewer
5879                      * tests by just assuming that if the first bytes match, it
5880                      * is an invariant, but there are tests in the test suite
5881                      * dealing with (??{...}) which violate this) */
5882                     while (s < e) {
5883                         if (l >= reginfo->strend
5884                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5885                         {
5886                             sayNO;
5887                         }
5888                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5889                             if (*l != *s) {
5890                                 sayNO;
5891                             }
5892                             l++;
5893                         }
5894                         else {
5895                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5896                             {
5897                                 sayNO;
5898                             }
5899                             l += 2;
5900                         }
5901                         s++;
5902                     }
5903                 }
5904                 else {
5905                     /* The target is not utf8, the pattern is utf8. */
5906                     while (s < e) {
5907                         if (l >= reginfo->strend
5908                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5909                         {
5910                             sayNO;
5911                         }
5912                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5913                             if (*s != *l) {
5914                                 sayNO;
5915                             }
5916                             s++;
5917                         }
5918                         else {
5919                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5920                             {
5921                                 sayNO;
5922                             }
5923                             s += 2;
5924                         }
5925                         l++;
5926                     }
5927                 }
5928                 locinput = l;
5929             }
5930             else {
5931                 /* The target and the pattern have the same utf8ness. */
5932                 /* Inline the first character, for speed. */
5933                 if (reginfo->strend - locinput < ln
5934                     || UCHARAT(s) != nextchr
5935                     || (ln > 1 && memNE(s, locinput, ln)))
5936                 {
5937                     sayNO;
5938                 }
5939                 locinput += ln;
5940             }
5941             break;
5942             }
5943
5944         case EXACTFL: {          /*  /abc/il      */
5945             re_fold_t folder;
5946             const U8 * fold_array;
5947             const char * s;
5948             U32 fold_utf8_flags;
5949
5950             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5951             folder = foldEQ_locale;
5952             fold_array = PL_fold_locale;
5953             fold_utf8_flags = FOLDEQ_LOCALE;
5954             goto do_exactf;
5955
5956         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5957                                       is effectively /u; hence to match, target
5958                                       must be UTF-8. */
5959             if (! utf8_target) {
5960                 sayNO;
5961             }
5962             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5963                                              | FOLDEQ_S1_FOLDS_SANE;
5964             folder = foldEQ_latin1;
5965             fold_array = PL_fold_latin1;
5966             goto do_exactf;
5967
5968         case EXACTFU_SS:         /*  /\x{df}/iu   */
5969         case EXACTFU:            /*  /abc/iu      */
5970             folder = foldEQ_latin1;
5971             fold_array = PL_fold_latin1;
5972             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5973             goto do_exactf;
5974
5975         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5976                                    patterns */
5977             assert(! is_utf8_pat);
5978             /* FALLTHROUGH */
5979         case EXACTFA:            /*  /abc/iaa     */
5980             folder = foldEQ_latin1;
5981             fold_array = PL_fold_latin1;
5982             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5983             goto do_exactf;
5984
5985         case EXACTF:             /*  /abc/i    This node only generated for
5986                                                non-utf8 patterns */
5987             assert(! is_utf8_pat);
5988             folder = foldEQ;
5989             fold_array = PL_fold;
5990             fold_utf8_flags = 0;
5991
5992           do_exactf:
5993             s = STRING(scan);
5994             ln = STR_LEN(scan);
5995
5996             if (utf8_target
5997                 || is_utf8_pat
5998                 || state_num == EXACTFU_SS
5999                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6000             {
6001               /* Either target or the pattern are utf8, or has the issue where
6002                * the fold lengths may differ. */
6003                 const char * const l = locinput;
6004                 char *e = reginfo->strend;
6005
6006                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6007                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6008                 {
6009                     sayNO;
6010                 }
6011                 locinput = e;
6012                 break;
6013             }
6014
6015             /* Neither the target nor the pattern are utf8 */
6016             if (UCHARAT(s) != nextchr
6017                 && !NEXTCHR_IS_EOS
6018                 && UCHARAT(s) != fold_array[nextchr])
6019             {
6020                 sayNO;
6021             }
6022             if (reginfo->strend - locinput < ln)
6023                 sayNO;
6024             if (ln > 1 && ! folder(s, locinput, ln))
6025                 sayNO;
6026             locinput += ln;
6027             break;
6028         }
6029
6030         case NBOUNDL: /*  /\B/l  */
6031             to_complement = 1;
6032             /* FALLTHROUGH */
6033
6034         case BOUNDL:  /*  /\b/l  */
6035         {
6036             bool b1, b2;
6037             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6038
6039             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6040                 if (! IN_UTF8_CTYPE_LOCALE) {
6041                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6042                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6043                 }
6044                 goto boundu;
6045             }
6046
6047             if (utf8_target) {
6048                 if (locinput == reginfo->strbeg)
6049                     b1 = isWORDCHAR_LC('\n');
6050                 else {
6051                     b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
6052                                                         (U8*)(reginfo->strbeg)));
6053                 }
6054                 b2 = (NEXTCHR_IS_EOS)
6055                     ? isWORDCHAR_LC('\n')
6056                     : isWORDCHAR_LC_utf8((U8*)locinput);
6057             }
6058             else { /* Here the string isn't utf8 */
6059                 b1 = (locinput == reginfo->strbeg)
6060                      ? isWORDCHAR_LC('\n')
6061                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6062                 b2 = (NEXTCHR_IS_EOS)
6063                     ? isWORDCHAR_LC('\n')
6064                     : isWORDCHAR_LC(nextchr);
6065             }
6066             if (to_complement ^ (b1 == b2)) {
6067                 sayNO;
6068             }
6069             break;
6070         }
6071
6072         case NBOUND:  /*  /\B/   */
6073             to_complement = 1;
6074             /* FALLTHROUGH */
6075
6076         case BOUND:   /*  /\b/   */
6077             if (utf8_target) {
6078                 goto bound_utf8;
6079             }
6080             goto bound_ascii_match_only;
6081
6082         case NBOUNDA: /*  /\B/a  */
6083             to_complement = 1;
6084             /* FALLTHROUGH */
6085
6086         case BOUNDA:  /*  /\b/a  */
6087         {
6088             bool b1, b2;
6089
6090           bound_ascii_match_only:
6091             /* Here the string isn't utf8, or is utf8 and only ascii characters
6092              * are to match \w.  In the latter case looking at the byte just
6093              * prior to the current one may be just the final byte of a
6094              * multi-byte character.  This is ok.  There are two cases:
6095              * 1) it is a single byte character, and then the test is doing
6096              *    just what it's supposed to.
6097              * 2) it is a multi-byte character, in which case the final byte is
6098              *    never mistakable for ASCII, and so the test will say it is
6099              *    not a word character, which is the correct answer. */
6100             b1 = (locinput == reginfo->strbeg)
6101                  ? isWORDCHAR_A('\n')
6102                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6103             b2 = (NEXTCHR_IS_EOS)
6104                 ? isWORDCHAR_A('\n')
6105                 : isWORDCHAR_A(nextchr);
6106             if (to_complement ^ (b1 == b2)) {
6107                 sayNO;
6108             }
6109             break;
6110         }
6111
6112         case NBOUNDU: /*  /\B/u  */
6113             to_complement = 1;
6114             /* FALLTHROUGH */
6115
6116         case BOUNDU:  /*  /\b/u  */
6117
6118           boundu:
6119             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6120                 match = FALSE;
6121             }
6122             else if (utf8_target) {
6123               bound_utf8:
6124                 switch((bound_type) FLAGS(scan)) {
6125                     case TRADITIONAL_BOUND:
6126                     {
6127                         bool b1, b2;
6128                         b1 = (locinput == reginfo->strbeg)
6129                              ? 0 /* isWORDCHAR_L1('\n') */
6130                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
6131                                                        (U8*)(reginfo->strbeg)));
6132                         b2 = (NEXTCHR_IS_EOS)
6133                             ? 0 /* isWORDCHAR_L1('\n') */
6134                             : isWORDCHAR_utf8((U8*)locinput);
6135                         match = cBOOL(b1 != b2);
6136                         break;
6137                     }
6138                     case GCB_BOUND:
6139                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6140                             match = TRUE; /* GCB always matches at begin and
6141                                              end */
6142                         }
6143                         else {
6144                             /* Find the gcb values of previous and current
6145                              * chars, then see if is a break point */
6146                             match = isGCB(getGCB_VAL_UTF8(
6147                                                 reghop3((U8*)locinput,
6148                                                         -1,
6149                                                         (U8*)(reginfo->strbeg)),
6150                                                 (U8*) reginfo->strend),
6151                                           getGCB_VAL_UTF8((U8*) locinput,
6152                                                         (U8*) reginfo->strend),
6153                                           (U8*) reginfo->strbeg,
6154                                           (U8*) locinput,
6155                                           utf8_target);
6156                         }
6157                         break;
6158
6159                     case LB_BOUND:
6160                         if (locinput == reginfo->strbeg) {
6161                             match = FALSE;
6162                         }
6163                         else if (NEXTCHR_IS_EOS) {
6164                             match = TRUE;
6165                         }
6166                         else {
6167                             match = isLB(getLB_VAL_UTF8(
6168                                                 reghop3((U8*)locinput,
6169                                                         -1,
6170                                                         (U8*)(reginfo->strbeg)),
6171                                                 (U8*) reginfo->strend),
6172                                           getLB_VAL_UTF8((U8*) locinput,
6173                                                         (U8*) reginfo->strend),
6174                                           (U8*) reginfo->strbeg,
6175                                           (U8*) locinput,
6176                                           (U8*) reginfo->strend,
6177                                           utf8_target);
6178                         }
6179                         break;
6180
6181                     case SB_BOUND: /* Always matches at begin and end */
6182                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6183                             match = TRUE;
6184                         }
6185                         else {
6186                             match = isSB(getSB_VAL_UTF8(
6187                                                 reghop3((U8*)locinput,
6188                                                         -1,
6189                                                         (U8*)(reginfo->strbeg)),
6190                                                 (U8*) reginfo->strend),
6191                                           getSB_VAL_UTF8((U8*) locinput,
6192                                                         (U8*) reginfo->strend),
6193                                           (U8*) reginfo->strbeg,
6194                                           (U8*) locinput,
6195                                           (U8*) reginfo->strend,
6196                                           utf8_target);
6197                         }
6198                         break;
6199
6200                     case WB_BOUND:
6201                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6202                             match = TRUE;
6203                         }
6204                         else {
6205                             match = isWB(WB_UNKNOWN,
6206                                          getWB_VAL_UTF8(
6207                                                 reghop3((U8*)locinput,
6208                                                         -1,
6209                                                         (U8*)(reginfo->strbeg)),
6210                                                 (U8*) reginfo->strend),
6211                                           getWB_VAL_UTF8((U8*) locinput,
6212                                                         (U8*) reginfo->strend),
6213                                           (U8*) reginfo->strbeg,
6214                                           (U8*) locinput,
6215                                           (U8*) reginfo->strend,
6216                                           utf8_target);
6217                         }
6218                         break;
6219                 }
6220             }
6221             else {  /* Not utf8 target */
6222                 switch((bound_type) FLAGS(scan)) {
6223                     case TRADITIONAL_BOUND:
6224                     {
6225                         bool b1, b2;
6226                         b1 = (locinput == reginfo->strbeg)
6227                             ? 0 /* isWORDCHAR_L1('\n') */
6228                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6229                         b2 = (NEXTCHR_IS_EOS)
6230                             ? 0 /* isWORDCHAR_L1('\n') */
6231                             : isWORDCHAR_L1(nextchr);
6232                         match = cBOOL(b1 != b2);
6233                         break;
6234                     }
6235
6236                     case GCB_BOUND:
6237                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6238                             match = TRUE; /* GCB always matches at begin and
6239                                              end */
6240                         }
6241                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6242                                    range */
6243                             match =    UCHARAT(locinput - 1) != '\r'
6244                                     || UCHARAT(locinput) != '\n';
6245                         }
6246                         break;
6247
6248                     case LB_BOUND:
6249                         if (locinput == reginfo->strbeg) {
6250                             match = FALSE;
6251                         }
6252                         else if (NEXTCHR_IS_EOS) {
6253                             match = TRUE;
6254                         }
6255                         else {
6256                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6257                                          getLB_VAL_CP(UCHARAT(locinput)),
6258                                          (U8*) reginfo->strbeg,
6259                                          (U8*) locinput,
6260                                          (U8*) reginfo->strend,
6261                                          utf8_target);
6262                         }
6263                         break;
6264
6265                     case SB_BOUND: /* Always matches at begin and end */
6266                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6267                             match = TRUE;
6268                         }
6269                         else {
6270                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6271                                          getSB_VAL_CP(UCHARAT(locinput)),
6272                                          (U8*) reginfo->strbeg,
6273                                          (U8*) locinput,
6274                                          (U8*) reginfo->strend,
6275                                          utf8_target);
6276                         }
6277                         break;
6278
6279                     case WB_BOUND:
6280                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6281                             match = TRUE;
6282                         }
6283                         else {
6284                             match = isWB(WB_UNKNOWN,
6285                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6286                                          getWB_VAL_CP(UCHARAT(locinput)),
6287                                          (U8*) reginfo->strbeg,
6288                                          (U8*) locinput,
6289                                          (U8*) reginfo->strend,
6290                                          utf8_target);
6291                         }
6292                         break;
6293                 }
6294             }
6295
6296             if (to_complement ^ ! match) {
6297                 sayNO;
6298             }
6299             break;
6300
6301         case ANYOFL:  /*  /[abc]/l      */
6302             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6303
6304             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6305             {
6306               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6307             }
6308             /* FALLTHROUGH */
6309         case ANYOFD:  /*   /[abc]/d       */
6310         case ANYOF:  /*   /[abc]/       */
6311             if (NEXTCHR_IS_EOS)
6312                 sayNO;
6313             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6314                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6315                                                                    utf8_target))
6316                     sayNO;
6317                 locinput += UTF8SKIP(locinput);
6318             }
6319             else {
6320                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6321                     sayNO;
6322                 locinput++;
6323             }
6324             break;
6325
6326         /* The argument (FLAGS) to all the POSIX node types is the class number
6327          * */
6328
6329         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6330             to_complement = 1;
6331             /* FALLTHROUGH */
6332
6333         case POSIXL:    /* \w or [:punct:] etc. under /l */
6334             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6335             if (NEXTCHR_IS_EOS)
6336                 sayNO;
6337
6338             /* Use isFOO_lc() for characters within Latin1.  (Note that
6339              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6340              * wouldn't be invariant) */
6341             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6342                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6343                     sayNO;
6344                 }
6345
6346                 locinput++;
6347                 break;
6348             }
6349
6350             if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
6351                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6352                 goto utf8_posix_above_latin1;
6353             }
6354
6355             /* Here is a UTF-8 variant code point below 256 and the target is
6356              * UTF-8 */
6357             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6358                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6359                                             *(locinput + 1))))))
6360             {
6361                 sayNO;
6362             }
6363
6364             goto increment_locinput;
6365
6366         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6367             to_complement = 1;
6368             /* FALLTHROUGH */
6369
6370         case POSIXD:    /* \w or [:punct:] etc. under /d */
6371             if (utf8_target) {
6372                 goto utf8_posix;
6373             }
6374             goto posixa;
6375
6376         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6377
6378             if (NEXTCHR_IS_EOS) {
6379                 sayNO;
6380             }
6381
6382             /* All UTF-8 variants match */
6383             if (! UTF8_IS_INVARIANT(nextchr)) {
6384                 goto increment_locinput;
6385             }
6386
6387             to_complement = 1;
6388             goto join_nposixa;
6389
6390         case POSIXA:    /* \w or [:punct:] etc. under /a */
6391
6392           posixa:
6393             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6394              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6395              * character is a single byte */
6396
6397             if (NEXTCHR_IS_EOS) {
6398                 sayNO;
6399             }
6400
6401           join_nposixa:
6402
6403             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6404                                                                 FLAGS(scan)))))
6405             {
6406                 sayNO;
6407             }
6408
6409             /* Here we are either not in utf8, or we matched a utf8-invariant,
6410              * so the next char is the next byte */
6411             locinput++;
6412             break;
6413
6414         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6415             to_complement = 1;
6416             /* FALLTHROUGH */
6417
6418         case POSIXU:    /* \w or [:punct:] etc. under /u */
6419           utf8_posix:
6420             if (NEXTCHR_IS_EOS) {
6421                 sayNO;
6422             }
6423
6424             /* Use _generic_isCC() for characters within Latin1.  (Note that
6425              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6426              * wouldn't be invariant) */
6427             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6428                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6429                                                            FLAGS(scan)))))
6430                 {
6431                     sayNO;
6432                 }
6433                 locinput++;
6434             }
6435             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
6436                 if (! (to_complement
6437                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6438                                                                *(locinput + 1)),
6439                                              FLAGS(scan)))))
6440                 {
6441                     sayNO;
6442                 }
6443                 locinput += 2;
6444             }
6445             else {  /* Handle above Latin-1 code points */
6446               utf8_posix_above_latin1:
6447                 classnum = (_char_class_number) FLAGS(scan);
6448                 if (classnum < _FIRST_NON_SWASH_CC) {
6449
6450                     /* Here, uses a swash to find such code points.  Load if if
6451                      * not done already */
6452                     if (! PL_utf8_swash_ptrs[classnum]) {
6453                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6454                         PL_utf8_swash_ptrs[classnum]
6455                                 = _core_swash_init("utf8",
6456                                         "",
6457                                         &PL_sv_undef, 1, 0,
6458                                         PL_XPosix_ptrs[classnum], &flags);
6459                     }
6460                     if (! (to_complement
6461                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6462                                                (U8 *) locinput, TRUE))))
6463                     {
6464                         sayNO;
6465                     }
6466                 }
6467                 else {  /* Here, uses macros to find above Latin-1 code points */
6468                     switch (classnum) {
6469                         case _CC_ENUM_SPACE:
6470                             if (! (to_complement
6471                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6472                             {
6473                                 sayNO;
6474                             }
6475                             break;
6476                         case _CC_ENUM_BLANK:
6477                             if (! (to_complement
6478                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6479                             {
6480                                 sayNO;
6481                             }
6482                             break;
6483                         case _CC_ENUM_XDIGIT:
6484                             if (! (to_complement
6485                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6486                             {
6487                                 sayNO;
6488                             }
6489                             break;
6490                         case _CC_ENUM_VERTSPACE:
6491                             if (! (to_complement
6492                                             ^ cBOOL(is_VERTWS_high(locinput))))
6493                             {
6494                                 sayNO;
6495                             }
6496                             break;
6497                         default:    /* The rest, e.g. [:cntrl:], can't match
6498                                        above Latin1 */
6499                             if (! to_complement) {
6500                                 sayNO;
6501                             }
6502                             break;
6503                     }
6504                 }
6505                 locinput += UTF8SKIP(locinput);
6506             }
6507             break;
6508
6509         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6510                        a Unicode extended Grapheme Cluster */
6511             if (NEXTCHR_IS_EOS)
6512                 sayNO;
6513             if  (! utf8_target) {
6514
6515                 /* Match either CR LF  or '.', as all the other possibilities
6516                  * require utf8 */
6517                 locinput++;         /* Match the . or CR */
6518                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6519                                        match the LF */
6520                     && locinput < reginfo->strend
6521                     && UCHARAT(locinput) == '\n')
6522                 {
6523                     locinput++;
6524                 }
6525             }
6526             else {
6527
6528                 /* Get the gcb type for the current character */
6529                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6530                                                        (U8*) reginfo->strend);
6531
6532                 /* Then scan through the input until we get to the first
6533                  * character whose type is supposed to be a gcb with the
6534                  * current character.  (There is always a break at the
6535                  * end-of-input) */
6536                 locinput += UTF8SKIP(locinput);
6537                 while (locinput < reginfo->strend) {
6538                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6539                                                          (U8*) reginfo->strend);
6540                     if (isGCB(prev_gcb, cur_gcb,
6541                               (U8*) reginfo->strbeg, (U8*) locinput,
6542                               utf8_target))
6543                     {
6544                         break;
6545                     }
6546
6547                     prev_gcb = cur_gcb;
6548                     locinput += UTF8SKIP(locinput);
6549                 }
6550
6551
6552             }
6553             break;
6554
6555         case NREFFL:  /*  /\g{name}/il  */
6556         {   /* The capture buffer cases.  The ones beginning with N for the
6557                named buffers just convert to the equivalent numbered and
6558                pretend they were called as the corresponding numbered buffer
6559                op.  */
6560             /* don't initialize these in the declaration, it makes C++
6561                unhappy */
6562             const char *s;
6563             char type;
6564             re_fold_t folder;
6565             const U8 *fold_array;
6566             UV utf8_fold_flags;
6567
6568             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6569             folder = foldEQ_locale;
6570             fold_array = PL_fold_locale;
6571             type = REFFL;
6572             utf8_fold_flags = FOLDEQ_LOCALE;
6573             goto do_nref;
6574
6575         case NREFFA:  /*  /\g{name}/iaa  */
6576             folder = foldEQ_latin1;
6577             fold_array = PL_fold_latin1;
6578             type = REFFA;
6579             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6580             goto do_nref;
6581
6582         case NREFFU:  /*  /\g{name}/iu  */
6583             folder = foldEQ_latin1;
6584             fold_array = PL_fold_latin1;
6585             type = REFFU;
6586             utf8_fold_flags = 0;
6587             goto do_nref;
6588
6589         case NREFF:  /*  /\g{name}/i  */
6590             folder = foldEQ;
6591             fold_array = PL_fold;
6592             type = REFF;
6593             utf8_fold_flags = 0;
6594             goto do_nref;
6595
6596         case NREF:  /*  /\g{name}/   */
6597             type = REF;
6598             folder = NULL;
6599             fold_array = NULL;
6600             utf8_fold_flags = 0;
6601           do_nref:
6602
6603             /* For the named back references, find the corresponding buffer
6604              * number */
6605             n = reg_check_named_buff_matched(rex,scan);
6606
6607             if ( ! n ) {
6608                 sayNO;
6609             }
6610             goto do_nref_ref_common;
6611
6612         case REFFL:  /*  /\1/il  */
6613             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6614             folder = foldEQ_locale;
6615             fold_array = PL_fold_locale;
6616             utf8_fold_flags = FOLDEQ_LOCALE;
6617             goto do_ref;
6618
6619         case REFFA:  /*  /\1/iaa  */
6620             folder = foldEQ_latin1;
6621             fold_array = PL_fold_latin1;
6622             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6623             goto do_ref;
6624
6625         case REFFU:  /*  /\1/iu  */
6626             folder = foldEQ_latin1;
6627             fold_array = PL_fold_latin1;
6628             utf8_fold_flags = 0;
6629             goto do_ref;
6630
6631         case REFF:  /*  /\1/i  */
6632             folder = foldEQ;
6633             fold_array = PL_fold;
6634             utf8_fold_flags = 0;
6635             goto do_ref;
6636
6637         case REF:  /*  /\1/    */
6638             folder = NULL;
6639             fold_array = NULL;
6640             utf8_fold_flags = 0;
6641
6642           do_ref:
6643             type = OP(scan);
6644             n = ARG(scan);  /* which paren pair */
6645
6646           do_nref_ref_common:
6647             ln = rex->offs[n].start;
6648             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6649             if (rex->lastparen < n || ln == -1)
6650                 sayNO;                  /* Do not match unless seen CLOSEn. */
6651             if (ln == rex->offs[n].end)
6652                 break;
6653
6654             s = reginfo->strbeg + ln;
6655             if (type != REF     /* REF can do byte comparison */
6656                 && (utf8_target || type == REFFU || type == REFFL))
6657             {
6658                 char * limit = reginfo->strend;
6659
6660                 /* This call case insensitively compares the entire buffer
6661                     * at s, with the current input starting at locinput, but
6662                     * not going off the end given by reginfo->strend, and
6663                     * returns in <limit> upon success, how much of the
6664                     * current input was matched */
6665                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6666                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6667                 {
6668                     sayNO;
6669                 }
6670                 locinput = limit;
6671                 break;
6672             }
6673
6674             /* Not utf8:  Inline the first character, for speed. */
6675             if (!NEXTCHR_IS_EOS &&
6676                 UCHARAT(s) != nextchr &&
6677                 (type == REF ||
6678                  UCHARAT(s) != fold_array[nextchr]))
6679                 sayNO;
6680             ln = rex->offs[n].end - ln;
6681             if (locinput + ln > reginfo->strend)
6682                 sayNO;
6683             if (ln > 1 && (type == REF
6684                            ? memNE(s, locinput, ln)
6685                            : ! folder(s, locinput, ln)))
6686                 sayNO;
6687             locinput += ln;
6688             break;
6689         }
6690
6691         case NOTHING: /* null op; e.g. the 'nothing' following
6692                        * the '*' in m{(a+|b)*}' */
6693             break;
6694         case TAIL: /* placeholder while compiling (A|B|C) */
6695             break;
6696
6697 #undef  ST
6698 #define ST st->u.eval
6699 #define CUR_EVAL cur_eval->u.eval
6700
6701         {
6702             SV *ret;
6703             REGEXP *re_sv;
6704             regexp *re;
6705             regexp_internal *rei;
6706             regnode *startpoint;
6707             U32 arg;
6708
6709         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6710             arg= (U32)ARG(scan);
6711             if (cur_eval && cur_eval->locinput == locinput) {
6712                 if ( ++nochange_depth > max_nochange_depth )
6713                     Perl_croak(aTHX_
6714                         "Pattern subroutine nesting without pos change"
6715                         " exceeded limit in regex");
6716             } else {
6717                 nochange_depth = 0;
6718             }
6719             re_sv = rex_sv;
6720             re = rex;
6721             rei = rexi;
6722             startpoint = scan + ARG2L(scan);
6723             EVAL_CLOSE_PAREN_SET( st, arg );
6724             /* Detect infinite recursion
6725              *
6726              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6727              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6728              * So we track the position in the string we are at each time
6729              * we recurse and if we try to enter the same routine twice from
6730              * the same position we throw an error.
6731              */
6732             if ( rex->recurse_locinput[arg] == locinput ) {
6733                 /* FIXME: we should show the regop that is failing as part
6734                  * of the error message. */
6735                 Perl_croak(aTHX_ "Infinite recursion in regex");
6736             } else {
6737                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6738                 rex->recurse_locinput[arg]= locinput;
6739
6740                 DEBUG_r({
6741                     GET_RE_DEBUG_FLAGS_DECL;
6742                     DEBUG_STACK_r({
6743                         Perl_re_exec_indentf( aTHX_
6744                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6745                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6746                         );
6747                     });
6748                 });
6749             }
6750
6751             /* Save all the positions seen so far. */
6752             ST.cp = regcppush(rex, 0, maxopenparen);
6753             REGCP_SET(ST.lastcp);
6754
6755             /* and then jump to the code we share with EVAL */
6756             goto eval_recurse_doit;
6757             /* NOTREACHED */
6758
6759         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6760             if (cur_eval && cur_eval->locinput==locinput) {
6761                 if ( ++nochange_depth > max_nochange_depth )
6762                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6763             } else {
6764                 nochange_depth = 0;
6765             }
6766             {
6767                 /* execute the code in the {...} */
6768
6769                 dSP;
6770                 IV before;
6771                 OP * const oop = PL_op;
6772                 COP * const ocurcop = PL_curcop;
6773                 OP *nop;
6774                 CV *newcv;
6775
6776                 /* save *all* paren positions */
6777                 regcppush(rex, 0, maxopenparen);
6778                 REGCP_SET(runops_cp);
6779
6780                 if (!caller_cv)
6781                     caller_cv = find_runcv(NULL);
6782
6783                 n = ARG(scan);
6784
6785                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6786                     newcv = (ReANY(
6787                                     (REGEXP*)(rexi->data->data[n])
6788                             ))->qr_anoncv;
6789                     nop = (OP*)rexi->data->data[n+1];
6790                 }
6791                 else if (rexi->data->what[n] == 'l') { /* literal code */
6792                     newcv = caller_cv;
6793                     nop = (OP*)rexi->data->data[n];
6794                     assert(CvDEPTH(newcv));
6795                 }
6796                 else {
6797                     /* literal with own CV */
6798                     assert(rexi->data->what[n] == 'L');
6799                     newcv = rex->qr_anoncv;
6800                     nop = (OP*)rexi->data->data[n];
6801                 }
6802
6803                 /* normally if we're about to execute code from the same
6804                  * CV that we used previously, we just use the existing
6805                  * CX stack entry. However, its possible that in the
6806                  * meantime we may have backtracked, popped from the save
6807                  * stack, and undone the SAVECOMPPAD(s) associated with
6808                  * PUSH_MULTICALL; in which case PL_comppad no longer
6809                  * points to newcv's pad. */
6810                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6811                 {
6812                     U8 flags = (CXp_SUB_RE |
6813                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6814                     if (last_pushed_cv) {
6815                         /* PUSH/POP_MULTICALL save and restore the
6816                          * caller's PL_comppad; if we call multiple subs
6817                          * using the same CX block, we have to save and
6818                          * unwind the varying PL_comppad's ourselves,
6819                          * especially restoring the right PL_comppad on
6820                          * backtrack - so save it on the save stack */
6821                         SAVECOMPPAD();
6822                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6823                     }
6824                     else {
6825                         PUSH_MULTICALL_FLAGS(newcv, flags);
6826                     }
6827                     last_pushed_cv = newcv;
6828                 }
6829                 else {
6830                     /* these assignments are just to silence compiler
6831                      * warnings */
6832                     multicall_cop = NULL;
6833                 }
6834                 last_pad = PL_comppad;
6835
6836                 /* the initial nextstate you would normally execute
6837                  * at the start of an eval (which would cause error
6838                  * messages to come from the eval), may be optimised
6839                  * away from the execution path in the regex code blocks;
6840                  * so manually set PL_curcop to it initially */
6841                 {
6842                     OP *o = cUNOPx(nop)->op_first;
6843                     assert(o->op_type == OP_NULL);
6844                     if (o->op_targ == OP_SCOPE) {
6845                         o = cUNOPo->op_first;
6846                     }
6847                     else {
6848                         assert(o->op_targ == OP_LEAVE);
6849                         o = cUNOPo->op_first;
6850                         assert(o->op_type == OP_ENTER);
6851                         o = OpSIBLING(o);
6852                     }
6853
6854                     if (o->op_type != OP_STUB) {
6855                         assert(    o->op_type == OP_NEXTSTATE
6856                                 || o->op_type == OP_DBSTATE
6857                                 || (o->op_type == OP_NULL
6858                                     &&  (  o->op_targ == OP_NEXTSTATE
6859                                         || o->op_targ == OP_DBSTATE
6860                                         )
6861                                     )
6862                         );
6863                         PL_curcop = (COP*)o;
6864                     }
6865                 }
6866                 nop = nop->op_next;
6867
6868                 DEBUG_STATE_r( Perl_re_printf( aTHX_
6869                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6870
6871                 rex->offs[0].end = locinput - reginfo->strbeg;
6872                 if (reginfo->info_aux_eval->pos_magic)
6873                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6874                                   reginfo->sv, reginfo->strbeg,
6875                                   locinput - reginfo->strbeg);
6876
6877                 if (sv_yes_mark) {
6878                     SV *sv_mrk = get_sv("REGMARK", 1);
6879                     sv_setsv(sv_mrk, sv_yes_mark);
6880                 }
6881
6882                 /* we don't use MULTICALL here as we want to call the
6883                  * first op of the block of interest, rather than the
6884                  * first op of the sub. Also, we don't want to free
6885                  * the savestack frame */
6886                 before = (IV)(SP-PL_stack_base);
6887                 PL_op = nop;
6888                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6889                 SPAGAIN;
6890                 if ((IV)(SP-PL_stack_base) == before)
6891                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6892                 else {
6893                     ret = POPs;
6894                     PUTBACK;
6895                 }
6896
6897                 /* before restoring everything, evaluate the returned
6898                  * value, so that 'uninit' warnings don't use the wrong
6899                  * PL_op or pad. Also need to process any magic vars
6900                  * (e.g. $1) *before* parentheses are restored */
6901
6902                 PL_op = NULL;
6903
6904                 re_sv = NULL;
6905                 if (logical == 0)        /*   (?{})/   */
6906                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6907                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6908                     sw = cBOOL(SvTRUE(ret));
6909                     logical = 0;
6910                 }
6911                 else {                   /*  /(??{})  */
6912                     /*  if its overloaded, let the regex compiler handle
6913                      *  it; otherwise extract regex, or stringify  */
6914                     if (SvGMAGICAL(ret))
6915                         ret = sv_mortalcopy(ret);
6916                     if (!SvAMAGIC(ret)) {
6917                         SV *sv = ret;
6918                         if (SvROK(sv))
6919                             sv = SvRV(sv);
6920                         if (SvTYPE(sv) == SVt_REGEXP)
6921                             re_sv = (REGEXP*) sv;
6922                         else if (SvSMAGICAL(ret)) {
6923                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6924                             if (mg)
6925                                 re_sv = (REGEXP *) mg->mg_obj;
6926                         }
6927
6928                         /* force any undef warnings here */
6929                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6930                             ret = sv_mortalcopy(ret);
6931                             (void) SvPV_force_nolen(ret);
6932                         }
6933                     }
6934
6935                 }
6936
6937                 /* *** Note that at this point we don't restore
6938                  * PL_comppad, (or pop the CxSUB) on the assumption it may
6939                  * be used again soon. This is safe as long as nothing
6940                  * in the regexp code uses the pad ! */
6941                 PL_op = oop;
6942                 PL_curcop = ocurcop;
6943                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6944                 PL_curpm = PL_reg_curpm;
6945
6946                 if (logical != 2)
6947                     break;
6948             }
6949
6950                 /* only /(??{})/  from now on */
6951                 logical = 0;
6952                 {
6953                     /* extract RE object from returned value; compiling if
6954                      * necessary */
6955
6956                     if (re_sv) {
6957                         re_sv = reg_temp_copy(NULL, re_sv);
6958                     }
6959                     else {
6960                         U32 pm_flags = 0;
6961
6962                         if (SvUTF8(ret) && IN_BYTES) {
6963                             /* In use 'bytes': make a copy of the octet
6964                              * sequence, but without the flag on */
6965                             STRLEN len;
6966                             const char *const p = SvPV(ret, len);
6967                             ret = newSVpvn_flags(p, len, SVs_TEMP);
6968                         }
6969                         if (rex->intflags & PREGf_USE_RE_EVAL)
6970                             pm_flags |= PMf_USE_RE_EVAL;
6971
6972                         /* if we got here, it should be an engine which
6973                          * supports compiling code blocks and stuff */
6974                         assert(rex->engine && rex->engine->op_comp);
6975                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6976                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6977                                     rex->engine, NULL, NULL,
6978                                     /* copy /msixn etc to inner pattern */
6979                                     ARG2L(scan),
6980                                     pm_flags);
6981
6982                         if (!(SvFLAGS(ret)
6983                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
6984                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6985                             /* This isn't a first class regexp. Instead, it's
6986                                caching a regexp onto an existing, Perl visible
6987                                scalar.  */
6988                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6989                         }
6990                     }
6991                     SAVEFREESV(re_sv);
6992                     re = ReANY(re_sv);
6993                 }
6994                 RXp_MATCH_COPIED_off(re);
6995                 re->subbeg = rex->subbeg;
6996                 re->sublen = rex->sublen;
6997                 re->suboffset = rex->suboffset;
6998                 re->subcoffset = rex->subcoffset;
6999                 re->lastparen = 0;
7000                 re->lastcloseparen = 0;
7001                 rei = RXi_GET(re);
7002                 DEBUG_EXECUTE_r(
7003                     debug_start_match(re_sv, utf8_target, locinput,
7004                                     reginfo->strend, "Matching embedded");
7005                 );
7006                 startpoint = rei->program + 1;
7007                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7008                                              * close_paren only for GOSUB */
7009                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7010                 /* Save all the seen positions so far. */
7011                 ST.cp = regcppush(rex, 0, maxopenparen);
7012                 REGCP_SET(ST.lastcp);
7013                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7014                 maxopenparen = 0;
7015                 /* run the pattern returned from (??{...}) */
7016
7017               eval_recurse_doit: /* Share code with GOSUB below this line
7018                             * At this point we expect the stack context to be
7019                             * set up correctly */
7020
7021                 /* invalidate the S-L poscache. We're now executing a
7022                  * different set of WHILEM ops (and their associated
7023                  * indexes) against the same string, so the bits in the
7024                  * cache are meaningless. Setting maxiter to zero forces
7025                  * the cache to be invalidated and zeroed before reuse.
7026                  * XXX This is too dramatic a measure. Ideally we should
7027                  * save the old cache and restore when running the outer
7028                  * pattern again */
7029                 reginfo->poscache_maxiter = 0;
7030
7031                 /* the new regexp might have a different is_utf8_pat than we do */
7032                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7033
7034                 ST.prev_rex = rex_sv;
7035                 ST.prev_curlyx = cur_curlyx;
7036                 rex_sv = re_sv;
7037                 SET_reg_curpm(rex_sv);
7038                 rex = re;
7039                 rexi = rei;
7040                 cur_curlyx = NULL;
7041                 ST.B = next;
7042                 ST.prev_eval = cur_eval;
7043                 cur_eval = st;
7044                 /* now continue from first node in postoned RE */
7045                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
7046                 NOT_REACHED; /* NOTREACHED */
7047         }
7048
7049         case EVAL_AB: /* cleanup after a successful (??{A})B */
7050             /* note: this is called twice; first after popping B, then A */
7051             DEBUG_STACK_r({
7052                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7053                     depth, cur_eval, ST.prev_eval);
7054             });
7055
7056 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7057             if ( cur_eval && CUR_EVAL.close_paren ) {\
7058                 DEBUG_STACK_r({ \
7059                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7060                         depth,    \
7061                         CUR_EVAL.close_paren - 1,\
7062                         cur_eval, \
7063                         VAL);     \
7064                 });               \
7065                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7066             }
7067
7068             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7069
7070             rex_sv = ST.prev_rex;
7071             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7072             SET_reg_curpm(rex_sv);
7073             rex = ReANY(rex_sv);
7074             rexi = RXi_GET(rex);
7075             {
7076                 /* preserve $^R across LEAVE's. See Bug 121070. */
7077                 SV *save_sv= GvSV(PL_replgv);
7078                 SvREFCNT_inc(save_sv);
7079                 regcpblow(ST.cp); /* LEAVE in disguise */
7080                 sv_setsv(GvSV(PL_replgv), save_sv);
7081                 SvREFCNT_dec(save_sv);
7082             }
7083             cur_eval = ST.prev_eval;
7084             cur_curlyx = ST.prev_curlyx;
7085
7086             /* Invalidate cache. See "invalidate" comment above. */
7087             reginfo->poscache_maxiter = 0;
7088             if ( nochange_depth )
7089                 nochange_depth--;
7090
7091             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7092             sayYES;
7093
7094
7095         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7096             /* note: this is called twice; first after popping B, then A */
7097             DEBUG_STACK_r({
7098                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7099                     depth, cur_eval, ST.prev_eval);
7100             });
7101
7102             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7103
7104             rex_sv = ST.prev_rex;
7105             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7106             SET_reg_curpm(rex_sv);
7107             rex = ReANY(rex_sv);
7108             rexi = RXi_GET(rex);
7109
7110             REGCP_UNWIND(ST.lastcp);
7111             regcppop(rex, &maxopenparen);
7112             cur_eval = ST.prev_eval;
7113             cur_curlyx = ST.prev_curlyx;
7114
7115             /* Invalidate cache. See "invalidate" comment above. */
7116             reginfo->poscache_maxiter = 0;
7117             if ( nochange_depth )
7118                 nochange_depth--;
7119
7120             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7121             sayNO_SILENT;
7122 #undef ST
7123
7124         case OPEN: /*  (  */
7125             n = ARG(scan);  /* which paren pair */
7126             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7127             if (n > maxopenparen)
7128                 maxopenparen = n;
7129             DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
7130                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
7131                 PTR2UV(rex),
7132                 PTR2UV(rex->offs),
7133                 (UV)n,
7134                 (IV)rex->offs[n].start_tmp,
7135                 (UV)maxopenparen
7136             ));
7137             lastopen = n;
7138             break;
7139
7140 /* XXX really need to log other places start/end are set too */
7141 #define CLOSE_CAPTURE                                                      \
7142     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7143     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7144     DEBUG_BUFFERS_r(Perl_re_printf( aTHX_                                              \
7145         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
7146         PTR2UV(rex),                                                       \
7147         PTR2UV(rex->offs),                                                 \
7148         (UV)n,                                                             \
7149         (IV)rex->offs[n].start,                                            \
7150         (IV)rex->offs[n].end                                               \
7151     ))
7152
7153         case CLOSE:  /*  )  */
7154             n = ARG(scan);  /* which paren pair */
7155             CLOSE_CAPTURE;
7156             if (n > rex->lastparen)
7157                 rex->lastparen = n;
7158             rex->lastcloseparen = n;
7159             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7160                 goto fake_end;
7161
7162             break;
7163
7164         case ACCEPT:  /*  (*ACCEPT)  */
7165             if (scan->flags)
7166                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7167             if (ARG2L(scan)){
7168                 regnode *cursor;
7169                 for (cursor=scan;
7170                      cursor && OP(cursor)!=END;
7171                      cursor=regnext(cursor))
7172                 {
7173                     if ( OP(cursor)==CLOSE ){
7174                         n = ARG(cursor);
7175                         if ( n <= lastopen ) {
7176                             CLOSE_CAPTURE;
7177                             if (n > rex->lastparen)
7178                                 rex->lastparen = n;
7179                             rex->lastcloseparen = n;
7180                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7181                                 break;
7182                         }
7183                     }
7184                 }
7185             }
7186             goto fake_end;
7187             /* NOTREACHED */
7188
7189         case GROUPP:  /*  (?(1))  */
7190             n = ARG(scan);  /* which paren pair */
7191             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7192             break;
7193
7194         case NGROUPP:  /*  (?(<name>))  */
7195             /* reg_check_named_buff_matched returns 0 for no match */
7196             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7197             break;
7198
7199         case INSUBP:   /*  (?(R))  */
7200             n = ARG(scan);
7201             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7202              * of SCAN is already set up as matches a eval.close_paren */
7203             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7204             break;
7205
7206         case DEFINEP:  /*  (?(DEFINE))  */
7207             sw = 0;
7208             break;
7209
7210         case IFTHEN:   /*  (?(cond)A|B)  */
7211             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7212             if (sw)
7213                 next = NEXTOPER(NEXTOPER(scan));
7214             else {
7215                 next = scan + ARG(scan);
7216                 if (OP(next) == IFTHEN) /* Fake one. */
7217                     next = NEXTOPER(NEXTOPER(next));
7218             }
7219             break;
7220
7221         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7222             logical = scan->flags;
7223             break;
7224
7225 /*******************************************************************
7226
7227 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7228 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7229 STAR/PLUS/CURLY/CURLYN are used instead.)
7230
7231 A*B is compiled as <CURLYX><A><WHILEM><B>
7232
7233 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7234 state, which contains the current count, initialised to -1. It also sets
7235 cur_curlyx to point to this state, with any previous value saved in the
7236 state block.
7237
7238 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7239 since the pattern may possibly match zero times (i.e. it's a while {} loop
7240 rather than a do {} while loop).
7241
7242 Each entry to WHILEM represents a successful match of A. The count in the
7243 CURLYX block is incremented, another WHILEM state is pushed, and execution
7244 passes to A or B depending on greediness and the current count.
7245
7246 For example, if matching against the string a1a2a3b (where the aN are
7247 substrings that match /A/), then the match progresses as follows: (the
7248 pushed states are interspersed with the bits of strings matched so far):
7249
7250     <CURLYX cnt=-1>
7251     <CURLYX cnt=0><WHILEM>
7252     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7253     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7254     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7255     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7256
7257 (Contrast this with something like CURLYM, which maintains only a single
7258 backtrack state:
7259
7260     <CURLYM cnt=0> a1
7261     a1 <CURLYM cnt=1> a2
7262     a1 a2 <CURLYM cnt=2> a3
7263     a1 a2 a3 <CURLYM cnt=3> b
7264 )
7265
7266 Each WHILEM state block marks a point to backtrack to upon partial failure
7267 of A or B, and also contains some minor state data related to that
7268 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7269 overall state, such as the count, and pointers to the A and B ops.
7270
7271 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7272 must always point to the *current* CURLYX block, the rules are:
7273
7274 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7275 and set cur_curlyx to point the new block.
7276
7277 When popping the CURLYX block after a successful or unsuccessful match,
7278 restore the previous cur_curlyx.
7279
7280 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7281 to the outer one saved in the CURLYX block.
7282
7283 When popping the WHILEM block after a successful or unsuccessful B match,
7284 restore the previous cur_curlyx.
7285
7286 Here's an example for the pattern (AI* BI)*BO
7287 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7288
7289 cur_
7290 curlyx backtrack stack
7291 ------ ---------------
7292 NULL
7293 CO     <CO prev=NULL> <WO>
7294 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7295 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7296 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7297
7298 At this point the pattern succeeds, and we work back down the stack to
7299 clean up, restoring as we go:
7300
7301 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7302 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7303 CO     <CO prev=NULL> <WO>
7304 NULL
7305
7306 *******************************************************************/
7307
7308 #define ST st->u.curlyx
7309
7310         case CURLYX:    /* start of /A*B/  (for complex A) */
7311         {
7312             /* No need to save/restore up to this paren */
7313             I32 parenfloor = scan->flags;
7314
7315             assert(next); /* keep Coverity happy */
7316             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7317                 next += ARG(next);
7318
7319             /* XXXX Probably it is better to teach regpush to support
7320                parenfloor > maxopenparen ... */
7321             if (parenfloor > (I32)rex->lastparen)
7322                 parenfloor = rex->lastparen; /* Pessimization... */
7323
7324             ST.prev_curlyx= cur_curlyx;
7325             cur_curlyx = st;
7326             ST.cp = PL_savestack_ix;
7327
7328             /* these fields contain the state of the current curly.
7329              * they are accessed by subsequent WHILEMs */
7330             ST.parenfloor = parenfloor;
7331             ST.me = scan;
7332             ST.B = next;
7333             ST.minmod = minmod;
7334             minmod = 0;
7335             ST.count = -1;      /* this will be updated by WHILEM */
7336             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7337
7338             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7339             NOT_REACHED; /* NOTREACHED */
7340         }
7341
7342         case CURLYX_end: /* just finished matching all of A*B */
7343             cur_curlyx = ST.prev_curlyx;
7344             sayYES;
7345             NOT_REACHED; /* NOTREACHED */
7346
7347         case CURLYX_end_fail: /* just failed to match all of A*B */
7348             regcpblow(ST.cp);
7349             cur_curlyx = ST.prev_curlyx;
7350             sayNO;
7351             NOT_REACHED; /* NOTREACHED */
7352
7353
7354 #undef ST
7355 #define ST st->u.whilem
7356
7357         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7358         {
7359             /* see the discussion above about CURLYX/WHILEM */
7360             I32 n;
7361             int min, max;
7362             regnode *A;
7363
7364             assert(cur_curlyx); /* keep Coverity happy */
7365
7366             min = ARG1(cur_curlyx->u.curlyx.me);
7367             max = ARG2(cur_curlyx->u.curlyx.me);
7368             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7369             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7370             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7371             ST.cache_offset = 0;
7372             ST.cache_mask = 0;
7373
7374
7375             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7376                   depth, (long)n, min, max)
7377             );
7378
7379             /* First just match a string of min A's. */
7380
7381             if (n < min) {
7382                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7383                                     maxopenparen);
7384                 cur_curlyx->u.curlyx.lastloc = locinput;
7385                 REGCP_SET(ST.lastcp);
7386
7387                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7388                 NOT_REACHED; /* NOTREACHED */
7389             }
7390
7391             /* If degenerate A matches "", assume A done. */
7392
7393             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7394                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7395                    depth)
7396                 );
7397                 goto do_whilem_B_max;
7398             }
7399
7400             /* super-linear cache processing.
7401              *
7402              * The idea here is that for certain types of CURLYX/WHILEM -
7403              * principally those whose upper bound is infinity (and
7404              * excluding regexes that have things like \1 and other very
7405              * non-regular expresssiony things), then if a pattern like
7406              * /....A*.../ fails and we backtrack to the WHILEM, then we
7407              * make a note that this particular WHILEM op was at string
7408              * position 47 (say) when the rest of pattern failed. Then, if
7409              * we ever find ourselves back at that WHILEM, and at string
7410              * position 47 again, we can just fail immediately rather than
7411              * running the rest of the pattern again.
7412              *
7413              * This is very handy when patterns start to go
7414              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7415              * with a combinatorial explosion of backtracking.
7416              *
7417              * The cache is implemented as a bit array, with one bit per
7418              * string byte position per WHILEM op (up to 16) - so its
7419              * between 0.25 and 2x the string size.
7420              *
7421              * To avoid allocating a poscache buffer every time, we do an
7422              * initially countdown; only after we have  executed a WHILEM
7423              * op (string-length x #WHILEMs) times do we allocate the
7424              * cache.
7425              *
7426              * The top 4 bits of scan->flags byte say how many different
7427              * relevant CURLLYX/WHILEM op pairs there are, while the
7428              * bottom 4-bits is the identifying index number of this
7429              * WHILEM.
7430              */
7431
7432             if (scan->flags) {
7433
7434                 if (!reginfo->poscache_maxiter) {
7435                     /* start the countdown: Postpone detection until we
7436                      * know the match is not *that* much linear. */
7437                     reginfo->poscache_maxiter
7438                         =    (reginfo->strend - reginfo->strbeg + 1)
7439                            * (scan->flags>>4);
7440                     /* possible overflow for long strings and many CURLYX's */
7441                     if (reginfo->poscache_maxiter < 0)
7442                         reginfo->poscache_maxiter = I32_MAX;
7443                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7444                 }
7445
7446                 if (reginfo->poscache_iter-- == 0) {
7447                     /* initialise cache */
7448                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7449                     regmatch_info_aux *const aux = reginfo->info_aux;
7450                     if (aux->poscache) {
7451                         if ((SSize_t)reginfo->poscache_size < size) {
7452                             Renew(aux->poscache, size, char);
7453                             reginfo->poscache_size = size;
7454                         }
7455                         Zero(aux->poscache, size, char);
7456                     }
7457                     else {
7458                         reginfo->poscache_size = size;
7459                         Newxz(aux->poscache, size, char);
7460                     }
7461                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7462       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7463                               PL_colors[4], PL_colors[5])
7464                     );
7465                 }
7466
7467                 if (reginfo->poscache_iter < 0) {
7468                     /* have we already failed at this position? */
7469                     SSize_t offset, mask;
7470
7471                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7472                     offset  = (scan->flags & 0xf) - 1
7473                                 +   (locinput - reginfo->strbeg)
7474                                   * (scan->flags>>4);
7475                     mask    = 1 << (offset % 8);
7476                     offset /= 8;
7477                     if (reginfo->info_aux->poscache[offset] & mask) {
7478                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
7479                             depth)
7480                         );
7481                         sayNO; /* cache records failure */
7482                     }
7483                     ST.cache_offset = offset;
7484                     ST.cache_mask   = mask;
7485                 }
7486             }
7487
7488             /* Prefer B over A for minimal matching. */
7489
7490             if (cur_curlyx->u.curlyx.minmod) {
7491                 ST.save_curlyx = cur_curlyx;
7492                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7493                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
7494                             maxopenparen);
7495                 REGCP_SET(ST.lastcp);
7496                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7497                                     locinput);
7498                 NOT_REACHED; /* NOTREACHED */
7499             }
7500
7501             /* Prefer A over B for maximal matching. */
7502
7503             if (n < max) { /* More greed allowed? */
7504                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7505                             maxopenparen);
7506                 cur_curlyx->u.curlyx.lastloc = locinput;
7507                 REGCP_SET(ST.lastcp);
7508                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7509                 NOT_REACHED; /* NOTREACHED */
7510             }
7511             goto do_whilem_B_max;
7512         }
7513         NOT_REACHED; /* NOTREACHED */
7514
7515         case WHILEM_B_min: /* just matched B in a minimal match */
7516         case WHILEM_B_max: /* just matched B in a maximal match */
7517             cur_curlyx = ST.save_curlyx;
7518             sayYES;
7519             NOT_REACHED; /* NOTREACHED */
7520
7521         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7522             cur_curlyx = ST.save_curlyx;
7523             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7524             cur_curlyx->u.curlyx.count--;
7525             CACHEsayNO;
7526             NOT_REACHED; /* NOTREACHED */
7527
7528         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7529             /* FALLTHROUGH */
7530         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7531             REGCP_UNWIND(ST.lastcp);
7532             regcppop(rex, &maxopenparen);
7533             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7534             cur_curlyx->u.curlyx.count--;
7535             CACHEsayNO;
7536             NOT_REACHED; /* NOTREACHED */
7537
7538         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7539             REGCP_UNWIND(ST.lastcp);
7540             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7541             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
7542                 depth)
7543             );
7544           do_whilem_B_max:
7545             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7546                 && ckWARN(WARN_REGEXP)
7547                 && !reginfo->warned)
7548             {
7549                 reginfo->warned = TRUE;
7550                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7551                      "Complex regular subexpression recursion limit (%d) "
7552                      "exceeded",
7553                      REG_INFTY - 1);
7554             }
7555
7556             /* now try B */
7557             ST.save_curlyx = cur_curlyx;
7558             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7559             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7560                                 locinput);
7561             NOT_REACHED; /* NOTREACHED */
7562
7563         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7564             cur_curlyx = ST.save_curlyx;
7565             REGCP_UNWIND(ST.lastcp);
7566             regcppop(rex, &maxopenparen);
7567
7568             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7569                 /* Maximum greed exceeded */
7570                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7571                     && ckWARN(WARN_REGEXP)
7572                     && !reginfo->warned)
7573                 {
7574                     reginfo->warned     = TRUE;
7575                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7576                         "Complex regular subexpression recursion "
7577                         "limit (%d) exceeded",
7578                         REG_INFTY - 1);
7579                 }
7580                 cur_curlyx->u.curlyx.count--;
7581                 CACHEsayNO;
7582             }
7583
7584             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
7585             );
7586             /* Try grabbing another A and see if it helps. */
7587             cur_curlyx->u.curlyx.lastloc = locinput;
7588             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7589                             maxopenparen);
7590             REGCP_SET(ST.lastcp);
7591             PUSH_STATE_GOTO(WHILEM_A_min,
7592                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7593                 locinput);
7594             NOT_REACHED; /* NOTREACHED */
7595
7596 #undef  ST
7597 #define ST st->u.branch
7598
7599         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7600             next = scan + ARG(scan);
7601             if (next == scan)
7602                 next = NULL;
7603             scan = NEXTOPER(scan);
7604             /* FALLTHROUGH */
7605
7606         case BRANCH:        /*  /(...|A|...)/ */
7607             scan = NEXTOPER(scan); /* scan now points to inner node */
7608             ST.lastparen = rex->lastparen;
7609             ST.lastcloseparen = rex->lastcloseparen;
7610             ST.next_branch = next;
7611             REGCP_SET(ST.cp);
7612
7613             /* Now go into the branch */
7614             if (has_cutgroup) {
7615                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7616             } else {
7617                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7618             }
7619             NOT_REACHED; /* NOTREACHED */
7620
7621         case CUTGROUP:  /*  /(*THEN)/  */
7622             sv_yes_mark = st->u.mark.mark_name = scan->flags
7623                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7624                 : NULL;
7625             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7626             NOT_REACHED; /* NOTREACHED */
7627
7628         case CUTGROUP_next_fail:
7629             do_cutgroup = 1;
7630             no_final = 1;
7631             if (st->u.mark.mark_name)
7632                 sv_commit = st->u.mark.mark_name;
7633             sayNO;
7634             NOT_REACHED; /* NOTREACHED */
7635
7636         case BRANCH_next:
7637             sayYES;
7638             NOT_REACHED; /* NOTREACHED */
7639
7640         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7641             if (do_cutgroup) {
7642                 do_cutgroup = 0;
7643                 no_final = 0;
7644             }
7645             REGCP_UNWIND(ST.cp);
7646             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7647             scan = ST.next_branch;
7648             /* no more branches? */
7649             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7650                 DEBUG_EXECUTE_r({
7651                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
7652                         depth,
7653                         PL_colors[4],
7654                         PL_colors[5] );
7655                 });
7656                 sayNO_SILENT;
7657             }
7658             continue; /* execute next BRANCH[J] op */
7659             /* NOTREACHED */
7660
7661         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7662             minmod = 1;
7663             break;
7664
7665 #undef  ST
7666 #define ST st->u.curlym
7667
7668         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7669
7670             /* This is an optimisation of CURLYX that enables us to push
7671              * only a single backtracking state, no matter how many matches
7672              * there are in {m,n}. It relies on the pattern being constant
7673              * length, with no parens to influence future backrefs
7674              */
7675
7676             ST.me = scan;
7677             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7678
7679             ST.lastparen      = rex->lastparen;
7680             ST.lastcloseparen = rex->lastcloseparen;
7681
7682             /* if paren positive, emulate an OPEN/CLOSE around A */
7683             if (ST.me->flags) {
7684                 U32 paren = ST.me->flags;
7685                 if (paren > maxopenparen)
7686                     maxopenparen = paren;
7687                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7688             }
7689             ST.A = scan;
7690             ST.B = next;
7691             ST.alen = 0;
7692             ST.count = 0;
7693             ST.minmod = minmod;
7694             minmod = 0;
7695             ST.c1 = CHRTEST_UNINIT;
7696             REGCP_SET(ST.cp);
7697
7698             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7699                 goto curlym_do_B;
7700
7701           curlym_do_A: /* execute the A in /A{m,n}B/  */
7702             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7703             NOT_REACHED; /* NOTREACHED */
7704
7705         case CURLYM_A: /* we've just matched an A */
7706             ST.count++;
7707             /* after first match, determine A's length: u.curlym.alen */
7708             if (ST.count == 1) {
7709                 if (reginfo->is_utf8_target) {
7710                     char *s = st->locinput;
7711                     while (s < locinput) {
7712                         ST.alen++;
7713                         s += UTF8SKIP(s);
7714                     }
7715                 }
7716                 else {
7717                     ST.alen = locinput - st->locinput;
7718                 }
7719                 if (ST.alen == 0)
7720                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7721             }
7722             DEBUG_EXECUTE_r(
7723                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7724                           depth, (IV) ST.count, (IV)ST.alen)
7725             );
7726
7727             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7728                 goto fake_end;
7729
7730             {
7731                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7732                 if ( max == REG_INFTY || ST.count < max )
7733                     goto curlym_do_A; /* try to match another A */
7734             }
7735             goto curlym_do_B; /* try to match B */
7736
7737         case CURLYM_A_fail: /* just failed to match an A */
7738             REGCP_UNWIND(ST.cp);
7739
7740
7741             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7742                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7743                 sayNO;
7744
7745           curlym_do_B: /* execute the B in /A{m,n}B/  */
7746             if (ST.c1 == CHRTEST_UNINIT) {
7747                 /* calculate c1 and c2 for possible match of 1st char
7748                  * following curly */
7749                 ST.c1 = ST.c2 = CHRTEST_VOID;
7750                 assert(ST.B);
7751                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7752                     regnode *text_node = ST.B;
7753                     if (! HAS_TEXT(text_node))
7754                         FIND_NEXT_IMPT(text_node);
7755                     /* this used to be
7756
7757                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7758
7759                         But the former is redundant in light of the latter.
7760
7761                         if this changes back then the macro for
7762                         IS_TEXT and friends need to change.
7763                      */
7764                     if (PL_regkind[OP(text_node)] == EXACT) {
7765                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7766                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7767                            reginfo))
7768                         {
7769                             sayNO;
7770                         }
7771                     }
7772                 }
7773             }
7774
7775             DEBUG_EXECUTE_r(
7776                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%"IVdf"...\n",
7777                     depth, (IV)ST.count)
7778                 );
7779             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7780                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7781                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7782                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7783                     {
7784                         /* simulate B failing */
7785                         DEBUG_OPTIMISE_r(
7786                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7787                                 depth,
7788                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7789                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7790                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7791                         );
7792                         state_num = CURLYM_B_fail;
7793                         goto reenter_switch;
7794                     }
7795                 }
7796                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7797                     /* simulate B failing */
7798                     DEBUG_OPTIMISE_r(
7799                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7800                             depth,
7801                             (int) nextchr, ST.c1, ST.c2)
7802                     );
7803                     state_num = CURLYM_B_fail;
7804                     goto reenter_switch;
7805                 }
7806             }
7807
7808             if (ST.me->flags) {
7809                 /* emulate CLOSE: mark current A as captured */
7810                 I32 paren = ST.me->flags;
7811                 if (ST.count) {
7812                     rex->offs[paren].start
7813                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7814                     rex->offs[paren].end = locinput - reginfo->strbeg;
7815                     if ((U32)paren > rex->lastparen)
7816                         rex->lastparen = paren;
7817                     rex->lastcloseparen = paren;
7818                 }
7819                 else
7820                     rex->offs[paren].end = -1;
7821
7822                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7823                 {
7824                     if (ST.count)
7825                         goto fake_end;
7826                     else
7827                         sayNO;
7828                 }
7829             }
7830
7831             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7832             NOT_REACHED; /* NOTREACHED */
7833
7834         case CURLYM_B_fail: /* just failed to match a B */
7835             REGCP_UNWIND(ST.cp);
7836             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7837             if (ST.minmod) {
7838                 I32 max = ARG2(ST.me);
7839                 if (max != REG_INFTY && ST.count == max)
7840                     sayNO;
7841                 goto curlym_do_A; /* try to match a further A */
7842             }
7843             /* backtrack one A */
7844             if (ST.count == ARG1(ST.me) /* min */)
7845                 sayNO;
7846             ST.count--;
7847             SET_locinput(HOPc(locinput, -ST.alen));
7848             goto curlym_do_B; /* try to match B */
7849
7850 #undef ST
7851 #define ST st->u.curly
7852
7853 #define CURLY_SETPAREN(paren, success) \
7854     if (paren) { \
7855         if (success) { \
7856             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7857             rex->offs[paren].end = locinput - reginfo->strbeg; \
7858             if (paren > rex->lastparen) \
7859                 rex->lastparen = paren; \
7860             rex->lastcloseparen = paren; \
7861         } \
7862         else { \
7863             rex->offs[paren].end = -1; \
7864             rex->lastparen      = ST.lastparen; \
7865             rex->lastcloseparen = ST.lastcloseparen; \
7866         } \
7867     }
7868
7869         case STAR:              /*  /A*B/ where A is width 1 char */
7870             ST.paren = 0;
7871             ST.min = 0;
7872             ST.max = REG_INFTY;
7873             scan = NEXTOPER(scan);
7874             goto repeat;
7875
7876         case PLUS:              /*  /A+B/ where A is width 1 char */
7877             ST.paren = 0;
7878             ST.min = 1;
7879             ST.max = REG_INFTY;
7880             scan = NEXTOPER(scan);
7881             goto repeat;
7882
7883         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7884             ST.paren = scan->flags;     /* Which paren to set */
7885             ST.lastparen      = rex->lastparen;
7886             ST.lastcloseparen = rex->lastcloseparen;
7887             if (ST.paren > maxopenparen)
7888                 maxopenparen = ST.paren;
7889             ST.min = ARG1(scan);  /* min to match */
7890             ST.max = ARG2(scan);  /* max to match */
7891             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7892             {
7893                 ST.min=1;
7894                 ST.max=1;
7895             }
7896             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7897             goto repeat;
7898
7899         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7900             ST.paren = 0;
7901             ST.min = ARG1(scan);  /* min to match */
7902             ST.max = ARG2(scan);  /* max to match */
7903             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7904           repeat:
7905             /*
7906             * Lookahead to avoid useless match attempts
7907             * when we know what character comes next.
7908             *
7909             * Used to only do .*x and .*?x, but now it allows
7910             * for )'s, ('s and (?{ ... })'s to be in the way
7911             * of the quantifier and the EXACT-like node.  -- japhy
7912             */
7913
7914             assert(ST.min <= ST.max);
7915             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7916                 ST.c1 = ST.c2 = CHRTEST_VOID;
7917             }
7918             else {
7919                 regnode *text_node = next;
7920
7921                 if (! HAS_TEXT(text_node))
7922                     FIND_NEXT_IMPT(text_node);
7923
7924                 if (! HAS_TEXT(text_node))
7925                     ST.c1 = ST.c2 = CHRTEST_VOID;
7926                 else {
7927                     if ( PL_regkind[OP(text_node)] != EXACT ) {
7928                         ST.c1 = ST.c2 = CHRTEST_VOID;
7929                     }
7930                     else {
7931
7932                     /*  Currently we only get here when
7933
7934                         PL_rekind[OP(text_node)] == EXACT
7935
7936                         if this changes back then the macro for IS_TEXT and
7937                         friends need to change. */
7938                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7939                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7940                            reginfo))
7941                         {
7942                             sayNO;
7943                         }
7944                     }
7945                 }
7946             }
7947
7948             ST.A = scan;
7949             ST.B = next;
7950             if (minmod) {
7951                 char *li = locinput;
7952                 minmod = 0;
7953                 if (ST.min &&
7954                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7955                             < ST.min)
7956                     sayNO;
7957                 SET_locinput(li);
7958                 ST.count = ST.min;
7959                 REGCP_SET(ST.cp);
7960                 if (ST.c1 == CHRTEST_VOID)
7961                     goto curly_try_B_min;
7962
7963                 ST.oldloc = locinput;
7964
7965                 /* set ST.maxpos to the furthest point along the
7966                  * string that could possibly match */
7967                 if  (ST.max == REG_INFTY) {
7968                     ST.maxpos = reginfo->strend - 1;
7969                     if (utf8_target)
7970                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7971                             ST.maxpos--;
7972                 }
7973                 else if (utf8_target) {
7974                     int m = ST.max - ST.min;
7975                     for (ST.maxpos = locinput;
7976                          m >0 && ST.maxpos < reginfo->strend; m--)
7977                         ST.maxpos += UTF8SKIP(ST.maxpos);
7978                 }
7979                 else {
7980                     ST.maxpos = locinput + ST.max - ST.min;
7981                     if (ST.maxpos >= reginfo->strend)
7982                         ST.maxpos = reginfo->strend - 1;
7983                 }
7984                 goto curly_try_B_min_known;
7985
7986             }
7987             else {
7988                 /* avoid taking address of locinput, so it can remain
7989                  * a register var */
7990                 char *li = locinput;
7991                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7992                 if (ST.count < ST.min)
7993                     sayNO;
7994                 SET_locinput(li);
7995                 if ((ST.count > ST.min)
7996                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7997                 {
7998                     /* A{m,n} must come at the end of the string, there's
7999                      * no point in backing off ... */
8000                     ST.min = ST.count;
8001                     /* ...except that $ and \Z can match before *and* after
8002                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8003                        We may back off by one in this case. */
8004                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8005                         ST.min--;
8006                 }
8007                 REGCP_SET(ST.cp);
8008                 goto curly_try_B_max;
8009             }
8010             NOT_REACHED; /* NOTREACHED */
8011
8012         case CURLY_B_min_known_fail:
8013             /* failed to find B in a non-greedy match where c1,c2 valid */
8014
8015             REGCP_UNWIND(ST.cp);
8016             if (ST.paren) {
8017                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8018             }
8019             /* Couldn't or didn't -- move forward. */
8020             ST.oldloc = locinput;
8021             if (utf8_target)
8022                 locinput += UTF8SKIP(locinput);
8023             else
8024                 locinput++;
8025             ST.count++;
8026           curly_try_B_min_known:
8027              /* find the next place where 'B' could work, then call B */
8028             {
8029                 int n;
8030                 if (utf8_target) {
8031                     n = (ST.oldloc == locinput) ? 0 : 1;
8032                     if (ST.c1 == ST.c2) {
8033                         /* set n to utf8_distance(oldloc, locinput) */
8034                         while (locinput <= ST.maxpos
8035                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8036                         {
8037                             locinput += UTF8SKIP(locinput);
8038                             n++;
8039                         }
8040                     }
8041                     else {
8042                         /* set n to utf8_distance(oldloc, locinput) */
8043                         while (locinput <= ST.maxpos
8044                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8045                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8046                         {
8047                             locinput += UTF8SKIP(locinput);
8048                             n++;
8049                         }
8050                     }
8051                 }
8052                 else {  /* Not utf8_target */
8053                     if (ST.c1 == ST.c2) {
8054                         while (locinput <= ST.maxpos &&
8055                                UCHARAT(locinput) != ST.c1)
8056                             locinput++;
8057                     }
8058                     else {
8059                         while (locinput <= ST.maxpos
8060                                && UCHARAT(locinput) != ST.c1
8061                                && UCHARAT(locinput) != ST.c2)
8062                             locinput++;
8063                     }
8064                     n = locinput - ST.oldloc;
8065                 }
8066                 if (locinput > ST.maxpos)
8067                     sayNO;
8068                 if (n) {
8069                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8070                      * at b; check that everything between oldloc and
8071                      * locinput matches */
8072                     char *li = ST.oldloc;
8073                     ST.count += n;
8074                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
8075                         sayNO;
8076                     assert(n == REG_INFTY || locinput == li);
8077                 }
8078                 CURLY_SETPAREN(ST.paren, ST.count);
8079                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8080                     goto fake_end;
8081                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8082             }
8083             NOT_REACHED; /* NOTREACHED */
8084
8085         case CURLY_B_min_fail:
8086             /* failed to find B in a non-greedy match where c1,c2 invalid */
8087
8088             REGCP_UNWIND(ST.cp);
8089             if (ST.paren) {
8090                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8091             }
8092             /* failed -- move forward one */
8093             {
8094                 char *li = locinput;
8095                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
8096                     sayNO;
8097                 }
8098                 locinput = li;
8099             }
8100             {
8101                 ST.count++;
8102                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8103                         ST.count > 0)) /* count overflow ? */
8104                 {
8105                   curly_try_B_min:
8106                     CURLY_SETPAREN(ST.paren, ST.count);
8107                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8108                         goto fake_end;
8109                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8110                 }
8111             }
8112             sayNO;
8113             NOT_REACHED; /* NOTREACHED */
8114
8115           curly_try_B_max:
8116             /* a successful greedy match: now try to match B */
8117             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8118                 goto fake_end;
8119             {
8120                 bool could_match = locinput < reginfo->strend;
8121
8122                 /* If it could work, try it. */
8123                 if (ST.c1 != CHRTEST_VOID && could_match) {
8124                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8125                     {
8126                         could_match = memEQ(locinput,
8127                                             ST.c1_utf8,
8128                                             UTF8SKIP(locinput))
8129                                     || memEQ(locinput,
8130                                              ST.c2_utf8,
8131                                              UTF8SKIP(locinput));
8132                     }
8133                     else {
8134                         could_match = UCHARAT(locinput) == ST.c1
8135                                       || UCHARAT(locinput) == ST.c2;
8136                     }
8137                 }
8138                 if (ST.c1 == CHRTEST_VOID || could_match) {
8139                     CURLY_SETPAREN(ST.paren, ST.count);
8140                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8141                     NOT_REACHED; /* NOTREACHED */
8142                 }
8143             }
8144             /* FALLTHROUGH */
8145
8146         case CURLY_B_max_fail:
8147             /* failed to find B in a greedy match */
8148
8149             REGCP_UNWIND(ST.cp);
8150             if (ST.paren) {
8151                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8152             }
8153             /*  back up. */
8154             if (--ST.count < ST.min)
8155                 sayNO;
8156             locinput = HOPc(locinput, -1);
8157             goto curly_try_B_max;
8158
8159 #undef ST
8160
8161         case END: /*  last op of main pattern  */
8162           fake_end:
8163             if (cur_eval) {
8164                 /* we've just finished A in /(??{A})B/; now continue with B */
8165                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8166                 st->u.eval.prev_rex = rex_sv;           /* inner */
8167
8168                 /* Save *all* the positions. */
8169                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8170                 rex_sv = CUR_EVAL.prev_rex;
8171                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8172                 SET_reg_curpm(rex_sv);
8173                 rex = ReANY(rex_sv);
8174                 rexi = RXi_GET(rex);
8175
8176                 st->u.eval.prev_curlyx = cur_curlyx;
8177                 cur_curlyx = CUR_EVAL.prev_curlyx;
8178
8179                 REGCP_SET(st->u.eval.lastcp);
8180
8181                 /* Restore parens of the outer rex without popping the
8182                  * savestack */
8183                 S_regcp_restore(aTHX_ rex, CUR_EVAL.lastcp,
8184                                         &maxopenparen);
8185
8186                 st->u.eval.prev_eval = cur_eval;
8187                 cur_eval = CUR_EVAL.prev_eval;
8188                 DEBUG_EXECUTE_r(
8189                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8190                                       depth, cur_eval););
8191                 if ( nochange_depth )
8192                     nochange_depth--;
8193
8194                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8195
8196                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
8197                                     locinput); /* match B */
8198             }
8199
8200             if (locinput < reginfo->till) {
8201                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8202                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8203                                       PL_colors[4],
8204                                       (long)(locinput - startpos),
8205                                       (long)(reginfo->till - startpos),
8206                                       PL_colors[5]));
8207
8208                 sayNO_SILENT;           /* Cannot match: too short. */
8209             }
8210             sayYES;                     /* Success! */
8211
8212         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8213             DEBUG_EXECUTE_r(
8214             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8215                 depth, PL_colors[4], PL_colors[5]));
8216             sayYES;                     /* Success! */
8217
8218 #undef  ST
8219 #define ST st->u.ifmatch
8220
8221         {
8222             char *newstart;
8223
8224         case SUSPEND:   /* (?>A) */
8225             ST.wanted = 1;
8226             newstart = locinput;
8227             goto do_ifmatch;
8228
8229         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8230             ST.wanted = 0;
8231             goto ifmatch_trivial_fail_test;
8232
8233         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8234             ST.wanted = 1;
8235           ifmatch_trivial_fail_test:
8236             if (scan->flags) {
8237                 char * const s = HOPBACKc(locinput, scan->flags);
8238                 if (!s) {
8239                     /* trivial fail */
8240                     if (logical) {
8241                         logical = 0;
8242                         sw = 1 - cBOOL(ST.wanted);
8243                     }
8244                     else if (ST.wanted)
8245                         sayNO;
8246                     next = scan + ARG(scan);
8247                     if (next == scan)
8248                         next = NULL;
8249                     break;
8250                 }
8251                 newstart = s;
8252             }
8253             else
8254                 newstart = locinput;
8255
8256           do_ifmatch:
8257             ST.me = scan;
8258             ST.logical = logical;
8259             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8260
8261             /* execute body of (?...A) */
8262             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8263             NOT_REACHED; /* NOTREACHED */
8264         }
8265
8266         case IFMATCH_A_fail: /* body of (?...A) failed */
8267             ST.wanted = !ST.wanted;
8268             /* FALLTHROUGH */
8269
8270         case IFMATCH_A: /* body of (?...A) succeeded */
8271             if (ST.logical) {
8272                 sw = cBOOL(ST.wanted);
8273             }
8274             else if (!ST.wanted)
8275                 sayNO;
8276
8277             if (OP(ST.me) != SUSPEND) {
8278                 /* restore old position except for (?>...) */
8279                 locinput = st->locinput;
8280             }
8281             scan = ST.me + ARG(ST.me);
8282             if (scan == ST.me)
8283                 scan = NULL;
8284             continue; /* execute B */
8285
8286 #undef ST
8287
8288         case LONGJMP: /*  alternative with many branches compiles to
8289                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8290             next = scan + ARG(scan);
8291             if (next == scan)
8292                 next = NULL;
8293             break;
8294
8295         case COMMIT:  /*  (*COMMIT)  */
8296             reginfo->cutpoint = reginfo->strend;
8297             /* FALLTHROUGH */
8298
8299         case PRUNE:   /*  (*PRUNE)   */
8300             if (scan->flags)
8301                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8302             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8303             NOT_REACHED; /* NOTREACHED */
8304
8305         case COMMIT_next_fail:
8306             no_final = 1;
8307             /* FALLTHROUGH */
8308             sayNO;
8309             NOT_REACHED; /* NOTREACHED */
8310
8311         case OPFAIL:   /* (*FAIL)  */
8312             if (scan->flags)
8313                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8314             if (logical) {
8315                 /* deal with (?(?!)X|Y) properly,
8316                  * make sure we trigger the no branch
8317                  * of the trailing IFTHEN structure*/
8318                 sw= 0;
8319                 break;
8320             } else {
8321                 sayNO;
8322             }
8323             NOT_REACHED; /* NOTREACHED */
8324
8325 #define ST st->u.mark
8326         case MARKPOINT: /*  (*MARK:foo)  */
8327             ST.prev_mark = mark_state;
8328             ST.mark_name = sv_commit = sv_yes_mark
8329                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8330             mark_state = st;
8331             ST.mark_loc = locinput;
8332             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8333             NOT_REACHED; /* NOTREACHED */
8334
8335         case MARKPOINT_next:
8336             mark_state = ST.prev_mark;
8337             sayYES;
8338             NOT_REACHED; /* NOTREACHED */
8339
8340         case MARKPOINT_next_fail:
8341             if (popmark && sv_eq(ST.mark_name,popmark))
8342             {
8343                 if (ST.mark_loc > startpoint)
8344                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8345                 popmark = NULL; /* we found our mark */
8346                 sv_commit = ST.mark_name;
8347
8348                 DEBUG_EXECUTE_r({
8349                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%"SVf"...%s\n",
8350                             depth,
8351                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8352                 });
8353             }
8354             mark_state = ST.prev_mark;
8355             sv_yes_mark = mark_state ?
8356                 mark_state->u.mark.mark_name : NULL;
8357             sayNO;
8358             NOT_REACHED; /* NOTREACHED */
8359
8360         case SKIP:  /*  (*SKIP)  */
8361             if (!scan->flags) {
8362                 /* (*SKIP) : if we fail we cut here*/
8363                 ST.mark_name = NULL;
8364                 ST.mark_loc = locinput;
8365                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8366             } else {
8367                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8368                    otherwise do nothing.  Meaning we need to scan
8369                  */
8370                 regmatch_state *cur = mark_state;
8371                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8372
8373                 while (cur) {
8374                     if ( sv_eq( cur->u.mark.mark_name,
8375                                 find ) )
8376                     {
8377                         ST.mark_name = find;
8378                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8379                     }
8380                     cur = cur->u.mark.prev_mark;
8381                 }
8382             }
8383             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8384             break;
8385
8386         case SKIP_next_fail:
8387             if (ST.mark_name) {
8388                 /* (*CUT:NAME) - Set up to search for the name as we
8389                    collapse the stack*/
8390                 popmark = ST.mark_name;
8391             } else {
8392                 /* (*CUT) - No name, we cut here.*/
8393                 if (ST.mark_loc > startpoint)
8394                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8395                 /* but we set sv_commit to latest mark_name if there
8396                    is one so they can test to see how things lead to this
8397                    cut */
8398                 if (mark_state)
8399                     sv_commit=mark_state->u.mark.mark_name;
8400             }
8401             no_final = 1;
8402             sayNO;
8403             NOT_REACHED; /* NOTREACHED */
8404 #undef ST
8405
8406         case LNBREAK: /* \R */
8407             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8408                 locinput += n;
8409             } else
8410                 sayNO;
8411             break;
8412
8413         default:
8414             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
8415                           PTR2UV(scan), OP(scan));
8416             Perl_croak(aTHX_ "regexp memory corruption");
8417
8418         /* this is a point to jump to in order to increment
8419          * locinput by one character */
8420           increment_locinput:
8421             assert(!NEXTCHR_IS_EOS);
8422             if (utf8_target) {
8423                 locinput += PL_utf8skip[nextchr];
8424                 /* locinput is allowed to go 1 char off the end, but not 2+ */
8425                 if (locinput > reginfo->strend)
8426                     sayNO;
8427             }
8428             else
8429                 locinput++;
8430             break;
8431
8432         } /* end switch */
8433
8434         /* switch break jumps here */
8435         scan = next; /* prepare to execute the next op and ... */
8436         continue;    /* ... jump back to the top, reusing st */
8437         /* NOTREACHED */
8438
8439       push_yes_state:
8440         /* push a state that backtracks on success */
8441         st->u.yes.prev_yes_state = yes_state;
8442         yes_state = st;
8443         /* FALLTHROUGH */
8444       push_state:
8445         /* push a new regex state, then continue at scan  */
8446         {
8447             regmatch_state *newst;
8448
8449             DEBUG_STACK_r({
8450                 regmatch_state *cur = st;
8451                 regmatch_state *curyes = yes_state;
8452                 int curd = depth;
8453                 regmatch_slab *slab = PL_regmatch_slab;
8454                 for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
8455                     if (cur < SLAB_FIRST(slab)) {
8456                         slab = slab->prev;
8457                         cur = SLAB_LAST(slab);
8458                     }
8459                     Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
8460                         depth,
8461                         curd, PL_reg_name[cur->resume_state],
8462                         (curyes == cur) ? "yes" : ""
8463                     );
8464                     if (curyes == cur)
8465                         curyes = cur->u.yes.prev_yes_state;
8466                 }
8467             } else
8468                 DEBUG_STATE_pp("push")
8469             );
8470             depth++;
8471             st->locinput = locinput;
8472             newst = st+1;
8473             if (newst >  SLAB_LAST(PL_regmatch_slab))
8474                 newst = S_push_slab(aTHX);
8475             PL_regmatch_state = newst;
8476
8477             locinput = pushinput;
8478             st = newst;
8479             continue;
8480             /* NOTREACHED */
8481         }
8482     }
8483 #ifdef SOLARIS_BAD_OPTIMIZER
8484 #  undef PL_charclass
8485 #endif
8486
8487     /*
8488     * We get here only if there's trouble -- normally "case END" is
8489     * the terminating point.
8490     */
8491     Perl_croak(aTHX_ "corrupted regexp pointers");
8492     NOT_REACHED; /* NOTREACHED */
8493
8494   yes:
8495     if (yes_state) {
8496         /* we have successfully completed a subexpression, but we must now
8497          * pop to the state marked by yes_state and continue from there */
8498         assert(st != yes_state);
8499 #ifdef DEBUGGING
8500         while (st != yes_state) {
8501             st--;
8502             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8503                 PL_regmatch_slab = PL_regmatch_slab->prev;
8504                 st = SLAB_LAST(PL_regmatch_slab);
8505             }
8506             DEBUG_STATE_r({
8507                 if (no_final) {
8508                     DEBUG_STATE_pp("pop (no final)");
8509                 } else {
8510                     DEBUG_STATE_pp("pop (yes)");
8511                 }
8512             });
8513             depth--;
8514         }
8515 #else
8516         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8517             || yes_state > SLAB_LAST(PL_regmatch_slab))
8518         {
8519             /* not in this slab, pop slab */
8520             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8521             PL_regmatch_slab = PL_regmatch_slab->prev;
8522             st = SLAB_LAST(PL_regmatch_slab);
8523         }
8524         depth -= (st - yes_state);
8525 #endif
8526         st = yes_state;
8527         yes_state = st->u.yes.prev_yes_state;
8528         PL_regmatch_state = st;
8529
8530         if (no_final)
8531             locinput= st->locinput;
8532         state_num = st->resume_state + no_final;
8533         goto reenter_switch;
8534     }
8535
8536     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8537                           PL_colors[4], PL_colors[5]));
8538
8539     if (reginfo->info_aux_eval) {
8540         /* each successfully executed (?{...}) block does the equivalent of
8541          *   local $^R = do {...}
8542          * When popping the save stack, all these locals would be undone;
8543          * bypass this by setting the outermost saved $^R to the latest
8544          * value */
8545         /* I dont know if this is needed or works properly now.
8546          * see code related to PL_replgv elsewhere in this file.
8547          * Yves
8548          */
8549         if (oreplsv != GvSV(PL_replgv))
8550             sv_setsv(oreplsv, GvSV(PL_replgv));
8551     }
8552     result = 1;
8553     goto final_exit;
8554
8555   no:
8556     DEBUG_EXECUTE_r(
8557         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
8558             depth,
8559             PL_colors[4], PL_colors[5])
8560         );
8561
8562   no_silent:
8563     if (no_final) {
8564         if (yes_state) {
8565             goto yes;
8566         } else {
8567             goto final_exit;
8568         }
8569     }
8570     if (depth) {
8571         /* there's a previous state to backtrack to */
8572         st--;
8573         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8574             PL_regmatch_slab = PL_regmatch_slab->prev;
8575             st = SLAB_LAST(PL_regmatch_slab);
8576         }
8577         PL_regmatch_state = st;
8578         locinput= st->locinput;
8579
8580         DEBUG_STATE_pp("pop");
8581         depth--;
8582         if (yes_state == st)
8583             yes_state = st->u.yes.prev_yes_state;
8584
8585         state_num = st->resume_state + 1; /* failure = success + 1 */
8586         PERL_ASYNC_CHECK();
8587         goto reenter_switch;
8588     }
8589     result = 0;
8590
8591   final_exit:
8592     if (rex->intflags & PREGf_VERBARG_SEEN) {
8593         SV *sv_err = get_sv("REGERROR", 1);
8594         SV *sv_mrk = get_sv("REGMARK", 1);
8595         if (result) {
8596             sv_commit = &PL_sv_no;
8597             if (!sv_yes_mark)
8598                 sv_yes_mark = &PL_sv_yes;
8599         } else {
8600             if (!sv_commit)
8601                 sv_commit = &PL_sv_yes;
8602             sv_yes_mark = &PL_sv_no;
8603         }
8604         assert(sv_err);
8605         assert(sv_mrk);
8606         sv_setsv(sv_err, sv_commit);
8607         sv_setsv(sv_mrk, sv_yes_mark);
8608     }
8609
8610
8611     if (last_pushed_cv) {
8612         dSP;
8613         POP_MULTICALL;
8614         PERL_UNUSED_VAR(SP);
8615     }
8616
8617     assert(!result ||  locinput - reginfo->strbeg >= 0);
8618     return result ?  locinput - reginfo->strbeg : -1;
8619 }
8620
8621 /*
8622  - regrepeat - repeatedly match something simple, report how many
8623  *
8624  * What 'simple' means is a node which can be the operand of a quantifier like
8625  * '+', or {1,3}
8626  *
8627  * startposp - pointer a pointer to the start position.  This is updated
8628  *             to point to the byte following the highest successful
8629  *             match.
8630  * p         - the regnode to be repeatedly matched against.
8631  * reginfo   - struct holding match state, such as strend
8632  * max       - maximum number of things to match.
8633  * depth     - (for debugging) backtracking depth.
8634  */
8635 STATIC I32
8636 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8637             regmatch_info *const reginfo, I32 max, int depth)
8638 {
8639     char *scan;     /* Pointer to current position in target string */
8640     I32 c;
8641     char *loceol = reginfo->strend;   /* local version */
8642     I32 hardcount = 0;  /* How many matches so far */
8643     bool utf8_target = reginfo->is_utf8_target;
8644     unsigned int to_complement = 0;  /* Invert the result? */
8645     UV utf8_flags;
8646     _char_class_number classnum;
8647 #ifndef DEBUGGING
8648     PERL_UNUSED_ARG(depth);
8649 #endif
8650
8651     PERL_ARGS_ASSERT_REGREPEAT;
8652
8653     scan = *startposp;
8654     if (max == REG_INFTY)
8655         max = I32_MAX;
8656     else if (! utf8_target && loceol - scan > max)
8657         loceol = scan + max;
8658
8659     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8660      * to the maximum of how far we should go in it (leaving it set to the real
8661      * end, if the maximum permissible would take us beyond that).  This allows
8662      * us to make the loop exit condition that we haven't gone past <loceol> to
8663      * also mean that we haven't exceeded the max permissible count, saving a
8664      * test each time through the loop.  But it assumes that the OP matches a
8665      * single byte, which is true for most of the OPs below when applied to a
8666      * non-UTF-8 target.  Those relatively few OPs that don't have this
8667      * characteristic will have to compensate.
8668      *
8669      * There is no adjustment for UTF-8 targets, as the number of bytes per
8670      * character varies.  OPs will have to test both that the count is less
8671      * than the max permissible (using <hardcount> to keep track), and that we
8672      * are still within the bounds of the string (using <loceol>.  A few OPs
8673      * match a single byte no matter what the encoding.  They can omit the max
8674      * test if, for the UTF-8 case, they do the adjustment that was skipped
8675      * above.
8676      *
8677      * Thus, the code above sets things up for the common case; and exceptional
8678      * cases need extra work; the common case is to make sure <scan> doesn't
8679      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8680      * count doesn't exceed the maximum permissible */
8681
8682     switch (OP(p)) {
8683     case REG_ANY:
8684         if (utf8_target) {
8685             while (scan < loceol && hardcount < max && *scan != '\n') {
8686                 scan += UTF8SKIP(scan);
8687                 hardcount++;
8688             }
8689         } else {
8690             while (scan < loceol && *scan != '\n')
8691                 scan++;
8692         }
8693         break;
8694     case SANY:
8695         if (utf8_target) {
8696             while (scan < loceol && hardcount < max) {
8697                 scan += UTF8SKIP(scan);
8698                 hardcount++;
8699             }
8700         }
8701         else
8702             scan = loceol;
8703         break;
8704     case EXACTL:
8705         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8706         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8707             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8708         }
8709         /* FALLTHROUGH */
8710     case EXACT:
8711         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8712
8713         c = (U8)*STRING(p);
8714
8715         /* Can use a simple loop if the pattern char to match on is invariant
8716          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8717          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8718          * true iff it doesn't matter if the argument is in UTF-8 or not */
8719         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8720             if (utf8_target && loceol - scan > max) {
8721                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8722                  * since here, to match at all, 1 char == 1 byte */
8723                 loceol = scan + max;
8724             }
8725             while (scan < loceol && UCHARAT(scan) == c) {
8726                 scan++;
8727             }
8728         }
8729         else if (reginfo->is_utf8_pat) {
8730             if (utf8_target) {
8731                 STRLEN scan_char_len;
8732
8733                 /* When both target and pattern are UTF-8, we have to do
8734                  * string EQ */
8735                 while (hardcount < max
8736                        && scan < loceol
8737                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8738                        && memEQ(scan, STRING(p), scan_char_len))
8739                 {
8740                     scan += scan_char_len;
8741                     hardcount++;
8742                 }
8743             }
8744             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8745
8746                 /* Target isn't utf8; convert the character in the UTF-8
8747                  * pattern to non-UTF8, and do a simple loop */
8748                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8749                 while (scan < loceol && UCHARAT(scan) == c) {
8750                     scan++;
8751                 }
8752             } /* else pattern char is above Latin1, can't possibly match the
8753                  non-UTF-8 target */
8754         }
8755         else {
8756
8757             /* Here, the string must be utf8; pattern isn't, and <c> is
8758              * different in utf8 than not, so can't compare them directly.
8759              * Outside the loop, find the two utf8 bytes that represent c, and
8760              * then look for those in sequence in the utf8 string */
8761             U8 high = UTF8_TWO_BYTE_HI(c);
8762             U8 low = UTF8_TWO_BYTE_LO(c);
8763
8764             while (hardcount < max
8765                     && scan + 1 < loceol
8766                     && UCHARAT(scan) == high
8767                     && UCHARAT(scan + 1) == low)
8768             {
8769                 scan += 2;
8770                 hardcount++;
8771             }
8772         }
8773         break;
8774
8775     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8776         assert(! reginfo->is_utf8_pat);
8777         /* FALLTHROUGH */
8778     case EXACTFA:
8779         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8780         goto do_exactf;
8781
8782     case EXACTFL:
8783         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8784         utf8_flags = FOLDEQ_LOCALE;
8785         goto do_exactf;
8786
8787     case EXACTF:   /* This node only generated for non-utf8 patterns */
8788         assert(! reginfo->is_utf8_pat);
8789         utf8_flags = 0;
8790         goto do_exactf;
8791
8792     case EXACTFLU8:
8793         if (! utf8_target) {
8794             break;
8795         }
8796         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8797                                     | FOLDEQ_S2_FOLDS_SANE;
8798         goto do_exactf;
8799
8800     case EXACTFU_SS:
8801     case EXACTFU:
8802         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8803
8804       do_exactf: {
8805         int c1, c2;
8806         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8807
8808         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8809
8810         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8811                                         reginfo))
8812         {
8813             if (c1 == CHRTEST_VOID) {
8814                 /* Use full Unicode fold matching */
8815                 char *tmpeol = reginfo->strend;
8816                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8817                 while (hardcount < max
8818                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8819                                              STRING(p), NULL, pat_len,
8820                                              reginfo->is_utf8_pat, utf8_flags))
8821                 {
8822                     scan = tmpeol;
8823                     tmpeol = reginfo->strend;
8824                     hardcount++;
8825                 }
8826             }
8827             else if (utf8_target) {
8828                 if (c1 == c2) {
8829                     while (scan < loceol
8830                            && hardcount < max
8831                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8832                     {
8833                         scan += UTF8SKIP(scan);
8834                         hardcount++;
8835                     }
8836                 }
8837                 else {
8838                     while (scan < loceol
8839                            && hardcount < max
8840                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8841                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8842                     {
8843                         scan += UTF8SKIP(scan);
8844                         hardcount++;
8845                     }
8846                 }
8847             }
8848             else if (c1 == c2) {
8849                 while (scan < loceol && UCHARAT(scan) == c1) {
8850                     scan++;
8851                 }
8852             }
8853             else {
8854                 while (scan < loceol &&
8855                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8856                 {
8857                     scan++;
8858                 }
8859             }
8860         }
8861         break;
8862     }
8863     case ANYOFL:
8864         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8865
8866         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8867             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8868         }
8869         /* FALLTHROUGH */
8870     case ANYOFD:
8871     case ANYOF:
8872         if (utf8_target) {
8873             while (hardcount < max
8874                    && scan < loceol
8875                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8876             {
8877                 scan += UTF8SKIP(scan);
8878                 hardcount++;
8879             }
8880         } else {
8881             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan, 0))
8882                 scan++;
8883         }
8884         break;
8885
8886     /* The argument (FLAGS) to all the POSIX node types is the class number */
8887
8888     case NPOSIXL:
8889         to_complement = 1;
8890         /* FALLTHROUGH */
8891
8892     case POSIXL:
8893         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8894         if (! utf8_target) {
8895             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8896                                                                    *scan)))
8897             {
8898                 scan++;
8899             }
8900         } else {
8901             while (hardcount < max && scan < loceol
8902                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8903                                                                   (U8 *) scan)))
8904             {
8905                 scan += UTF8SKIP(scan);
8906                 hardcount++;
8907             }
8908         }
8909         break;
8910
8911     case POSIXD:
8912         if (utf8_target) {
8913             goto utf8_posix;
8914         }
8915         /* FALLTHROUGH */
8916
8917     case POSIXA:
8918         if (utf8_target && loceol - scan > max) {
8919
8920             /* We didn't adjust <loceol> at the beginning of this routine
8921              * because is UTF-8, but it is actually ok to do so, since here, to
8922              * match, 1 char == 1 byte. */
8923             loceol = scan + max;
8924         }
8925         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8926             scan++;
8927         }
8928         break;
8929
8930     case NPOSIXD:
8931         if (utf8_target) {
8932             to_complement = 1;
8933             goto utf8_posix;
8934         }
8935         /* FALLTHROUGH */
8936
8937     case NPOSIXA:
8938         if (! utf8_target) {
8939             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8940                 scan++;
8941             }
8942         }
8943         else {
8944
8945             /* The complement of something that matches only ASCII matches all
8946              * non-ASCII, plus everything in ASCII that isn't in the class. */
8947             while (hardcount < max && scan < loceol
8948                    && (! isASCII_utf8(scan)
8949                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8950             {
8951                 scan += UTF8SKIP(scan);
8952                 hardcount++;
8953             }
8954         }
8955         break;
8956
8957     case NPOSIXU:
8958         to_complement = 1;
8959         /* FALLTHROUGH */
8960
8961     case POSIXU:
8962         if (! utf8_target) {
8963             while (scan < loceol && to_complement
8964                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8965             {
8966                 scan++;
8967             }
8968         }
8969         else {
8970           utf8_posix:
8971             classnum = (_char_class_number) FLAGS(p);
8972             if (classnum < _FIRST_NON_SWASH_CC) {
8973
8974                 /* Here, a swash is needed for above-Latin1 code points.
8975                  * Process as many Latin1 code points using the built-in rules.
8976                  * Go to another loop to finish processing upon encountering
8977                  * the first Latin1 code point.  We could do that in this loop
8978                  * as well, but the other way saves having to test if the swash
8979                  * has been loaded every time through the loop: extra space to
8980                  * save a test. */
8981                 while (hardcount < max && scan < loceol) {
8982                     if (UTF8_IS_INVARIANT(*scan)) {
8983                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8984                                                                    classnum))))
8985                         {
8986                             break;
8987                         }
8988                         scan++;
8989                     }
8990                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8991                         if (! (to_complement
8992                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
8993                                                                      *(scan + 1)),
8994                                                     classnum))))
8995                         {
8996                             break;
8997                         }
8998                         scan += 2;
8999                     }
9000                     else {
9001                         goto found_above_latin1;
9002                     }
9003
9004                     hardcount++;
9005                 }
9006             }
9007             else {
9008                 /* For these character classes, the knowledge of how to handle
9009                  * every code point is compiled in to Perl via a macro.  This
9010                  * code is written for making the loops as tight as possible.
9011                  * It could be refactored to save space instead */
9012                 switch (classnum) {
9013                     case _CC_ENUM_SPACE:
9014                         while (hardcount < max
9015                                && scan < loceol
9016                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
9017                         {
9018                             scan += UTF8SKIP(scan);
9019                             hardcount++;
9020                         }
9021                         break;
9022                     case _CC_ENUM_BLANK:
9023                         while (hardcount < max
9024                                && scan < loceol
9025                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
9026                         {
9027                             scan += UTF8SKIP(scan);
9028                             hardcount++;
9029                         }
9030                         break;
9031                     case _CC_ENUM_XDIGIT:
9032                         while (hardcount < max
9033                                && scan < loceol
9034                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
9035                         {
9036                             scan += UTF8SKIP(scan);
9037                             hardcount++;
9038                         }
9039                         break;
9040                     case _CC_ENUM_VERTSPACE:
9041                         while (hardcount < max
9042                                && scan < loceol
9043                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
9044                         {
9045                             scan += UTF8SKIP(scan);
9046                             hardcount++;
9047                         }
9048                         break;
9049                     case _CC_ENUM_CNTRL:
9050                         while (hardcount < max
9051                                && scan < loceol
9052                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
9053                         {
9054                             scan += UTF8SKIP(scan);
9055                             hardcount++;
9056                         }
9057                         break;
9058                     default:
9059                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9060                 }
9061             }
9062         }
9063         break;
9064
9065       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9066
9067         /* Load the swash if not already present */
9068         if (! PL_utf8_swash_ptrs[classnum]) {
9069             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9070             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9071                                         "utf8",
9072                                         "",
9073                                         &PL_sv_undef, 1, 0,
9074                                         PL_XPosix_ptrs[classnum], &flags);
9075         }
9076
9077         while (hardcount < max && scan < loceol
9078                && to_complement ^ cBOOL(_generic_utf8(
9079                                        classnum,
9080                                        scan,
9081                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9082                                                    (U8 *) scan,
9083                                                    TRUE))))
9084         {
9085             scan += UTF8SKIP(scan);
9086             hardcount++;
9087         }
9088         break;
9089
9090     case LNBREAK:
9091         if (utf8_target) {
9092             while (hardcount < max && scan < loceol &&
9093                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9094                 scan += c;
9095                 hardcount++;
9096             }
9097         } else {
9098             /* LNBREAK can match one or two latin chars, which is ok, but we
9099              * have to use hardcount in this situation, and throw away the
9100              * adjustment to <loceol> done before the switch statement */
9101             loceol = reginfo->strend;
9102             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9103                 scan+=c;
9104                 hardcount++;
9105             }
9106         }
9107         break;
9108
9109     case BOUNDL:
9110     case NBOUNDL:
9111         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9112         /* FALLTHROUGH */
9113     case BOUND:
9114     case BOUNDA:
9115     case BOUNDU:
9116     case EOS:
9117     case GPOS:
9118     case KEEPS:
9119     case NBOUND:
9120     case NBOUNDA:
9121     case NBOUNDU:
9122     case OPFAIL:
9123     case SBOL:
9124     case SEOL:
9125         /* These are all 0 width, so match right here or not at all. */
9126         break;
9127
9128     default:
9129         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9130         NOT_REACHED; /* NOTREACHED */
9131
9132     }
9133
9134     if (hardcount)
9135         c = hardcount;
9136     else
9137         c = scan - *startposp;
9138     *startposp = scan;
9139
9140     DEBUG_r({
9141         GET_RE_DEBUG_FLAGS_DECL;
9142         DEBUG_EXECUTE_r({
9143             SV * const prop = sv_newmortal();
9144             regprop(prog, prop, p, reginfo, NULL);
9145             Perl_re_exec_indentf( aTHX_  "%s can match %"IVdf" times out of %"IVdf"...\n",
9146                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9147         });
9148     });
9149
9150     return(c);
9151 }
9152
9153
9154 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9155 /*
9156 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9157 create a copy so that changes the caller makes won't change the shared one.
9158 If <altsvp> is non-null, will return NULL in it, for back-compat.
9159  */
9160 SV *
9161 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9162 {
9163     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9164
9165     if (altsvp) {
9166         *altsvp = NULL;
9167     }
9168
9169     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9170 }
9171
9172 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9173
9174 /*
9175  - reginclass - determine if a character falls into a character class
9176
9177   n is the ANYOF-type regnode
9178   p is the target string
9179   p_end points to one byte beyond the end of the target string
9180   utf8_target tells whether p is in UTF-8.
9181
9182   Returns true if matched; false otherwise.
9183
9184   Note that this can be a synthetic start class, a combination of various
9185   nodes, so things you think might be mutually exclusive, such as locale,
9186   aren't.  It can match both locale and non-locale
9187
9188  */
9189
9190 STATIC bool
9191 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9192 {
9193     dVAR;
9194     const char flags = ANYOF_FLAGS(n);
9195     bool match = FALSE;
9196     UV c = *p;
9197
9198     PERL_ARGS_ASSERT_REGINCLASS;
9199
9200     /* If c is not already the code point, get it.  Note that
9201      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9202     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9203         STRLEN c_len = 0;
9204         c = utf8n_to_uvchr(p, p_end - p, &c_len,
9205                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
9206                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
9207                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
9208                  * UTF8_ALLOW_FFFF */
9209         if (c_len == (STRLEN)-1)
9210             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
9211         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9212             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9213         }
9214     }
9215
9216     /* If this character is potentially in the bitmap, check it */
9217     if (c < NUM_ANYOF_CODE_POINTS) {
9218         if (ANYOF_BITMAP_TEST(n, c))
9219             match = TRUE;
9220         else if ((flags
9221                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9222                   && OP(n) == ANYOFD
9223                   && ! utf8_target
9224                   && ! isASCII(c))
9225         {
9226             match = TRUE;
9227         }
9228         else if (flags & ANYOF_LOCALE_FLAGS) {
9229             if ((flags & ANYOFL_FOLD)
9230                 && c < 256
9231                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9232             {
9233                 match = TRUE;
9234             }
9235             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9236                      && c < 256
9237             ) {
9238
9239                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9240                  * if the class includes the Posix character class given by
9241                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9242                  * complemented Posix class given by int(bit/2).  So we loop
9243                  * through the bits, each time changing whether we complement
9244                  * the result or not.  Suppose for the sake of illustration
9245                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9246                  * is set, it means there is a match for this ANYOF node if the
9247                  * character is in the class given by the expression (0 / 2 = 0
9248                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9249                  * and since 'to_complement' is 0, the result will stay TRUE,
9250                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9251                  * bit 1 is 1.  That means there is a match if the character
9252                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9253                  * but will on bit 1.  On the second iteration 'to_complement'
9254                  * will be 1, so the exclusive or will reverse things, so we
9255                  * are testing for \W.  On the third iteration, 'to_complement'
9256                  * will be 0, and we would be testing for \s; the fourth
9257                  * iteration would test for \S, etc.
9258                  *
9259                  * Note that this code assumes that all the classes are closed
9260                  * under folding.  For example, if a character matches \w, then
9261                  * its fold does too; and vice versa.  This should be true for
9262                  * any well-behaved locale for all the currently defined Posix
9263                  * classes, except for :lower: and :upper:, which are handled
9264                  * by the pseudo-class :cased: which matches if either of the
9265                  * other two does.  To get rid of this assumption, an outer
9266                  * loop could be used below to iterate over both the source
9267                  * character, and its fold (if different) */
9268
9269                 int count = 0;
9270                 int to_complement = 0;
9271
9272                 while (count < ANYOF_MAX) {
9273                     if (ANYOF_POSIXL_TEST(n, count)
9274                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9275                     {
9276                         match = TRUE;
9277                         break;
9278                     }
9279                     count++;
9280                     to_complement ^= 1;
9281                 }
9282             }
9283         }
9284     }
9285
9286
9287     /* If the bitmap didn't (or couldn't) match, and something outside the
9288      * bitmap could match, try that. */
9289     if (!match) {
9290         if (c >= NUM_ANYOF_CODE_POINTS
9291             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9292         {
9293             match = TRUE;       /* Everything above the bitmap matches */
9294         }
9295             /* Here doesn't match everything above the bitmap.  If there is
9296              * some information available beyond the bitmap, we may find a
9297              * match in it.  If so, this is most likely because the code point
9298              * is outside the bitmap range.  But rarely, it could be because of
9299              * some other reason.  If so, various flags are set to indicate
9300              * this possibility.  On ANYOFD nodes, there may be matches that
9301              * happen only when the target string is UTF-8; or for other node
9302              * types, because runtime lookup is needed, regardless of the
9303              * UTF-8ness of the target string.  Finally, under /il, there may
9304              * be some matches only possible if the locale is a UTF-8 one. */
9305         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9306                  && (   c >= NUM_ANYOF_CODE_POINTS
9307                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9308                          && (   UNLIKELY(OP(n) != ANYOFD)
9309                              || (utf8_target && ! isASCII_uni(c)
9310 #                               if NUM_ANYOF_CODE_POINTS > 256
9311                                                                  && c < 256
9312 #                               endif
9313                                 )))
9314                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9315                          && IN_UTF8_CTYPE_LOCALE)))
9316         {
9317             SV* only_utf8_locale = NULL;
9318             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9319                                                        &only_utf8_locale, NULL);
9320             if (sw) {
9321                 U8 utf8_buffer[2];
9322                 U8 * utf8_p;
9323                 if (utf8_target) {
9324                     utf8_p = (U8 *) p;
9325                 } else { /* Convert to utf8 */
9326                     utf8_p = utf8_buffer;
9327                     append_utf8_from_native_byte(*p, &utf8_p);
9328                     utf8_p = utf8_buffer;
9329                 }
9330
9331                 if (swash_fetch(sw, utf8_p, TRUE)) {
9332                     match = TRUE;
9333                 }
9334             }
9335             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9336                 match = _invlist_contains_cp(only_utf8_locale, c);
9337             }
9338         }
9339
9340         if (UNICODE_IS_SUPER(c)
9341             && (flags
9342                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9343             && OP(n) != ANYOFD
9344             && ckWARN_d(WARN_NON_UNICODE))
9345         {
9346             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9347                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
9348         }
9349     }
9350
9351 #if ANYOF_INVERT != 1
9352     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9353      * use it, don't */
9354 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9355 #endif
9356
9357     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9358     return (flags & ANYOF_INVERT) ^ match;
9359 }
9360
9361 STATIC U8 *
9362 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9363 {
9364     /* return the position 'off' UTF-8 characters away from 's', forward if
9365      * 'off' >= 0, backwards if negative.  But don't go outside of position
9366      * 'lim', which better be < s  if off < 0 */
9367
9368     PERL_ARGS_ASSERT_REGHOP3;
9369
9370     if (off >= 0) {
9371         while (off-- && s < lim) {
9372             /* XXX could check well-formedness here */
9373             s += UTF8SKIP(s);
9374         }
9375     }
9376     else {
9377         while (off++ && s > lim) {
9378             s--;
9379             if (UTF8_IS_CONTINUED(*s)) {
9380                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9381                     s--;
9382                 if (! UTF8_IS_START(*s)) {
9383                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9384                 }
9385             }
9386             /* XXX could check well-formedness here */
9387         }
9388     }
9389     return s;
9390 }
9391
9392 STATIC U8 *
9393 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9394 {
9395     PERL_ARGS_ASSERT_REGHOP4;
9396
9397     if (off >= 0) {
9398         while (off-- && s < rlim) {
9399             /* XXX could check well-formedness here */
9400             s += UTF8SKIP(s);
9401         }
9402     }
9403     else {
9404         while (off++ && s > llim) {
9405             s--;
9406             if (UTF8_IS_CONTINUED(*s)) {
9407                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9408                     s--;
9409                 if (! UTF8_IS_START(*s)) {
9410                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9411                 }
9412             }
9413             /* XXX could check well-formedness here */
9414         }
9415     }
9416     return s;
9417 }
9418
9419 /* like reghop3, but returns NULL on overrun, rather than returning last
9420  * char pos */
9421
9422 STATIC U8 *
9423 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9424 {
9425     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9426
9427     if (off >= 0) {
9428         while (off-- && s < lim) {
9429             /* XXX could check well-formedness here */
9430             s += UTF8SKIP(s);
9431         }
9432         if (off >= 0)
9433             return NULL;
9434     }
9435     else {
9436         while (off++ && s > lim) {
9437             s--;
9438             if (UTF8_IS_CONTINUED(*s)) {
9439                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9440                     s--;
9441                 if (! UTF8_IS_START(*s)) {
9442                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9443                 }
9444             }
9445             /* XXX could check well-formedness here */
9446         }
9447         if (off <= 0)
9448             return NULL;
9449     }
9450     return s;
9451 }
9452
9453
9454 /* when executing a regex that may have (?{}), extra stuff needs setting
9455    up that will be visible to the called code, even before the current
9456    match has finished. In particular:
9457
9458    * $_ is localised to the SV currently being matched;
9459    * pos($_) is created if necessary, ready to be updated on each call-out
9460      to code;
9461    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9462      isn't set until the current pattern is successfully finished), so that
9463      $1 etc of the match-so-far can be seen;
9464    * save the old values of subbeg etc of the current regex, and  set then
9465      to the current string (again, this is normally only done at the end
9466      of execution)
9467 */
9468
9469 static void
9470 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9471 {
9472     MAGIC *mg;
9473     regexp *const rex = ReANY(reginfo->prog);
9474     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9475
9476     eval_state->rex = rex;
9477
9478     if (reginfo->sv) {
9479         /* Make $_ available to executed code. */
9480         if (reginfo->sv != DEFSV) {
9481             SAVE_DEFSV;
9482             DEFSV_set(reginfo->sv);
9483         }
9484
9485         if (!(mg = mg_find_mglob(reginfo->sv))) {
9486             /* prepare for quick setting of pos */
9487             mg = sv_magicext_mglob(reginfo->sv);
9488             mg->mg_len = -1;
9489         }
9490         eval_state->pos_magic = mg;
9491         eval_state->pos       = mg->mg_len;
9492         eval_state->pos_flags = mg->mg_flags;
9493     }
9494     else
9495         eval_state->pos_magic = NULL;
9496
9497     if (!PL_reg_curpm) {
9498         /* PL_reg_curpm is a fake PMOP that we can attach the current
9499          * regex to and point PL_curpm at, so that $1 et al are visible
9500          * within a /(?{})/. It's just allocated once per interpreter the
9501          * first time its needed */
9502         Newxz(PL_reg_curpm, 1, PMOP);
9503 #ifdef USE_ITHREADS
9504         {
9505             SV* const repointer = &PL_sv_undef;
9506             /* this regexp is also owned by the new PL_reg_curpm, which
9507                will try to free it.  */
9508             av_push(PL_regex_padav, repointer);
9509             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9510             PL_regex_pad = AvARRAY(PL_regex_padav);
9511         }
9512 #endif
9513     }
9514     SET_reg_curpm(reginfo->prog);
9515     eval_state->curpm = PL_curpm;
9516     PL_curpm = PL_reg_curpm;
9517     if (RXp_MATCH_COPIED(rex)) {
9518         /*  Here is a serious problem: we cannot rewrite subbeg,
9519             since it may be needed if this match fails.  Thus
9520             $` inside (?{}) could fail... */
9521         eval_state->subbeg     = rex->subbeg;
9522         eval_state->sublen     = rex->sublen;
9523         eval_state->suboffset  = rex->suboffset;
9524         eval_state->subcoffset = rex->subcoffset;
9525 #ifdef PERL_ANY_COW
9526         eval_state->saved_copy = rex->saved_copy;
9527 #endif
9528         RXp_MATCH_COPIED_off(rex);
9529     }
9530     else
9531         eval_state->subbeg = NULL;
9532     rex->subbeg = (char *)reginfo->strbeg;
9533     rex->suboffset = 0;
9534     rex->subcoffset = 0;
9535     rex->sublen = reginfo->strend - reginfo->strbeg;
9536 }
9537
9538
9539 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9540
9541 static void
9542 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9543 {
9544     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9545     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
9546     regmatch_slab *s;
9547
9548     Safefree(aux->poscache);
9549
9550     if (eval_state) {
9551
9552         /* undo the effects of S_setup_eval_state() */
9553
9554         if (eval_state->subbeg) {
9555             regexp * const rex = eval_state->rex;
9556             rex->subbeg     = eval_state->subbeg;
9557             rex->sublen     = eval_state->sublen;
9558             rex->suboffset  = eval_state->suboffset;
9559             rex->subcoffset = eval_state->subcoffset;
9560 #ifdef PERL_ANY_COW
9561             rex->saved_copy = eval_state->saved_copy;
9562 #endif
9563             RXp_MATCH_COPIED_on(rex);
9564         }
9565         if (eval_state->pos_magic)
9566         {
9567             eval_state->pos_magic->mg_len = eval_state->pos;
9568             eval_state->pos_magic->mg_flags =
9569                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9570                | (eval_state->pos_flags & MGf_BYTES);
9571         }
9572
9573         PL_curpm = eval_state->curpm;
9574     }
9575
9576     PL_regmatch_state = aux->old_regmatch_state;
9577     PL_regmatch_slab  = aux->old_regmatch_slab;
9578
9579     /* free all slabs above current one - this must be the last action
9580      * of this function, as aux and eval_state are allocated within
9581      * slabs and may be freed here */
9582
9583     s = PL_regmatch_slab->next;
9584     if (s) {
9585         PL_regmatch_slab->next = NULL;
9586         while (s) {
9587             regmatch_slab * const osl = s;
9588             s = s->next;
9589             Safefree(osl);
9590         }
9591     }
9592 }
9593
9594
9595 STATIC void
9596 S_to_utf8_substr(pTHX_ regexp *prog)
9597 {
9598     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9599      * on the converted value */
9600
9601     int i = 1;
9602
9603     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9604
9605     do {
9606         if (prog->substrs->data[i].substr
9607             && !prog->substrs->data[i].utf8_substr) {
9608             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9609             prog->substrs->data[i].utf8_substr = sv;
9610             sv_utf8_upgrade(sv);
9611             if (SvVALID(prog->substrs->data[i].substr)) {
9612                 if (SvTAIL(prog->substrs->data[i].substr)) {
9613                     /* Trim the trailing \n that fbm_compile added last
9614                        time.  */
9615                     SvCUR_set(sv, SvCUR(sv) - 1);
9616                     /* Whilst this makes the SV technically "invalid" (as its
9617                        buffer is no longer followed by "\0") when fbm_compile()
9618                        adds the "\n" back, a "\0" is restored.  */
9619                     fbm_compile(sv, FBMcf_TAIL);
9620                 } else
9621                     fbm_compile(sv, 0);
9622             }
9623             if (prog->substrs->data[i].substr == prog->check_substr)
9624                 prog->check_utf8 = sv;
9625         }
9626     } while (i--);
9627 }
9628
9629 STATIC bool
9630 S_to_byte_substr(pTHX_ regexp *prog)
9631 {
9632     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9633      * on the converted value; returns FALSE if can't be converted. */
9634
9635     int i = 1;
9636
9637     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9638
9639     do {
9640         if (prog->substrs->data[i].utf8_substr
9641             && !prog->substrs->data[i].substr) {
9642             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9643             if (! sv_utf8_downgrade(sv, TRUE)) {
9644                 return FALSE;
9645             }
9646             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9647                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9648                     /* Trim the trailing \n that fbm_compile added last
9649                         time.  */
9650                     SvCUR_set(sv, SvCUR(sv) - 1);
9651                     fbm_compile(sv, FBMcf_TAIL);
9652                 } else
9653                     fbm_compile(sv, 0);
9654             }
9655             prog->substrs->data[i].substr = sv;
9656             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9657                 prog->check_substr = sv;
9658         }
9659     } while (i--);
9660
9661     return TRUE;
9662 }
9663
9664 /*
9665  * ex: set ts=8 sts=4 sw=4 et:
9666  */