regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
 100     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
 101     goto target; \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid only for non-utf8 strings: avoids the reginclass
 111  * call if there are no complications: i.e., if everything matchable is
 112  * straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 114                                               : ANYOF_BITMAP_TEST(p,*(c)))
 115
 116 /*
 117  * Forwards.
 118  */
 119
 120 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 121 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 122
 123 #define HOPc(pos,off) \
 124         (char *)(reginfo->is_utf8_target \
 125             ? reghop3((U8*)pos, off, \
 126                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 127             : (U8*)(pos + off))
 128
 129 #define HOPBACKc(pos, off) \
 130         (char*)(reginfo->is_utf8_target \
 131             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 132             : (pos - off >= reginfo->strbeg)    \
 133                 ? (U8*)pos - off                \
 134                 : NULL)
 135
 136 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 137 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 138
 139 /* lim must be +ve. Returns NULL on overshoot */
 140 #define HOPMAYBE3(pos,off,lim) \
 141         (reginfo->is_utf8_target                        \
 142             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 143             : ((U8*)pos + off <= lim)                   \
 144                 ? (U8*)pos + off                        \
 145                 : NULL)
 146
 147 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 148  * off must be >=0; args should be vars rather than expressions */
 149 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 150     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 151     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 152
 153 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 154     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 155     : (U8*)(pos + off))
 156 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 157
 158 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 159 #define NEXTCHR_IS_EOS (nextchr < 0)
 160
 161 #define SET_nextchr \
 162     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 163
 164 #define SET_locinput(p) \
 165     locinput = (p);  \
 166     SET_nextchr
 167
 168
 169 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 170         if (!swash_ptr) {                                                     \
 171             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 172             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 173                                          1, 0, invlist, &flags);              \
 174             assert(swash_ptr);                                                \
 175         }                                                                     \
 176     } STMT_END
 177
 178 /* If in debug mode, we test that a known character properly matches */
 179 #ifdef DEBUGGING
 180 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 181                                           property_name,                      \
 182                                           invlist,                            \
 183                                           utf8_char_in_property)              \
 184         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 185         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 186 #else
 187 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 188                                           property_name,                      \
 189                                           invlist,                            \
 190                                           utf8_char_in_property)              \
 191         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 192 #endif
 193
 194 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 195                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 196                                         "",                                   \
 197                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 198                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 199
 200 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 201 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 202
 203 /* for use after a quantifier and before an EXACT-like node -- japhy */
 204 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 205  *
 206  * NOTE that *nothing* that affects backtracking should be in here, specifically
 207  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 208  * node that is in between two EXACT like nodes when ascertaining what the required
 209  * "follow" character is. This should probably be moved to regex compile time
 210  * although it may be done at run time beause of the REF possibility - more
 211  * investigation required. -- demerphq
 212 */
 213 #define JUMPABLE(rn) (                                                             \
 214     OP(rn) == OPEN ||                                                              \
 215     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 216     OP(rn) == EVAL ||                                                              \
 217     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 218     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 219     OP(rn) == KEEPS ||                                                             \
 220     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 221 )
 222 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 223
 224 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 225
 226 #if 0
 227 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 228    we don't need this definition.  XXX These are now out-of-sync*/
 229 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 230 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 231 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 232
 233 #else
 234 /* ... so we use this as its faster. */
 235 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 236 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 237 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 238 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 239
 240 #endif
 241
 242 /*
 243   Search for mandatory following text node; for lookahead, the text must
 244   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 245 */
 246 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 247     while (JUMPABLE(rn)) { \
 248         const OPCODE type = OP(rn); \
 249         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 250             rn = NEXTOPER(NEXTOPER(rn)); \
 251         else if (type == PLUS) \
 252             rn = NEXTOPER(rn); \
 253         else if (type == IFMATCH) \
 254             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 255         else rn += NEXT_OFF(rn); \
 256     } \
 257 } STMT_END
 258
 259 #define SLAB_FIRST(s) (&(s)->states[0])
 260 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 261
 262 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 263 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 264 static regmatch_state * S_push_slab(pTHX);
 265
 266 #define REGCP_PAREN_ELEMS 3
 267 #define REGCP_OTHER_ELEMS 3
 268 #define REGCP_FRAME_ELEMS 1
 269 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 270  * are needed for the regexp context stack bookkeeping. */
 271
 272 STATIC CHECKPOINT
 273 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 274 {
 275     const int retval = PL_savestack_ix;
 276     const int paren_elems_to_push =
 277                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 278     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 279     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 280     I32 p;
 281     GET_RE_DEBUG_FLAGS_DECL;
 282
 283     PERL_ARGS_ASSERT_REGCPPUSH;
 284
 285     if (paren_elems_to_push < 0)
 286         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 287                    (int)paren_elems_to_push, (int)maxopenparen,
 288                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 289
 290     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 291         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 292                    " out of range (%lu-%ld)",
 293                    total_elems,
 294                    (unsigned long)maxopenparen,
 295                    (long)parenfloor);
 296
 297     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 298
 299     DEBUG_BUFFERS_r(
 300         if ((int)maxopenparen > (int)parenfloor)
 301             PerlIO_printf(Perl_debug_log,
 302                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 303                 PTR2UV(rex),
 304                 PTR2UV(rex->offs)
 305             );
 306     );
 307     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 308 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 309         SSPUSHIV(rex->offs[p].end);
 310         SSPUSHIV(rex->offs[p].start);
 311         SSPUSHINT(rex->offs[p].start_tmp);
 312         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 313             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 314             (UV)p,
 315             (IV)rex->offs[p].start,
 316             (IV)rex->offs[p].start_tmp,
 317             (IV)rex->offs[p].end
 318         ));
 319     }
 320 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 321     SSPUSHINT(maxopenparen);
 322     SSPUSHINT(rex->lastparen);
 323     SSPUSHINT(rex->lastcloseparen);
 324     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 325
 326     return retval;
 327 }
 328
 329 /* These are needed since we do not localize EVAL nodes: */
 330 #define REGCP_SET(cp)                                           \
 331     DEBUG_STATE_r(                                              \
 332             PerlIO_printf(Perl_debug_log,                       \
 333                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 334                 (IV)PL_savestack_ix));                          \
 335     cp = PL_savestack_ix
 336
 337 #define REGCP_UNWIND(cp)                                        \
 338     DEBUG_STATE_r(                                              \
 339         if (cp != PL_savestack_ix)                              \
 340             PerlIO_printf(Perl_debug_log,                       \
 341                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 342                 (IV)(cp), (IV)PL_savestack_ix));                \
 343     regcpblow(cp)
 344
 345 #define UNWIND_PAREN(lp, lcp)               \
 346     for (n = rex->lastparen; n > lp; n--)   \
 347         rex->offs[n].end = -1;              \
 348     rex->lastparen = n;                     \
 349     rex->lastcloseparen = lcp;
 350
 351
 352 STATIC void
 353 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 354 {
 355     UV i;
 356     U32 paren;
 357     GET_RE_DEBUG_FLAGS_DECL;
 358
 359     PERL_ARGS_ASSERT_REGCPPOP;
 360
 361     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 362     i = SSPOPUV;
 363     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 364     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 365     rex->lastcloseparen = SSPOPINT;
 366     rex->lastparen = SSPOPINT;
 367     *maxopenparen_p = SSPOPINT;
 368
 369     i -= REGCP_OTHER_ELEMS;
 370     /* Now restore the parentheses context. */
 371     DEBUG_BUFFERS_r(
 372         if (i || rex->lastparen + 1 <= rex->nparens)
 373             PerlIO_printf(Perl_debug_log,
 374                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 375                 PTR2UV(rex),
 376                 PTR2UV(rex->offs)
 377             );
 378     );
 379     paren = *maxopenparen_p;
 380     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 381         SSize_t tmps;
 382         rex->offs[paren].start_tmp = SSPOPINT;
 383         rex->offs[paren].start = SSPOPIV;
 384         tmps = SSPOPIV;
 385         if (paren <= rex->lastparen)
 386             rex->offs[paren].end = tmps;
 387         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 388             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 389             (UV)paren,
 390             (IV)rex->offs[paren].start,
 391             (IV)rex->offs[paren].start_tmp,
 392             (IV)rex->offs[paren].end,
 393             (paren > rex->lastparen ? "(skipped)" : ""));
 394         );
 395         paren--;
 396     }
 397 #if 1
 398     /* It would seem that the similar code in regtry()
 399      * already takes care of this, and in fact it is in
 400      * a better location to since this code can #if 0-ed out
 401      * but the code in regtry() is needed or otherwise tests
 402      * requiring null fields (pat.t#187 and split.t#{13,14}
 403      * (as of patchlevel 7877)  will fail.  Then again,
 404      * this code seems to be necessary or otherwise
 405      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 406      * --jhi updated by dapm */
 407     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 408         if (i > *maxopenparen_p)
 409             rex->offs[i].start = -1;
 410         rex->offs[i].end = -1;
 411         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 412             "    \\%"UVuf": %s   ..-1 undeffing\n",
 413             (UV)i,
 414             (i > *maxopenparen_p) ? "-1" : "  "
 415         ));
 416     }
 417 #endif
 418 }
 419
 420 /* restore the parens and associated vars at savestack position ix,
 421  * but without popping the stack */
 422
 423 STATIC void
 424 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 425 {
 426     I32 tmpix = PL_savestack_ix;
 427     PL_savestack_ix = ix;
 428     regcppop(rex, maxopenparen_p);
 429     PL_savestack_ix = tmpix;
 430 }
 431
 432 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 433
 434 STATIC bool
 435 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 436 {
 437     /* Returns a boolean as to whether or not 'character' is a member of the
 438      * Posix character class given by 'classnum' that should be equivalent to a
 439      * value in the typedef '_char_class_number'.
 440      *
 441      * Ideally this could be replaced by a just an array of function pointers
 442      * to the C library functions that implement the macros this calls.
 443      * However, to compile, the precise function signatures are required, and
 444      * these may vary from platform to to platform.  To avoid having to figure
 445      * out what those all are on each platform, I (khw) am using this method,
 446      * which adds an extra layer of function call overhead (unless the C
 447      * optimizer strips it away).  But we don't particularly care about
 448      * performance with locales anyway. */
 449
 450     switch ((_char_class_number) classnum) {
 451         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 452         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 453         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 454         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 455         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 456                                         || isUPPER_LC(character);
 457         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 458         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 459         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 460         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 461         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 462         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 463         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 464         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 465         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 466         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 467         default:    /* VERTSPACE should never occur in locales */
 468             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 469     }
 470
 471     NOT_REACHED; /* NOTREACHED */
 472     return FALSE;
 473 }
 474
 475 STATIC bool
 476 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 477 {
 478     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 479      * 'character' is a member of the Posix character class given by 'classnum'
 480      * that should be equivalent to a value in the typedef
 481      * '_char_class_number'.
 482      *
 483      * This just calls isFOO_lc on the code point for the character if it is in
 484      * the range 0-255.  Outside that range, all characters use Unicode
 485      * rules, ignoring any locale.  So use the Unicode function if this class
 486      * requires a swash, and use the Unicode macro otherwise. */
 487
 488     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 489
 490     if (UTF8_IS_INVARIANT(*character)) {
 491         return isFOO_lc(classnum, *character);
 492     }
 493     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 494         return isFOO_lc(classnum,
 495                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 496     }
 497
 498     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 499
 500     if (classnum < _FIRST_NON_SWASH_CC) {
 501
 502         /* Initialize the swash unless done already */
 503         if (! PL_utf8_swash_ptrs[classnum]) {
 504             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 505             PL_utf8_swash_ptrs[classnum] =
 506                     _core_swash_init("utf8",
 507                                      "",
 508                                      &PL_sv_undef, 1, 0,
 509                                      PL_XPosix_ptrs[classnum], &flags);
 510         }
 511
 512         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 513                                  character,
 514                                  TRUE /* is UTF */ ));
 515     }
 516
 517     switch ((_char_class_number) classnum) {
 518         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 519         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 520         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 521         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 522         default:                 break;
 523     }
 524
 525     return FALSE; /* Things like CNTRL are always below 256 */
 526 }
 527
 528 /*
 529  * pregexec and friends
 530  */
 531
 532 #ifndef PERL_IN_XSUB_RE
 533 /*
 534  - pregexec - match a regexp against a string
 535  */
 536 I32
 537 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 538          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 539 /* stringarg: the point in the string at which to begin matching */
 540 /* strend:    pointer to null at end of string */
 541 /* strbeg:    real beginning of string */
 542 /* minend:    end of match must be >= minend bytes after stringarg. */
 543 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 544  *            itself is accessed via the pointers above */
 545 /* nosave:    For optimizations. */
 546 {
 547     PERL_ARGS_ASSERT_PREGEXEC;
 548
 549     return
 550         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 551                       nosave ? 0 : REXEC_COPY_STR);
 552 }
 553 #endif
 554
 555
 556
 557 /* re_intuit_start():
 558  *
 559  * Based on some optimiser hints, try to find the earliest position in the
 560  * string where the regex could match.
 561  *
 562  *   rx:     the regex to match against
 563  *   sv:     the SV being matched: only used for utf8 flag; the string
 564  *           itself is accessed via the pointers below. Note that on
 565  *           something like an overloaded SV, SvPOK(sv) may be false
 566  *           and the string pointers may point to something unrelated to
 567  *           the SV itself.
 568  *   strbeg: real beginning of string
 569  *   strpos: the point in the string at which to begin matching
 570  *   strend: pointer to the byte following the last char of the string
 571  *   flags   currently unused; set to 0
 572  *   data:   currently unused; set to NULL
 573  *
 574  * The basic idea of re_intuit_start() is to use some known information
 575  * about the pattern, namely:
 576  *
 577  *   a) the longest known anchored substring (i.e. one that's at a
 578  *      constant offset from the beginning of the pattern; but not
 579  *      necessarily at a fixed offset from the beginning of the
 580  *      string);
 581  *   b) the longest floating substring (i.e. one that's not at a constant
 582  *      offset from the beginning of the pattern);
 583  *   c) Whether the pattern is anchored to the string; either
 584  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 585  *      or anchored to pos(): /\G/;
 586  *   d) A start class: a real or synthetic character class which
 587  *      represents which characters are legal at the start of the pattern;
 588  *
 589  * to either quickly reject the match, or to find the earliest position
 590  * within the string at which the pattern might match, thus avoiding
 591  * running the full NFA engine at those earlier locations, only to
 592  * eventually fail and retry further along.
 593  *
 594  * Returns NULL if the pattern can't match, or returns the address within
 595  * the string which is the earliest place the match could occur.
 596  *
 597  * The longest of the anchored and floating substrings is called 'check'
 598  * and is checked first. The other is called 'other' and is checked
 599  * second. The 'other' substring may not be present.  For example,
 600  *
 601  *    /(abc|xyz)ABC\d{0,3}DEFG/
 602  *
 603  * will have
 604  *
 605  *   check substr (float)    = "DEFG", offset 6..9 chars
 606  *   other substr (anchored) = "ABC",  offset 3..3 chars
 607  *   stclass = [ax]
 608  *
 609  * Be aware that during the course of this function, sometimes 'anchored'
 610  * refers to a substring being anchored relative to the start of the
 611  * pattern, and sometimes to the pattern itself being anchored relative to
 612  * the string. For example:
 613  *
 614  *   /\dabc/:   "abc" is anchored to the pattern;
 615  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 616  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 617  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 618  *                    but the pattern is anchored to the string.
 619  */
 620
 621 char *
 622 Perl_re_intuit_start(pTHX_
 623                     REGEXP * const rx,
 624                     SV *sv,
 625                     const char * const strbeg,
 626                     char *strpos,
 627                     char *strend,
 628                     const U32 flags,
 629                     re_scream_pos_data *data)
 630 {
 631     struct regexp *const prog = ReANY(rx);
 632     SSize_t start_shift = prog->check_offset_min;
 633     /* Should be nonnegative! */
 634     SSize_t end_shift   = 0;
 635     /* current lowest pos in string where the regex can start matching */
 636     char *rx_origin = strpos;
 637     SV *check;
 638     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 639     U8   other_ix = 1 - prog->substrs->check_ix;
 640     bool ml_anch = 0;
 641     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 642     char *check_at = NULL;              /* check substr found at this pos */
 643     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 644     RXi_GET_DECL(prog,progi);
 645     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 646     regmatch_info *const reginfo = &reginfo_buf;
 647     GET_RE_DEBUG_FLAGS_DECL;
 648
 649     PERL_ARGS_ASSERT_RE_INTUIT_START;
 650     PERL_UNUSED_ARG(flags);
 651     PERL_UNUSED_ARG(data);
 652
 653     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 654                 "Intuit: trying to determine minimum start position...\n"));
 655
 656     /* for now, assume that all substr offsets are positive. If at some point
 657      * in the future someone wants to do clever things with look-behind and
 658      * -ve offsets, they'll need to fix up any code in this function
 659      * which uses these offsets. See the thread beginning
 660      * <20140113145929.GF27210@iabyn.com>
 661      */
 662     assert(prog->substrs->data[0].min_offset >= 0);
 663     assert(prog->substrs->data[0].max_offset >= 0);
 664     assert(prog->substrs->data[1].min_offset >= 0);
 665     assert(prog->substrs->data[1].max_offset >= 0);
 666     assert(prog->substrs->data[2].min_offset >= 0);
 667     assert(prog->substrs->data[2].max_offset >= 0);
 668
 669     /* for now, assume that if both present, that the floating substring
 670      * doesn't start before the anchored substring.
 671      * If you break this assumption (e.g. doing better optimisations
 672      * with lookahead/behind), then you'll need to audit the code in this
 673      * function carefully first
 674      */
 675     assert(
 676             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 677               && (prog->float_utf8    || prog->float_substr))
 678            || (prog->float_min_offset >= prog->anchored_offset));
 679
 680     /* byte rather than char calculation for efficiency. It fails
 681      * to quickly reject some cases that can't match, but will reject
 682      * them later after doing full char arithmetic */
 683     if (prog->minlen > strend - strpos) {
 684         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 685                               "  String too short...\n"));
 686         goto fail;
 687     }
 688
 689     RX_MATCH_UTF8_set(rx,utf8_target);
 690     reginfo->is_utf8_target = cBOOL(utf8_target);
 691     reginfo->info_aux = NULL;
 692     reginfo->strbeg = strbeg;
 693     reginfo->strend = strend;
 694     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 695     reginfo->intuit = 1;
 696     /* not actually used within intuit, but zero for safety anyway */
 697     reginfo->poscache_maxiter = 0;
 698
 699     if (utf8_target) {
 700         if (!prog->check_utf8 && prog->check_substr)
 701             to_utf8_substr(prog);
 702         check = prog->check_utf8;
 703     } else {
 704         if (!prog->check_substr && prog->check_utf8) {
 705             if (! to_byte_substr(prog)) {
 706                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 707             }
 708         }
 709         check = prog->check_substr;
 710     }
 711
 712     /* dump the various substring data */
 713     DEBUG_OPTIMISE_MORE_r({
 714         int i;
 715         for (i=0; i<=2; i++) {
 716             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 717                                   : prog->substrs->data[i].substr);
 718             if (!sv)
 719                 continue;
 720
 721             PerlIO_printf(Perl_debug_log,
 722                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 723                 " useful=%"IVdf" utf8=%d [%s]\n",
 724                 i,
 725                 (IV)prog->substrs->data[i].min_offset,
 726                 (IV)prog->substrs->data[i].max_offset,
 727                 (IV)prog->substrs->data[i].end_shift,
 728                 BmUSEFUL(sv),
 729                 utf8_target ? 1 : 0,
 730                 SvPEEK(sv));
 731         }
 732     });
 733
 734     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 735
 736         /* ml_anch: check after \n?
 737          *
 738          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 739          * with /.*.../, these flags will have been added by the
 740          * compiler:
 741          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 742          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 743          */
 744         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 745                    && !(prog->intflags & PREGf_IMPLICIT);
 746
 747         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 748             /* we are only allowed to match at BOS or \G */
 749
 750             /* trivially reject if there's a BOS anchor and we're not at BOS.
 751              *
 752              * Note that we don't try to do a similar quick reject for
 753              * \G, since generally the caller will have calculated strpos
 754              * based on pos() and gofs, so the string is already correctly
 755              * anchored by definition; and handling the exceptions would
 756              * be too fiddly (e.g. REXEC_IGNOREPOS).
 757              */
 758             if (   strpos != strbeg
 759                 && (prog->intflags & PREGf_ANCH_SBOL))
 760             {
 761                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 762                                 "  Not at start...\n"));
 763                 goto fail;
 764             }
 765
 766             /* in the presence of an anchor, the anchored (relative to the
 767              * start of the regex) substr must also be anchored relative
 768              * to strpos. So quickly reject if substr isn't found there.
 769              * This works for \G too, because the caller will already have
 770              * subtracted gofs from pos, and gofs is the offset from the
 771              * \G to the start of the regex. For example, in /.abc\Gdef/,
 772              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 773              * caller will have set strpos=pos()-4; we look for the substr
 774              * at position pos()-4+1, which lines up with the "a" */
 775
 776             if (prog->check_offset_min == prog->check_offset_max) {
 777                 /* Substring at constant offset from beg-of-str... */
 778                 SSize_t slen = SvCUR(check);
 779                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 780
 781                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 782                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 783                     (IV)prog->check_offset_min));
 784
 785                 if (SvTAIL(check)) {
 786                     /* In this case, the regex is anchored at the end too.
 787                      * Unless it's a multiline match, the lengths must match
 788                      * exactly, give or take a \n.  NB: slen >= 1 since
 789                      * the last char of check is \n */
 790                     if (!multiline
 791                         && (   strend - s > slen
 792                             || strend - s < slen - 1
 793                             || (strend - s == slen && strend[-1] != '\n')))
 794                     {
 795                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 796                                             "  String too long...\n"));
 797                         goto fail_finish;
 798                     }
 799                     /* Now should match s[0..slen-2] */
 800                     slen--;
 801                 }
 802                 if (slen && (*SvPVX_const(check) != *s
 803                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 804                 {
 805                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 806                                     "  String not equal...\n"));
 807                     goto fail_finish;
 808                 }
 809
 810                 check_at = s;
 811                 goto success_at_start;
 812             }
 813         }
 814     }
 815
 816     end_shift = prog->check_end_shift;
 817
 818 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 819     if (end_shift < 0)
 820         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 821                    (IV)end_shift, RX_PRECOMP(prog));
 822 #endif
 823
 824   restart:
 825
 826     /* This is the (re)entry point of the main loop in this function.
 827      * The goal of this loop is to:
 828      * 1) find the "check" substring in the region rx_origin..strend
 829      *    (adjusted by start_shift / end_shift). If not found, reject
 830      *    immediately.
 831      * 2) If it exists, look for the "other" substr too if defined; for
 832      *    example, if the check substr maps to the anchored substr, then
 833      *    check the floating substr, and vice-versa. If not found, go
 834      *    back to (1) with rx_origin suitably incremented.
 835      * 3) If we find an rx_origin position that doesn't contradict
 836      *    either of the substrings, then check the possible additional
 837      *    constraints on rx_origin of /^.../m or a known start class.
 838      *    If these fail, then depending on which constraints fail, jump
 839      *    back to here, or to various other re-entry points further along
 840      *    that skip some of the first steps.
 841      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 842      *    substring. If the start position was determined to be at the
 843      *    beginning of the string  - so, not rejected, but not optimised,
 844      *    since we have to run regmatch from position 0 - decrement the
 845      *    BmUSEFUL() count. Otherwise increment it.
 846      */
 847
 848
 849     /* first, look for the 'check' substring */
 850
 851     {
 852         U8* start_point;
 853         U8* end_point;
 854
 855         DEBUG_OPTIMISE_MORE_r({
 856             PerlIO_printf(Perl_debug_log,
 857                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 858                 " Start shift: %"IVdf" End shift %"IVdf
 859                 " Real end Shift: %"IVdf"\n",
 860                 (IV)(rx_origin - strbeg),
 861                 (IV)prog->check_offset_min,
 862                 (IV)start_shift,
 863                 (IV)end_shift,
 864                 (IV)prog->check_end_shift);
 865         });
 866
 867         end_point = HOP3(strend, -end_shift, strbeg);
 868         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 869         if (!start_point)
 870             goto fail_finish;
 871
 872
 873         /* If the regex is absolutely anchored to either the start of the
 874          * string (SBOL) or to pos() (ANCH_GPOS), then
 875          * check_offset_max represents an upper bound on the string where
 876          * the substr could start. For the ANCH_GPOS case, we assume that
 877          * the caller of intuit will have already set strpos to
 878          * pos()-gofs, so in this case strpos + offset_max will still be
 879          * an upper bound on the substr.
 880          */
 881         if (!ml_anch
 882             && prog->intflags & PREGf_ANCH
 883             && prog->check_offset_max != SSize_t_MAX)
 884         {
 885             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 886             const char * const anchor =
 887                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 888
 889             /* do a bytes rather than chars comparison. It's conservative;
 890              * so it skips doing the HOP if the result can't possibly end
 891              * up earlier than the old value of end_point.
 892              */
 893             if ((char*)end_point - anchor > prog->check_offset_max) {
 894                 end_point = HOP3lim((U8*)anchor,
 895                                 prog->check_offset_max,
 896                                 end_point -len)
 897                             + len;
 898             }
 899         }
 900
 901         check_at = fbm_instr( start_point, end_point,
 902                       check, multiline ? FBMrf_MULTILINE : 0);
 903
 904         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 905             "  doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
 906             (IV)((char*)start_point - strbeg),
 907             (IV)((char*)end_point   - strbeg),
 908             (IV)(check_at ? check_at - strbeg : -1)
 909         ));
 910
 911         /* Update the count-of-usability, remove useless subpatterns,
 912             unshift s.  */
 913
 914         DEBUG_EXECUTE_r({
 915             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 916                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 917             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 918                               (check_at ? "Found" : "Did not find"),
 919                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 920                     ? "anchored" : "floating"),
 921                 quoted,
 922                 RE_SV_TAIL(check),
 923                 (check_at ? " at offset " : "...\n") );
 924         });
 925
 926         if (!check_at)
 927             goto fail_finish;
 928         /* set rx_origin to the minimum position where the regex could start
 929          * matching, given the constraint of the just-matched check substring.
 930          * But don't set it lower than previously.
 931          */
 932
 933         if (check_at - rx_origin > prog->check_offset_max)
 934             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 935         /* Finish the diagnostic message */
 936         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 937             "%ld (rx_origin now %"IVdf")...\n",
 938             (long)(check_at - strbeg),
 939             (IV)(rx_origin - strbeg)
 940         ));
 941     }
 942
 943
 944     /* now look for the 'other' substring if defined */
 945
 946     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 947                     : prog->substrs->data[other_ix].substr)
 948     {
 949         /* Take into account the "other" substring. */
 950         char *last, *last1;
 951         char *s;
 952         SV* must;
 953         struct reg_substr_datum *other;
 954
 955       do_other_substr:
 956         other = &prog->substrs->data[other_ix];
 957
 958         /* if "other" is anchored:
 959          * we've previously found a floating substr starting at check_at.
 960          * This means that the regex origin must lie somewhere
 961          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 962          * and max:                 HOP3(check_at, -check_offset_min)
 963          * (except that min will be >= strpos)
 964          * So the fixed  substr must lie somewhere between
 965          *  HOP3(min, anchored_offset)
 966          *  HOP3(max, anchored_offset) + SvCUR(substr)
 967          */
 968
 969         /* if "other" is floating
 970          * Calculate last1, the absolute latest point where the
 971          * floating substr could start in the string, ignoring any
 972          * constraints from the earlier fixed match. It is calculated
 973          * as follows:
 974          *
 975          * strend - prog->minlen (in chars) is the absolute latest
 976          * position within the string where the origin of the regex
 977          * could appear. The latest start point for the floating
 978          * substr is float_min_offset(*) on from the start of the
 979          * regex.  last1 simply combines thee two offsets.
 980          *
 981          * (*) You might think the latest start point should be
 982          * float_max_offset from the regex origin, and technically
 983          * you'd be correct. However, consider
 984          *    /a\d{2,4}bcd\w/
 985          * Here, float min, max are 3,5 and minlen is 7.
 986          * This can match either
 987          *    /a\d\dbcd\w/
 988          *    /a\d\d\dbcd\w/
 989          *    /a\d\d\d\dbcd\w/
 990          * In the first case, the regex matches minlen chars; in the
 991          * second, minlen+1, in the third, minlen+2.
 992          * In the first case, the floating offset is 3 (which equals
 993          * float_min), in the second, 4, and in the third, 5 (which
 994          * equals float_max). In all cases, the floating string bcd
 995          * can never start more than 4 chars from the end of the
 996          * string, which equals minlen - float_min. As the substring
 997          * starts to match more than float_min from the start of the
 998          * regex, it makes the regex match more than minlen chars,
 999          * and the two cancel each other out. So we can always use
1000          * float_min - minlen, rather than float_max - minlen for the
1001          * latest position in the string.
1002          *
1003          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1004          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1005          */
1006
1007         assert(prog->minlen >= other->min_offset);
1008         last1 = HOP3c(strend,
1009                         other->min_offset - prog->minlen, strbeg);
1010
1011         if (other_ix) {/* i.e. if (other-is-float) */
1012             /* last is the latest point where the floating substr could
1013              * start, *given* any constraints from the earlier fixed
1014              * match. This constraint is that the floating string starts
1015              * <= float_max_offset chars from the regex origin (rx_origin).
1016              * If this value is less than last1, use it instead.
1017              */
1018             assert(rx_origin <= last1);
1019             last =
1020                 /* this condition handles the offset==infinity case, and
1021                  * is a short-cut otherwise. Although it's comparing a
1022                  * byte offset to a char length, it does so in a safe way,
1023                  * since 1 char always occupies 1 or more bytes,
1024                  * so if a string range is  (last1 - rx_origin) bytes,
1025                  * it will be less than or equal to  (last1 - rx_origin)
1026                  * chars; meaning it errs towards doing the accurate HOP3
1027                  * rather than just using last1 as a short-cut */
1028                 (last1 - rx_origin) < other->max_offset
1029                     ? last1
1030                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1031         }
1032         else {
1033             assert(strpos + start_shift <= check_at);
1034             last = HOP4c(check_at, other->min_offset - start_shift,
1035                         strbeg, strend);
1036         }
1037
1038         s = HOP3c(rx_origin, other->min_offset, strend);
1039         if (s < other_last)     /* These positions already checked */
1040             s = other_last;
1041
1042         must = utf8_target ? other->utf8_substr : other->substr;
1043         assert(SvPOK(must));
1044         {
1045             char *from = s;
1046             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1047
1048             if (from > to) {
1049                 s = NULL;
1050                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1051                     "  skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1052                     (IV)(from - strbeg),
1053                     (IV)(to   - strbeg)
1054                 ));
1055             }
1056             else {
1057                 s = fbm_instr(
1058                     (unsigned char*)from,
1059                     (unsigned char*)to,
1060                     must,
1061                     multiline ? FBMrf_MULTILINE : 0
1062                 );
1063                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1064                     "  doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1065                     (IV)(from - strbeg),
1066                     (IV)(to   - strbeg),
1067                     (IV)(s ? s - strbeg : -1)
1068                 ));
1069             }
1070         }
1071
1072         DEBUG_EXECUTE_r({
1073             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1074                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1075             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1076                 s ? "Found" : "Contradicts",
1077                 other_ix ? "floating" : "anchored",
1078                 quoted, RE_SV_TAIL(must));
1079         });
1080
1081
1082         if (!s) {
1083             /* last1 is latest possible substr location. If we didn't
1084              * find it before there, we never will */
1085             if (last >= last1) {
1086                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1087                                         "; giving up...\n"));
1088                 goto fail_finish;
1089             }
1090
1091             /* try to find the check substr again at a later
1092              * position. Maybe next time we'll find the "other" substr
1093              * in range too */
1094             other_last = HOP3c(last, 1, strend) /* highest failure */;
1095             rx_origin =
1096                 other_ix /* i.e. if other-is-float */
1097                     ? HOP3c(rx_origin, 1, strend)
1098                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1099             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1100                 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1101                 (other_ix ? "floating" : "anchored"),
1102                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1103                 (IV)(rx_origin - strbeg)
1104             ));
1105             goto restart;
1106         }
1107         else {
1108             if (other_ix) { /* if (other-is-float) */
1109                 /* other_last is set to s, not s+1, since its possible for
1110                  * a floating substr to fail first time, then succeed
1111                  * second time at the same floating position; e.g.:
1112                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1113                  * The first time round, anchored and float match at
1114                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1115                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1116                  */
1117                 other_last = s;
1118             }
1119             else {
1120                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1121                 other_last = HOP3c(s, 1, strend);
1122             }
1123             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1124                 " at offset %ld (rx_origin now %"IVdf")...\n",
1125                   (long)(s - strbeg),
1126                 (IV)(rx_origin - strbeg)
1127               ));
1128
1129         }
1130     }
1131     else {
1132         DEBUG_OPTIMISE_MORE_r(
1133             PerlIO_printf(Perl_debug_log,
1134                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1135                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1136                 " strend:%"IVdf"\n",
1137                 (IV)prog->check_offset_min,
1138                 (IV)prog->check_offset_max,
1139                 (IV)(check_at-strbeg),
1140                 (IV)(rx_origin-strbeg),
1141                 (IV)(rx_origin-check_at),
1142                 (IV)(strend-strbeg)
1143             )
1144         );
1145     }
1146
1147   postprocess_substr_matches:
1148
1149     /* handle the extra constraint of /^.../m if present */
1150
1151     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1152         char *s;
1153
1154         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155                         "  looking for /^/m anchor"));
1156
1157         /* we have failed the constraint of a \n before rx_origin.
1158          * Find the next \n, if any, even if it's beyond the current
1159          * anchored and/or floating substrings. Whether we should be
1160          * scanning ahead for the next \n or the next substr is debatable.
1161          * On the one hand you'd expect rare substrings to appear less
1162          * often than \n's. On the other hand, searching for \n means
1163          * we're effectively flipping between check_substr and "\n" on each
1164          * iteration as the current "rarest" string candidate, which
1165          * means for example that we'll quickly reject the whole string if
1166          * hasn't got a \n, rather than trying every substr position
1167          * first
1168          */
1169
1170         s = HOP3c(strend, - prog->minlen, strpos);
1171         if (s <= rx_origin ||
1172             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1173         {
1174             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175                             "  Did not find /%s^%s/m...\n",
1176                             PL_colors[0], PL_colors[1]));
1177             goto fail_finish;
1178         }
1179
1180         /* earliest possible origin is 1 char after the \n.
1181          * (since *rx_origin == '\n', it's safe to ++ here rather than
1182          * HOP(rx_origin, 1)) */
1183         rx_origin++;
1184
1185         if (prog->substrs->check_ix == 0  /* check is anchored */
1186             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1187         {
1188             /* Position contradicts check-string; either because
1189              * check was anchored (and thus has no wiggle room),
1190              * or check was float and rx_origin is above the float range */
1191             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1192                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1193                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1194             goto restart;
1195         }
1196
1197         /* if we get here, the check substr must have been float,
1198          * is in range, and we may or may not have had an anchored
1199          * "other" substr which still contradicts */
1200         assert(prog->substrs->check_ix); /* check is float */
1201
1202         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1203             /* whoops, the anchored "other" substr exists, so we still
1204              * contradict. On the other hand, the float "check" substr
1205              * didn't contradict, so just retry the anchored "other"
1206              * substr */
1207             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1208                 "  Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
1209                 PL_colors[0], PL_colors[1],
1210                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1211                 (IV)(rx_origin - strbeg)
1212             ));
1213             goto do_other_substr;
1214         }
1215
1216         /* success: we don't contradict the found floating substring
1217          * (and there's no anchored substr). */
1218         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1219             "  Found /%s^%s/m with rx_origin %ld...\n",
1220             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1221     }
1222     else {
1223         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1224             "  (multiline anchor test skipped)\n"));
1225     }
1226
1227   success_at_start:
1228
1229
1230     /* if we have a starting character class, then test that extra constraint.
1231      * (trie stclasses are too expensive to use here, we are better off to
1232      * leave it to regmatch itself) */
1233
1234     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1235         const U8* const str = (U8*)STRING(progi->regstclass);
1236
1237         /* XXX this value could be pre-computed */
1238         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1239                     ?  (reginfo->is_utf8_pat
1240                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1241                         : STR_LEN(progi->regstclass))
1242                     : 1);
1243         char * endpos;
1244         char *s;
1245         /* latest pos that a matching float substr constrains rx start to */
1246         char *rx_max_float = NULL;
1247
1248         /* if the current rx_origin is anchored, either by satisfying an
1249          * anchored substring constraint, or a /^.../m constraint, then we
1250          * can reject the current origin if the start class isn't found
1251          * at the current position. If we have a float-only match, then
1252          * rx_origin is constrained to a range; so look for the start class
1253          * in that range. if neither, then look for the start class in the
1254          * whole rest of the string */
1255
1256         /* XXX DAPM it's not clear what the minlen test is for, and why
1257          * it's not used in the floating case. Nothing in the test suite
1258          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1259          * Here are some old comments, which may or may not be correct:
1260          *
1261          *   minlen == 0 is possible if regstclass is \b or \B,
1262          *   and the fixed substr is ''$.
1263          *   Since minlen is already taken into account, rx_origin+1 is
1264          *   before strend; accidentally, minlen >= 1 guaranties no false
1265          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1266          *   0) below assumes that regstclass does not come from lookahead...
1267          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1268          *   This leaves EXACTF-ish only, which are dealt with in
1269          *   find_byclass().
1270          */
1271
1272         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1273             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1274         else if (prog->float_substr || prog->float_utf8) {
1275             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1276             endpos= HOP3c(rx_max_float, cl_l, strend);
1277         }
1278         else
1279             endpos= strend;
1280
1281         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1282             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1283             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1284               (IV)start_shift, (IV)(check_at - strbeg),
1285               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1286
1287         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1288                             reginfo);
1289         if (!s) {
1290             if (endpos == strend) {
1291                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1292                                 "  Could not match STCLASS...\n") );
1293                 goto fail;
1294             }
1295             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1296                                "  This position contradicts STCLASS...\n") );
1297             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1298                         && !(prog->intflags & PREGf_IMPLICIT))
1299                 goto fail;
1300
1301             /* Contradict one of substrings */
1302             if (prog->anchored_substr || prog->anchored_utf8) {
1303                 if (prog->substrs->check_ix == 1) { /* check is float */
1304                     /* Have both, check_string is floating */
1305                     assert(rx_origin + start_shift <= check_at);
1306                     if (rx_origin + start_shift != check_at) {
1307                         /* not at latest position float substr could match:
1308                          * Recheck anchored substring, but not floating.
1309                          * The condition above is in bytes rather than
1310                          * chars for efficiency. It's conservative, in
1311                          * that it errs on the side of doing 'goto
1312                          * do_other_substr'. In this case, at worst,
1313                          * an extra anchored search may get done, but in
1314                          * practice the extra fbm_instr() is likely to
1315                          * get skipped anyway. */
1316                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1317                             "  about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1318                             (long)(other_last - strbeg),
1319                             (IV)(rx_origin - strbeg)
1320                         ));
1321                         goto do_other_substr;
1322                     }
1323                 }
1324             }
1325             else {
1326                 /* float-only */
1327
1328                 if (ml_anch) {
1329                     /* In the presence of ml_anch, we might be able to
1330                      * find another \n without breaking the current float
1331                      * constraint. */
1332
1333                     /* strictly speaking this should be HOP3c(..., 1, ...),
1334                      * but since we goto a block of code that's going to
1335                      * search for the next \n if any, its safe here */
1336                     rx_origin++;
1337                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1338                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1339                               PL_colors[0], PL_colors[1],
1340                               (long)(rx_origin - strbeg)) );
1341                     goto postprocess_substr_matches;
1342                 }
1343
1344                 /* strictly speaking this can never be true; but might
1345                  * be if we ever allow intuit without substrings */
1346                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1347                     goto fail;
1348
1349                 rx_origin = rx_max_float;
1350             }
1351
1352             /* at this point, any matching substrings have been
1353              * contradicted. Start again... */
1354
1355             rx_origin = HOP3c(rx_origin, 1, strend);
1356
1357             /* uses bytes rather than char calculations for efficiency.
1358              * It's conservative: it errs on the side of doing 'goto restart',
1359              * where there is code that does a proper char-based test */
1360             if (rx_origin + start_shift + end_shift > strend) {
1361                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1362                                        "  Could not match STCLASS...\n") );
1363                 goto fail;
1364             }
1365             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1366                 "  about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1367                 (prog->substrs->check_ix ? "floating" : "anchored"),
1368                 (long)(rx_origin + start_shift - strbeg),
1369                 (IV)(rx_origin - strbeg)
1370             ));
1371             goto restart;
1372         }
1373
1374         /* Success !!! */
1375
1376         if (rx_origin != s) {
1377             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1378                         "  By STCLASS: moving %ld --> %ld\n",
1379                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1380                    );
1381         }
1382         else {
1383             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1384                                   "  Does not contradict STCLASS...\n");
1385                    );
1386         }
1387     }
1388
1389     /* Decide whether using the substrings helped */
1390
1391     if (rx_origin != strpos) {
1392         /* Fixed substring is found far enough so that the match
1393            cannot start at strpos. */
1394
1395         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1396         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1397     }
1398     else {
1399         /* The found rx_origin position does not prohibit matching at
1400          * strpos, so calling intuit didn't gain us anything. Decrement
1401          * the BmUSEFUL() count on the check substring, and if we reach
1402          * zero, free it.  */
1403         if (!(prog->intflags & PREGf_NAUGHTY)
1404             && (utf8_target ? (
1405                 prog->check_utf8                /* Could be deleted already */
1406                 && --BmUSEFUL(prog->check_utf8) < 0
1407                 && (prog->check_utf8 == prog->float_utf8)
1408             ) : (
1409                 prog->check_substr              /* Could be deleted already */
1410                 && --BmUSEFUL(prog->check_substr) < 0
1411                 && (prog->check_substr == prog->float_substr)
1412             )))
1413         {
1414             /* If flags & SOMETHING - do not do it many times on the same match */
1415             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1416             /* XXX Does the destruction order has to change with utf8_target? */
1417             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1418             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1419             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1420             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1421             check = NULL;                       /* abort */
1422             /* XXXX This is a remnant of the old implementation.  It
1423                     looks wasteful, since now INTUIT can use many
1424                     other heuristics. */
1425             prog->extflags &= ~RXf_USE_INTUIT;
1426         }
1427     }
1428
1429     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1430             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1431              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1432
1433     return rx_origin;
1434
1435   fail_finish:                          /* Substring not found */
1436     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1437         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1438   fail:
1439     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1440                           PL_colors[4], PL_colors[5]));
1441     return NULL;
1442 }
1443
1444
1445 #define DECL_TRIE_TYPE(scan) \
1446     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1447                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1448                  trie_utf8l, trie_flu8 }                                            \
1449                     trie_type = ((scan->flags == EXACT)                             \
1450                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1451                                  : (scan->flags == EXACTL)                          \
1452                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1453                                     : (scan->flags == EXACTFA)                      \
1454                                       ? (utf8_target                                \
1455                                          ? trie_utf8_exactfa_fold                   \
1456                                          : trie_latin_utf8_exactfa_fold)            \
1457                                       : (scan->flags == EXACTFLU8                   \
1458                                          ? trie_flu8                                \
1459                                          : (utf8_target                             \
1460                                            ? trie_utf8_fold                         \
1461                                            :   trie_latin_utf8_fold)))
1462
1463 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1464 STMT_START {                                                                        \
1465     STRLEN skiplen;                                                                 \
1466     U8 flags = FOLD_FLAGS_FULL;                                                     \
1467     switch (trie_type) {                                                            \
1468     case trie_flu8:                                                                 \
1469         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1470         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1471             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1472         }                                                                           \
1473         goto do_trie_utf8_fold;                                                     \
1474     case trie_utf8_exactfa_fold:                                                    \
1475         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1476         /* FALLTHROUGH */                                                           \
1477     case trie_utf8_fold:                                                            \
1478       do_trie_utf8_fold:                                                            \
1479         if ( foldlen>0 ) {                                                          \
1480             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1481             foldlen -= len;                                                         \
1482             uscan += len;                                                           \
1483             len=0;                                                                  \
1484         } else {                                                                    \
1485             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1486             len = UTF8SKIP(uc);                                                     \
1487             skiplen = UVCHR_SKIP( uvc );                                            \
1488             foldlen -= skiplen;                                                     \
1489             uscan = foldbuf + skiplen;                                              \
1490         }                                                                           \
1491         break;                                                                      \
1492     case trie_latin_utf8_exactfa_fold:                                              \
1493         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1494         /* FALLTHROUGH */                                                           \
1495     case trie_latin_utf8_fold:                                                      \
1496         if ( foldlen>0 ) {                                                          \
1497             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1498             foldlen -= len;                                                         \
1499             uscan += len;                                                           \
1500             len=0;                                                                  \
1501         } else {                                                                    \
1502             len = 1;                                                                \
1503             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1504             skiplen = UVCHR_SKIP( uvc );                                            \
1505             foldlen -= skiplen;                                                     \
1506             uscan = foldbuf + skiplen;                                              \
1507         }                                                                           \
1508         break;                                                                      \
1509     case trie_utf8l:                                                                \
1510         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1511         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1512             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1513         }                                                                           \
1514         /* FALLTHROUGH */                                                           \
1515     case trie_utf8:                                                                 \
1516         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1517         break;                                                                      \
1518     case trie_plain:                                                                \
1519         uvc = (UV)*uc;                                                              \
1520         len = 1;                                                                    \
1521     }                                                                               \
1522     if (uvc < 256) {                                                                \
1523         charid = trie->charmap[ uvc ];                                              \
1524     }                                                                               \
1525     else {                                                                          \
1526         charid = 0;                                                                 \
1527         if (widecharmap) {                                                          \
1528             SV** const svpp = hv_fetch(widecharmap,                                 \
1529                         (char*)&uvc, sizeof(UV), 0);                                \
1530             if (svpp)                                                               \
1531                 charid = (U16)SvIV(*svpp);                                          \
1532         }                                                                           \
1533     }                                                                               \
1534 } STMT_END
1535
1536 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1537     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1538                 startpos, doutf8)
1539
1540 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1541 STMT_START {                                              \
1542     while (s <= e) {                                      \
1543         if ( (COND)                                       \
1544              && (ln == 1 || folder(s, pat_string, ln))    \
1545              && (reginfo->intuit || regtry(reginfo, &s)) )\
1546             goto got_it;                                  \
1547         s++;                                              \
1548     }                                                     \
1549 } STMT_END
1550
1551 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1552 STMT_START {                                          \
1553     while (s < strend) {                              \
1554         CODE                                          \
1555         s += UTF8SKIP(s);                             \
1556     }                                                 \
1557 } STMT_END
1558
1559 #define REXEC_FBC_SCAN(CODE)                          \
1560 STMT_START {                                          \
1561     while (s < strend) {                              \
1562         CODE                                          \
1563         s++;                                          \
1564     }                                                 \
1565 } STMT_END
1566
1567 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1568 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1569     if (COND) {                                                \
1570         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1571             goto got_it;                                       \
1572         else                                                   \
1573             tmp = doevery;                                     \
1574     }                                                          \
1575     else                                                       \
1576         tmp = 1;                                               \
1577 )
1578
1579 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1580 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1581     if (COND) {                                                \
1582         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1583             goto got_it;                                       \
1584         else                                                   \
1585             tmp = doevery;                                     \
1586     }                                                          \
1587     else                                                       \
1588         tmp = 1;                                               \
1589 )
1590
1591 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1592     if (utf8_target) {                                         \
1593         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1594     }                                                          \
1595     else {                                                     \
1596         REXEC_FBC_CLASS_SCAN(COND);                            \
1597     }
1598
1599 /* The three macros below are slightly different versions of the same logic.
1600  *
1601  * The first is for /a and /aa when the target string is UTF-8.  This can only
1602  * match ascii, but it must advance based on UTF-8.   The other two handle the
1603  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1604  * for the boundary (or non-boundary) between a word and non-word character.
1605  * The utf8 and non-utf8 cases have the same logic, but the details must be
1606  * different.  Find the "wordness" of the character just prior to this one, and
1607  * compare it with the wordness of this one.  If they differ, we have a
1608  * boundary.  At the beginning of the string, pretend that the previous
1609  * character was a new-line.
1610  *
1611  * All these macros uncleanly have side-effects with each other and outside
1612  * variables.  So far it's been too much trouble to clean-up
1613  *
1614  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1615  *               a word character or not.
1616  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1617  *               word/non-word
1618  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1619  *
1620  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1621  * are looking for a boundary or for a non-boundary.  If we are looking for a
1622  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1623  * see if this tentative match actually works, and if so, to quit the loop
1624  * here.  And vice-versa if we are looking for a non-boundary.
1625  *
1626  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1627  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1628  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1629  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1630  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1631  * complement.  But in that branch we complement tmp, meaning that at the
1632  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1633  * which means at the top of the loop in the next iteration, it is
1634  * TEST_NON_UTF8(s-1) */
1635 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1636     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1637     tmp = TEST_NON_UTF8(tmp);                                                  \
1638     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1639         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1640             tmp = !tmp;                                                        \
1641             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1642         }                                                                      \
1643         else {                                                                 \
1644             IF_FAIL;                                                           \
1645         }                                                                      \
1646     );                                                                         \
1647
1648 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1649  * TEST_UTF8 is a macro that for the same input code points returns identically
1650  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1651 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1652     if (s == reginfo->strbeg) {                                                \
1653         tmp = '\n';                                                            \
1654     }                                                                          \
1655     else { /* Back-up to the start of the previous character */                \
1656         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1657         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1658                                                        0, UTF8_ALLOW_DEFAULT); \
1659     }                                                                          \
1660     tmp = TEST_UV(tmp);                                                        \
1661     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1662     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1663         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1664             tmp = !tmp;                                                        \
1665             IF_SUCCESS;                                                        \
1666         }                                                                      \
1667         else {                                                                 \
1668             IF_FAIL;                                                           \
1669         }                                                                      \
1670     );
1671
1672 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1673  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1674  * macros below */
1675 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1676     if (utf8_target) {                                                         \
1677         UTF8_CODE                                                              \
1678     }                                                                          \
1679     else {  /* Not utf8 */                                                     \
1680         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1681         tmp = TEST_NON_UTF8(tmp);                                              \
1682         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1683             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1684                 IF_SUCCESS;                                                    \
1685                 tmp = !tmp;                                                    \
1686             }                                                                  \
1687             else {                                                             \
1688                 IF_FAIL;                                                       \
1689             }                                                                  \
1690         );                                                                     \
1691     }                                                                          \
1692     /* Here, things have been set up by the previous code so that tmp is the   \
1693      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1694      * utf8ness of the target).  We also have to check if this matches against \
1695      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1696      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1697      * string */                                                               \
1698     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1699         IF_SUCCESS;                                                            \
1700     }                                                                          \
1701     else {                                                                     \
1702         IF_FAIL;                                                               \
1703     }
1704
1705 /* This is the macro to use when we want to see if something that looks like it
1706  * could match, actually does, and if so exits the loop */
1707 #define REXEC_FBC_TRYIT                            \
1708     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1709         goto got_it
1710
1711 /* The only difference between the BOUND and NBOUND cases is that
1712  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1713  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1714  * with the other one being empty (PLACEHOLDER is defined as empty).
1715  *
1716  * The TEST_FOO parameters are for operating on different forms of input, but
1717  * all should be ones that return identically for the same underlying code
1718  * points */
1719 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1720     FBC_BOUND_COMMON(                                                          \
1721           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1722           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1723
1724 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1725     FBC_BOUND_COMMON(                                                          \
1726             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1727             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1728
1729 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1730     FBC_BOUND_COMMON(                                                          \
1731           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1732           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1733
1734 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1735     FBC_BOUND_COMMON(                                                          \
1736             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1737             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1738
1739 /* Takes a pointer to an inversion list, a pointer to its corresponding
1740  * inversion map, and a code point, and returns the code point's value
1741  * according to the two arrays.  It assumes that all code points have a value.
1742  * This is used as the base macro for macros for particular properties */
1743 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1744                              invmap[_invlist_search(invlist, cp)]
1745
1746 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1747  * of a code point, returning the value for the first code point in the string.
1748  * And it takes the particular macro name that finds the desired value given a
1749  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1750 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1751              (__ASSERT_(pos < strend)                                          \
1752                  /* Note assumes is valid UTF-8 */                             \
1753              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1754
1755 /* Returns the GCB value for the input code point */
1756 #define getGCB_VAL_CP(cp)                                                      \
1757           _generic_GET_BREAK_VAL_CP(                                           \
1758                                     PL_GCB_invlist,                            \
1759                                     _Perl_GCB_invmap,                          \
1760                                     (cp))
1761
1762 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1763  * bounded by pos and strend */
1764 #define getGCB_VAL_UTF8(pos, strend)                                           \
1765     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1766
1767
1768 /* Returns the SB value for the input code point */
1769 #define getSB_VAL_CP(cp)                                                       \
1770           _generic_GET_BREAK_VAL_CP(                                           \
1771                                     PL_SB_invlist,                             \
1772                                     _Perl_SB_invmap,                     \
1773                                     (cp))
1774
1775 /* Returns the SB value for the first code point in the UTF-8 encoded string
1776  * bounded by pos and strend */
1777 #define getSB_VAL_UTF8(pos, strend)                                            \
1778     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1779
1780 /* Returns the WB value for the input code point */
1781 #define getWB_VAL_CP(cp)                                                       \
1782           _generic_GET_BREAK_VAL_CP(                                           \
1783                                     PL_WB_invlist,                             \
1784                                     _Perl_WB_invmap,                         \
1785                                     (cp))
1786
1787 /* Returns the WB value for the first code point in the UTF-8 encoded string
1788  * bounded by pos and strend */
1789 #define getWB_VAL_UTF8(pos, strend)                                            \
1790     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1791
1792 /* We know what class REx starts with.  Try to find this position... */
1793 /* if reginfo->intuit, its a dryrun */
1794 /* annoyingly all the vars in this routine have different names from their counterparts
1795    in regmatch. /grrr */
1796 STATIC char *
1797 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1798     const char *strend, regmatch_info *reginfo)
1799 {
1800     dVAR;
1801     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1802     char *pat_string;   /* The pattern's exactish string */
1803     char *pat_end;          /* ptr to end char of pat_string */
1804     re_fold_t folder;   /* Function for computing non-utf8 folds */
1805     const U8 *fold_array;   /* array for folding ords < 256 */
1806     STRLEN ln;
1807     STRLEN lnc;
1808     U8 c1;
1809     U8 c2;
1810     char *e;
1811     I32 tmp = 1;        /* Scratch variable? */
1812     const bool utf8_target = reginfo->is_utf8_target;
1813     UV utf8_fold_flags = 0;
1814     const bool is_utf8_pat = reginfo->is_utf8_pat;
1815     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1816                                    with a result inverts that result, as 0^1 =
1817                                    1 and 1^1 = 0 */
1818     _char_class_number classnum;
1819
1820     RXi_GET_DECL(prog,progi);
1821
1822     PERL_ARGS_ASSERT_FIND_BYCLASS;
1823
1824     /* We know what class it must start with. */
1825     switch (OP(c)) {
1826     case ANYOFL:
1827         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1828
1829         if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
1830             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1831         }
1832
1833         /* FALLTHROUGH */
1834     case ANYOFD:
1835     case ANYOF:
1836         if (utf8_target) {
1837             REXEC_FBC_UTF8_CLASS_SCAN(
1838                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1839         }
1840         else {
1841             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1842         }
1843         break;
1844
1845     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1846         assert(! is_utf8_pat);
1847         /* FALLTHROUGH */
1848     case EXACTFA:
1849         if (is_utf8_pat || utf8_target) {
1850             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1851             goto do_exactf_utf8;
1852         }
1853         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1854         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1855         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1856
1857     case EXACTF:   /* This node only generated for non-utf8 patterns */
1858         assert(! is_utf8_pat);
1859         if (utf8_target) {
1860             utf8_fold_flags = 0;
1861             goto do_exactf_utf8;
1862         }
1863         fold_array = PL_fold;
1864         folder = foldEQ;
1865         goto do_exactf_non_utf8;
1866
1867     case EXACTFL:
1868         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1869         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1870             utf8_fold_flags = FOLDEQ_LOCALE;
1871             goto do_exactf_utf8;
1872         }
1873         fold_array = PL_fold_locale;
1874         folder = foldEQ_locale;
1875         goto do_exactf_non_utf8;
1876
1877     case EXACTFU_SS:
1878         if (is_utf8_pat) {
1879             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1880         }
1881         goto do_exactf_utf8;
1882
1883     case EXACTFLU8:
1884             if (! utf8_target) {    /* All code points in this node require
1885                                        UTF-8 to express.  */
1886                 break;
1887             }
1888             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1889                                              | FOLDEQ_S2_FOLDS_SANE;
1890             goto do_exactf_utf8;
1891
1892     case EXACTFU:
1893         if (is_utf8_pat || utf8_target) {
1894             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1895             goto do_exactf_utf8;
1896         }
1897
1898         /* Any 'ss' in the pattern should have been replaced by regcomp,
1899          * so we don't have to worry here about this single special case
1900          * in the Latin1 range */
1901         fold_array = PL_fold_latin1;
1902         folder = foldEQ_latin1;
1903
1904         /* FALLTHROUGH */
1905
1906       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1907                            are no glitches with fold-length differences
1908                            between the target string and pattern */
1909
1910         /* The idea in the non-utf8 EXACTF* cases is to first find the
1911          * first character of the EXACTF* node and then, if necessary,
1912          * case-insensitively compare the full text of the node.  c1 is the
1913          * first character.  c2 is its fold.  This logic will not work for
1914          * Unicode semantics and the german sharp ss, which hence should
1915          * not be compiled into a node that gets here. */
1916         pat_string = STRING(c);
1917         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1918
1919         /* We know that we have to match at least 'ln' bytes (which is the
1920          * same as characters, since not utf8).  If we have to match 3
1921          * characters, and there are only 2 availabe, we know without
1922          * trying that it will fail; so don't start a match past the
1923          * required minimum number from the far end */
1924         e = HOP3c(strend, -((SSize_t)ln), s);
1925
1926         if (reginfo->intuit && e < s) {
1927             e = s;                      /* Due to minlen logic of intuit() */
1928         }
1929
1930         c1 = *pat_string;
1931         c2 = fold_array[c1];
1932         if (c1 == c2) { /* If char and fold are the same */
1933             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1934         }
1935         else {
1936             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1937         }
1938         break;
1939
1940       do_exactf_utf8:
1941       {
1942         unsigned expansion;
1943
1944         /* If one of the operands is in utf8, we can't use the simpler folding
1945          * above, due to the fact that many different characters can have the
1946          * same fold, or portion of a fold, or different- length fold */
1947         pat_string = STRING(c);
1948         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1949         pat_end = pat_string + ln;
1950         lnc = is_utf8_pat       /* length to match in characters */
1951                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1952                 : ln;
1953
1954         /* We have 'lnc' characters to match in the pattern, but because of
1955          * multi-character folding, each character in the target can match
1956          * up to 3 characters (Unicode guarantees it will never exceed
1957          * this) if it is utf8-encoded; and up to 2 if not (based on the
1958          * fact that the Latin 1 folds are already determined, and the
1959          * only multi-char fold in that range is the sharp-s folding to
1960          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1961          * string character.  Adjust lnc accordingly, rounding up, so that
1962          * if we need to match at least 4+1/3 chars, that really is 5. */
1963         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1964         lnc = (lnc + expansion - 1) / expansion;
1965
1966         /* As in the non-UTF8 case, if we have to match 3 characters, and
1967          * only 2 are left, it's guaranteed to fail, so don't start a
1968          * match that would require us to go beyond the end of the string
1969          */
1970         e = HOP3c(strend, -((SSize_t)lnc), s);
1971
1972         if (reginfo->intuit && e < s) {
1973             e = s;                      /* Due to minlen logic of intuit() */
1974         }
1975
1976         /* XXX Note that we could recalculate e to stop the loop earlier,
1977          * as the worst case expansion above will rarely be met, and as we
1978          * go along we would usually find that e moves further to the left.
1979          * This would happen only after we reached the point in the loop
1980          * where if there were no expansion we should fail.  Unclear if
1981          * worth the expense */
1982
1983         while (s <= e) {
1984             char *my_strend= (char *)strend;
1985             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1986                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1987                 && (reginfo->intuit || regtry(reginfo, &s)) )
1988             {
1989                 goto got_it;
1990             }
1991             s += (utf8_target) ? UTF8SKIP(s) : 1;
1992         }
1993         break;
1994     }
1995
1996     case BOUNDL:
1997         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1998         if (FLAGS(c) != TRADITIONAL_BOUND) {
1999             if (! IN_UTF8_CTYPE_LOCALE) {
2000                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2001                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2002             }
2003             goto do_boundu;
2004         }
2005
2006         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2007         break;
2008
2009     case NBOUNDL:
2010         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2011         if (FLAGS(c) != TRADITIONAL_BOUND) {
2012             if (! IN_UTF8_CTYPE_LOCALE) {
2013                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2014                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2015             }
2016             goto do_nboundu;
2017         }
2018
2019         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2020         break;
2021
2022     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2023                    meaning */
2024         assert(FLAGS(c) == TRADITIONAL_BOUND);
2025
2026         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2027         break;
2028
2029     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2030                    meaning */
2031         assert(FLAGS(c) == TRADITIONAL_BOUND);
2032
2033         FBC_BOUND_A(isWORDCHAR_A);
2034         break;
2035
2036     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2037                    meaning */
2038         assert(FLAGS(c) == TRADITIONAL_BOUND);
2039
2040         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2041         break;
2042
2043     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2044                    meaning */
2045         assert(FLAGS(c) == TRADITIONAL_BOUND);
2046
2047         FBC_NBOUND_A(isWORDCHAR_A);
2048         break;
2049
2050     case NBOUNDU:
2051         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2052             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2053             break;
2054         }
2055
2056       do_nboundu:
2057
2058         to_complement = 1;
2059         /* FALLTHROUGH */
2060
2061     case BOUNDU:
2062       do_boundu:
2063         switch((bound_type) FLAGS(c)) {
2064             case TRADITIONAL_BOUND:
2065                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2066                 break;
2067             case GCB_BOUND:
2068                 if (s == reginfo->strbeg) {
2069                     if (reginfo->intuit || regtry(reginfo, &s))
2070                     {
2071                         goto got_it;
2072                     }
2073
2074                     /* Didn't match.  Try at the next position (if there is one) */
2075                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2076                     if (UNLIKELY(s >= reginfo->strend)) {
2077                         break;
2078                     }
2079                 }
2080
2081                 if (utf8_target) {
2082                     GCB_enum before = getGCB_VAL_UTF8(
2083                                                reghop3((U8*)s, -1,
2084                                                        (U8*)(reginfo->strbeg)),
2085                                                (U8*) reginfo->strend);
2086                     while (s < strend) {
2087                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2088                                                         (U8*) reginfo->strend);
2089                         if (   (to_complement ^ isGCB(before, after))
2090                             && (reginfo->intuit || regtry(reginfo, &s)))
2091                         {
2092                             goto got_it;
2093                         }
2094                         before = after;
2095                         s += UTF8SKIP(s);
2096                     }
2097                 }
2098                 else {  /* Not utf8.  Everything is a GCB except between CR and
2099                            LF */
2100                     while (s < strend) {
2101                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2102                                               || UCHARAT(s) != '\n'))
2103                             && (reginfo->intuit || regtry(reginfo, &s)))
2104                         {
2105                             goto got_it;
2106                         }
2107                         s++;
2108                     }
2109                 }
2110
2111                 /* And, since this is a bound, it can match after the final
2112                  * character in the string */
2113                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2114                     goto got_it;
2115                 }
2116                 break;
2117
2118             case SB_BOUND:
2119                 if (s == reginfo->strbeg) {
2120                     if (reginfo->intuit || regtry(reginfo, &s)) {
2121                         goto got_it;
2122                     }
2123                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2124                     if (UNLIKELY(s >= reginfo->strend)) {
2125                         break;
2126                     }
2127                 }
2128
2129                 if (utf8_target) {
2130                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2131                                                         -1,
2132                                                         (U8*)(reginfo->strbeg)),
2133                                                       (U8*) reginfo->strend);
2134                     while (s < strend) {
2135                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2136                                                          (U8*) reginfo->strend);
2137                         if ((to_complement ^ isSB(before,
2138                                                   after,
2139                                                   (U8*) reginfo->strbeg,
2140                                                   (U8*) s,
2141                                                   (U8*) reginfo->strend,
2142                                                   utf8_target))
2143                             && (reginfo->intuit || regtry(reginfo, &s)))
2144                         {
2145                             goto got_it;
2146                         }
2147                         before = after;
2148                         s += UTF8SKIP(s);
2149                     }
2150                 }
2151                 else {  /* Not utf8. */
2152                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2153                     while (s < strend) {
2154                         SB_enum after = getSB_VAL_CP((U8) *s);
2155                         if ((to_complement ^ isSB(before,
2156                                                   after,
2157                                                   (U8*) reginfo->strbeg,
2158                                                   (U8*) s,
2159                                                   (U8*) reginfo->strend,
2160                                                   utf8_target))
2161                             && (reginfo->intuit || regtry(reginfo, &s)))
2162                         {
2163                             goto got_it;
2164                         }
2165                         before = after;
2166                         s++;
2167                     }
2168                 }
2169
2170                 /* Here are at the final position in the target string.  The SB
2171                  * value is always true here, so matches, depending on other
2172                  * constraints */
2173                 if (reginfo->intuit || regtry(reginfo, &s)) {
2174                     goto got_it;
2175                 }
2176
2177                 break;
2178
2179             case WB_BOUND:
2180                 if (s == reginfo->strbeg) {
2181                     if (reginfo->intuit || regtry(reginfo, &s)) {
2182                         goto got_it;
2183                     }
2184                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2185                     if (UNLIKELY(s >= reginfo->strend)) {
2186                         break;
2187                     }
2188                 }
2189
2190                 if (utf8_target) {
2191                     /* We are at a boundary between char_sub_0 and char_sub_1.
2192                      * We also keep track of the value for char_sub_-1 as we
2193                      * loop through the line.   Context may be needed to make a
2194                      * determination, and if so, this can save having to
2195                      * recalculate it */
2196                     WB_enum previous = WB_UNKNOWN;
2197                     WB_enum before = getWB_VAL_UTF8(
2198                                               reghop3((U8*)s,
2199                                                       -1,
2200                                                       (U8*)(reginfo->strbeg)),
2201                                               (U8*) reginfo->strend);
2202                     while (s < strend) {
2203                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2204                                                         (U8*) reginfo->strend);
2205                         if ((to_complement ^ isWB(previous,
2206                                                   before,
2207                                                   after,
2208                                                   (U8*) reginfo->strbeg,
2209                                                   (U8*) s,
2210                                                   (U8*) reginfo->strend,
2211                                                   utf8_target))
2212                             && (reginfo->intuit || regtry(reginfo, &s)))
2213                         {
2214                             goto got_it;
2215                         }
2216                         previous = before;
2217                         before = after;
2218                         s += UTF8SKIP(s);
2219                     }
2220                 }
2221                 else {  /* Not utf8. */
2222                     WB_enum previous = WB_UNKNOWN;
2223                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2224                     while (s < strend) {
2225                         WB_enum after = getWB_VAL_CP((U8) *s);
2226                         if ((to_complement ^ isWB(previous,
2227                                                   before,
2228                                                   after,
2229                                                   (U8*) reginfo->strbeg,
2230                                                   (U8*) s,
2231                                                   (U8*) reginfo->strend,
2232                                                   utf8_target))
2233                             && (reginfo->intuit || regtry(reginfo, &s)))
2234                         {
2235                             goto got_it;
2236                         }
2237                         previous = before;
2238                         before = after;
2239                         s++;
2240                     }
2241                 }
2242
2243                 if (reginfo->intuit || regtry(reginfo, &s)) {
2244                     goto got_it;
2245                 }
2246         }
2247         break;
2248
2249     case LNBREAK:
2250         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2251                         is_LNBREAK_latin1_safe(s, strend)
2252         );
2253         break;
2254
2255     /* The argument to all the POSIX node types is the class number to pass to
2256      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2257
2258     case NPOSIXL:
2259         to_complement = 1;
2260         /* FALLTHROUGH */
2261
2262     case POSIXL:
2263         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2264         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2265                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2266         break;
2267
2268     case NPOSIXD:
2269         to_complement = 1;
2270         /* FALLTHROUGH */
2271
2272     case POSIXD:
2273         if (utf8_target) {
2274             goto posix_utf8;
2275         }
2276         goto posixa;
2277
2278     case NPOSIXA:
2279         if (utf8_target) {
2280             /* The complement of something that matches only ASCII matches all
2281              * non-ASCII, plus everything in ASCII that isn't in the class. */
2282             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2283                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2284             break;
2285         }
2286
2287         to_complement = 1;
2288         /* FALLTHROUGH */
2289
2290     case POSIXA:
2291       posixa:
2292         /* Don't need to worry about utf8, as it can match only a single
2293          * byte invariant character. */
2294         REXEC_FBC_CLASS_SCAN(
2295                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2296         break;
2297
2298     case NPOSIXU:
2299         to_complement = 1;
2300         /* FALLTHROUGH */
2301
2302     case POSIXU:
2303         if (! utf8_target) {
2304             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2305                                                                     FLAGS(c))));
2306         }
2307         else {
2308
2309           posix_utf8:
2310             classnum = (_char_class_number) FLAGS(c);
2311             if (classnum < _FIRST_NON_SWASH_CC) {
2312                 while (s < strend) {
2313
2314                     /* We avoid loading in the swash as long as possible, but
2315                      * should we have to, we jump to a separate loop.  This
2316                      * extra 'if' statement is what keeps this code from being
2317                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2318                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2319                         goto found_above_latin1;
2320                     }
2321                     if ((UTF8_IS_INVARIANT(*s)
2322                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2323                                                                 classnum)))
2324                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2325                             && to_complement ^ cBOOL(
2326                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2327                                                                       *(s + 1)),
2328                                               classnum))))
2329                     {
2330                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2331                             goto got_it;
2332                         else {
2333                             tmp = doevery;
2334                         }
2335                     }
2336                     else {
2337                         tmp = 1;
2338                     }
2339                     s += UTF8SKIP(s);
2340                 }
2341             }
2342             else switch (classnum) {    /* These classes are implemented as
2343                                            macros */
2344                 case _CC_ENUM_SPACE:
2345                     REXEC_FBC_UTF8_CLASS_SCAN(
2346                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2347                     break;
2348
2349                 case _CC_ENUM_BLANK:
2350                     REXEC_FBC_UTF8_CLASS_SCAN(
2351                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2352                     break;
2353
2354                 case _CC_ENUM_XDIGIT:
2355                     REXEC_FBC_UTF8_CLASS_SCAN(
2356                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2357                     break;
2358
2359                 case _CC_ENUM_VERTSPACE:
2360                     REXEC_FBC_UTF8_CLASS_SCAN(
2361                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2362                     break;
2363
2364                 case _CC_ENUM_CNTRL:
2365                     REXEC_FBC_UTF8_CLASS_SCAN(
2366                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2367                     break;
2368
2369                 default:
2370                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2371                     NOT_REACHED; /* NOTREACHED */
2372             }
2373         }
2374         break;
2375
2376       found_above_latin1:   /* Here we have to load a swash to get the result
2377                                for the current code point */
2378         if (! PL_utf8_swash_ptrs[classnum]) {
2379             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2380             PL_utf8_swash_ptrs[classnum] =
2381                     _core_swash_init("utf8",
2382                                      "",
2383                                      &PL_sv_undef, 1, 0,
2384                                      PL_XPosix_ptrs[classnum], &flags);
2385         }
2386
2387         /* This is a copy of the loop above for swash classes, though using the
2388          * FBC macro instead of being expanded out.  Since we've loaded the
2389          * swash, we don't have to check for that each time through the loop */
2390         REXEC_FBC_UTF8_CLASS_SCAN(
2391                 to_complement ^ cBOOL(_generic_utf8(
2392                                       classnum,
2393                                       s,
2394                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2395                                                   (U8 *) s, TRUE))));
2396         break;
2397
2398     case AHOCORASICKC:
2399     case AHOCORASICK:
2400         {
2401             DECL_TRIE_TYPE(c);
2402             /* what trie are we using right now */
2403             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2404             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2405             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2406
2407             const char *last_start = strend - trie->minlen;
2408 #ifdef DEBUGGING
2409             const char *real_start = s;
2410 #endif
2411             STRLEN maxlen = trie->maxlen;
2412             SV *sv_points;
2413             U8 **points; /* map of where we were in the input string
2414                             when reading a given char. For ASCII this
2415                             is unnecessary overhead as the relationship
2416                             is always 1:1, but for Unicode, especially
2417                             case folded Unicode this is not true. */
2418             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2419             U8 *bitmap=NULL;
2420
2421
2422             GET_RE_DEBUG_FLAGS_DECL;
2423
2424             /* We can't just allocate points here. We need to wrap it in
2425              * an SV so it gets freed properly if there is a croak while
2426              * running the match */
2427             ENTER;
2428             SAVETMPS;
2429             sv_points=newSV(maxlen * sizeof(U8 *));
2430             SvCUR_set(sv_points,
2431                 maxlen * sizeof(U8 *));
2432             SvPOK_on(sv_points);
2433             sv_2mortal(sv_points);
2434             points=(U8**)SvPV_nolen(sv_points );
2435             if ( trie_type != trie_utf8_fold
2436                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2437             {
2438                 if (trie->bitmap)
2439                     bitmap=(U8*)trie->bitmap;
2440                 else
2441                     bitmap=(U8*)ANYOF_BITMAP(c);
2442             }
2443             /* this is the Aho-Corasick algorithm modified a touch
2444                to include special handling for long "unknown char" sequences.
2445                The basic idea being that we use AC as long as we are dealing
2446                with a possible matching char, when we encounter an unknown char
2447                (and we have not encountered an accepting state) we scan forward
2448                until we find a legal starting char.
2449                AC matching is basically that of trie matching, except that when
2450                we encounter a failing transition, we fall back to the current
2451                states "fail state", and try the current char again, a process
2452                we repeat until we reach the root state, state 1, or a legal
2453                transition. If we fail on the root state then we can either
2454                terminate if we have reached an accepting state previously, or
2455                restart the entire process from the beginning if we have not.
2456
2457              */
2458             while (s <= last_start) {
2459                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2460                 U8 *uc = (U8*)s;
2461                 U16 charid = 0;
2462                 U32 base = 1;
2463                 U32 state = 1;
2464                 UV uvc = 0;
2465                 STRLEN len = 0;
2466                 STRLEN foldlen = 0;
2467                 U8 *uscan = (U8*)NULL;
2468                 U8 *leftmost = NULL;
2469 #ifdef DEBUGGING
2470                 U32 accepted_word= 0;
2471 #endif
2472                 U32 pointpos = 0;
2473
2474                 while ( state && uc <= (U8*)strend ) {
2475                     int failed=0;
2476                     U32 word = aho->states[ state ].wordnum;
2477
2478                     if( state==1 ) {
2479                         if ( bitmap ) {
2480                             DEBUG_TRIE_EXECUTE_r(
2481                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2482                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2483                                         (char *)uc, utf8_target );
2484                                     PerlIO_printf( Perl_debug_log,
2485                                         " Scanning for legal start char...\n");
2486                                 }
2487                             );
2488                             if (utf8_target) {
2489                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2490                                     uc += UTF8SKIP(uc);
2491                                 }
2492                             } else {
2493                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2494                                     uc++;
2495                                 }
2496                             }
2497                             s= (char *)uc;
2498                         }
2499                         if (uc >(U8*)last_start) break;
2500                     }
2501
2502                     if ( word ) {
2503                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2504                         if (!leftmost || lpos < leftmost) {
2505                             DEBUG_r(accepted_word=word);
2506                             leftmost= lpos;
2507                         }
2508                         if (base==0) break;
2509
2510                     }
2511                     points[pointpos++ % maxlen]= uc;
2512                     if (foldlen || uc < (U8*)strend) {
2513                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2514                                          widecharmap, uc,
2515                                          uscan, len, uvc, charid, foldlen,
2516                                          foldbuf, uniflags);
2517                         DEBUG_TRIE_EXECUTE_r({
2518                             dump_exec_pos( (char *)uc, c, strend,
2519                                         real_start, s, utf8_target);
2520                             PerlIO_printf(Perl_debug_log,
2521                                 " Charid:%3u CP:%4"UVxf" ",
2522                                  charid, uvc);
2523                         });
2524                     }
2525                     else {
2526                         len = 0;
2527                         charid = 0;
2528                     }
2529
2530
2531                     do {
2532 #ifdef DEBUGGING
2533                         word = aho->states[ state ].wordnum;
2534 #endif
2535                         base = aho->states[ state ].trans.base;
2536
2537                         DEBUG_TRIE_EXECUTE_r({
2538                             if (failed)
2539                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2540                                     s,   utf8_target );
2541                             PerlIO_printf( Perl_debug_log,
2542                                 "%sState: %4"UVxf", word=%"UVxf,
2543                                 failed ? " Fail transition to " : "",
2544                                 (UV)state, (UV)word);
2545                         });
2546                         if ( base ) {
2547                             U32 tmp;
2548                             I32 offset;
2549                             if (charid &&
2550                                  ( ((offset = base + charid
2551                                     - 1 - trie->uniquecharcount)) >= 0)
2552                                  && ((U32)offset < trie->lasttrans)
2553                                  && trie->trans[offset].check == state
2554                                  && (tmp=trie->trans[offset].next))
2555                             {
2556                                 DEBUG_TRIE_EXECUTE_r(
2557                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2558                                 state = tmp;
2559                                 break;
2560                             }
2561                             else {
2562                                 DEBUG_TRIE_EXECUTE_r(
2563                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2564                                 failed = 1;
2565                                 state = aho->fail[state];
2566                             }
2567                         }
2568                         else {
2569                             /* we must be accepting here */
2570                             DEBUG_TRIE_EXECUTE_r(
2571                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2572                             failed = 1;
2573                             break;
2574                         }
2575                     } while(state);
2576                     uc += len;
2577                     if (failed) {
2578                         if (leftmost)
2579                             break;
2580                         if (!state) state = 1;
2581                     }
2582                 }
2583                 if ( aho->states[ state ].wordnum ) {
2584                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2585                     if (!leftmost || lpos < leftmost) {
2586                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2587                         leftmost = lpos;
2588                     }
2589                 }
2590                 if (leftmost) {
2591                     s = (char*)leftmost;
2592                     DEBUG_TRIE_EXECUTE_r({
2593                         PerlIO_printf(
2594                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2595                             (UV)accepted_word, (IV)(s - real_start)
2596                         );
2597                     });
2598                     if (reginfo->intuit || regtry(reginfo, &s)) {
2599                         FREETMPS;
2600                         LEAVE;
2601                         goto got_it;
2602                     }
2603                     s = HOPc(s,1);
2604                     DEBUG_TRIE_EXECUTE_r({
2605                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2606                     });
2607                 } else {
2608                     DEBUG_TRIE_EXECUTE_r(
2609                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2610                     break;
2611                 }
2612             }
2613             FREETMPS;
2614             LEAVE;
2615         }
2616         break;
2617     default:
2618         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2619     }
2620     return 0;
2621   got_it:
2622     return s;
2623 }
2624
2625 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2626  * flags have same meanings as with regexec_flags() */
2627
2628 static void
2629 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2630                             char *strbeg,
2631                             char *strend,
2632                             SV *sv,
2633                             U32 flags,
2634                             bool utf8_target)
2635 {
2636     struct regexp *const prog = ReANY(rx);
2637
2638     if (flags & REXEC_COPY_STR) {
2639 #ifdef PERL_ANY_COW
2640         if (SvCANCOW(sv)) {
2641             if (DEBUG_C_TEST) {
2642                 PerlIO_printf(Perl_debug_log,
2643                               "Copy on write: regexp capture, type %d\n",
2644                               (int) SvTYPE(sv));
2645             }
2646             /* Create a new COW SV to share the match string and store
2647              * in saved_copy, unless the current COW SV in saved_copy
2648              * is valid and suitable for our purpose */
2649             if ((   prog->saved_copy
2650                  && SvIsCOW(prog->saved_copy)
2651                  && SvPOKp(prog->saved_copy)
2652                  && SvIsCOW(sv)
2653                  && SvPOKp(sv)
2654                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2655             {
2656                 /* just reuse saved_copy SV */
2657                 if (RXp_MATCH_COPIED(prog)) {
2658                     Safefree(prog->subbeg);
2659                     RXp_MATCH_COPIED_off(prog);
2660                 }
2661             }
2662             else {
2663                 /* create new COW SV to share string */
2664                 RX_MATCH_COPY_FREE(rx);
2665                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2666             }
2667             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2668             assert (SvPOKp(prog->saved_copy));
2669             prog->sublen  = strend - strbeg;
2670             prog->suboffset = 0;
2671             prog->subcoffset = 0;
2672         } else
2673 #endif
2674         {
2675             SSize_t min = 0;
2676             SSize_t max = strend - strbeg;
2677             SSize_t sublen;
2678
2679             if (    (flags & REXEC_COPY_SKIP_POST)
2680                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2681                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2682             ) { /* don't copy $' part of string */
2683                 U32 n = 0;
2684                 max = -1;
2685                 /* calculate the right-most part of the string covered
2686                  * by a capture. Due to look-ahead, this may be to
2687                  * the right of $&, so we have to scan all captures */
2688                 while (n <= prog->lastparen) {
2689                     if (prog->offs[n].end > max)
2690                         max = prog->offs[n].end;
2691                     n++;
2692                 }
2693                 if (max == -1)
2694                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2695                             ? prog->offs[0].start
2696                             : 0;
2697                 assert(max >= 0 && max <= strend - strbeg);
2698             }
2699
2700             if (    (flags & REXEC_COPY_SKIP_PRE)
2701                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2702                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2703             ) { /* don't copy $` part of string */
2704                 U32 n = 0;
2705                 min = max;
2706                 /* calculate the left-most part of the string covered
2707                  * by a capture. Due to look-behind, this may be to
2708                  * the left of $&, so we have to scan all captures */
2709                 while (min && n <= prog->lastparen) {
2710                     if (   prog->offs[n].start != -1
2711                         && prog->offs[n].start < min)
2712                     {
2713                         min = prog->offs[n].start;
2714                     }
2715                     n++;
2716                 }
2717                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2718                     && min >  prog->offs[0].end
2719                 )
2720                     min = prog->offs[0].end;
2721
2722             }
2723
2724             assert(min >= 0 && min <= max && min <= strend - strbeg);
2725             sublen = max - min;
2726
2727             if (RX_MATCH_COPIED(rx)) {
2728                 if (sublen > prog->sublen)
2729                     prog->subbeg =
2730                             (char*)saferealloc(prog->subbeg, sublen+1);
2731             }
2732             else
2733                 prog->subbeg = (char*)safemalloc(sublen+1);
2734             Copy(strbeg + min, prog->subbeg, sublen, char);
2735             prog->subbeg[sublen] = '\0';
2736             prog->suboffset = min;
2737             prog->sublen = sublen;
2738             RX_MATCH_COPIED_on(rx);
2739         }
2740         prog->subcoffset = prog->suboffset;
2741         if (prog->suboffset && utf8_target) {
2742             /* Convert byte offset to chars.
2743              * XXX ideally should only compute this if @-/@+
2744              * has been seen, a la PL_sawampersand ??? */
2745
2746             /* If there's a direct correspondence between the
2747              * string which we're matching and the original SV,
2748              * then we can use the utf8 len cache associated with
2749              * the SV. In particular, it means that under //g,
2750              * sv_pos_b2u() will use the previously cached
2751              * position to speed up working out the new length of
2752              * subcoffset, rather than counting from the start of
2753              * the string each time. This stops
2754              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2755              * from going quadratic */
2756             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2757                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2758                                                 SV_GMAGIC|SV_CONST_RETURN);
2759             else
2760                 prog->subcoffset = utf8_length((U8*)strbeg,
2761                                     (U8*)(strbeg+prog->suboffset));
2762         }
2763     }
2764     else {
2765         RX_MATCH_COPY_FREE(rx);
2766         prog->subbeg = strbeg;
2767         prog->suboffset = 0;
2768         prog->subcoffset = 0;
2769         prog->sublen = strend - strbeg;
2770     }
2771 }
2772
2773
2774
2775
2776 /*
2777  - regexec_flags - match a regexp against a string
2778  */
2779 I32
2780 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2781               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2782 /* stringarg: the point in the string at which to begin matching */
2783 /* strend:    pointer to null at end of string */
2784 /* strbeg:    real beginning of string */
2785 /* minend:    end of match must be >= minend bytes after stringarg. */
2786 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2787  *            itself is accessed via the pointers above */
2788 /* data:      May be used for some additional optimizations.
2789               Currently unused. */
2790 /* flags:     For optimizations. See REXEC_* in regexp.h */
2791
2792 {
2793     struct regexp *const prog = ReANY(rx);
2794     char *s;
2795     regnode *c;
2796     char *startpos;
2797     SSize_t minlen;             /* must match at least this many chars */
2798     SSize_t dontbother = 0;     /* how many characters not to try at end */
2799     const bool utf8_target = cBOOL(DO_UTF8(sv));
2800     I32 multiline;
2801     RXi_GET_DECL(prog,progi);
2802     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2803     regmatch_info *const reginfo = &reginfo_buf;
2804     regexp_paren_pair *swap = NULL;
2805     I32 oldsave;
2806     GET_RE_DEBUG_FLAGS_DECL;
2807
2808     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2809     PERL_UNUSED_ARG(data);
2810
2811     /* Be paranoid... */
2812     if (prog == NULL) {
2813         Perl_croak(aTHX_ "NULL regexp parameter");
2814     }
2815
2816     DEBUG_EXECUTE_r(
2817         debug_start_match(rx, utf8_target, stringarg, strend,
2818         "Matching");
2819     );
2820
2821     startpos = stringarg;
2822
2823     if (prog->intflags & PREGf_GPOS_SEEN) {
2824         MAGIC *mg;
2825
2826         /* set reginfo->ganch, the position where \G can match */
2827
2828         reginfo->ganch =
2829             (flags & REXEC_IGNOREPOS)
2830             ? stringarg /* use start pos rather than pos() */
2831             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2832               /* Defined pos(): */
2833             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2834             : strbeg; /* pos() not defined; use start of string */
2835
2836         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2837             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2838
2839         /* in the presence of \G, we may need to start looking earlier in
2840          * the string than the suggested start point of stringarg:
2841          * if prog->gofs is set, then that's a known, fixed minimum
2842          * offset, such as
2843          * /..\G/:   gofs = 2
2844          * /ab|c\G/: gofs = 1
2845          * or if the minimum offset isn't known, then we have to go back
2846          * to the start of the string, e.g. /w+\G/
2847          */
2848
2849         if (prog->intflags & PREGf_ANCH_GPOS) {
2850             startpos  = reginfo->ganch - prog->gofs;
2851             if (startpos <
2852                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2853             {
2854                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2855                         "fail: ganch-gofs before earliest possible start\n"));
2856                 return 0;
2857             }
2858         }
2859         else if (prog->gofs) {
2860             if (startpos - prog->gofs < strbeg)
2861                 startpos = strbeg;
2862             else
2863                 startpos -= prog->gofs;
2864         }
2865         else if (prog->intflags & PREGf_GPOS_FLOAT)
2866             startpos = strbeg;
2867     }
2868
2869     minlen = prog->minlen;
2870     if ((startpos + minlen) > strend || startpos < strbeg) {
2871         DEBUG_r(PerlIO_printf(Perl_debug_log,
2872                     "Regex match can't succeed, so not even tried\n"));
2873         return 0;
2874     }
2875
2876     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2877      * which will call destuctors to reset PL_regmatch_state, free higher
2878      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2879      * regmatch_info_aux_eval */
2880
2881     oldsave = PL_savestack_ix;
2882
2883     s = startpos;
2884
2885     if ((prog->extflags & RXf_USE_INTUIT)
2886         && !(flags & REXEC_CHECKED))
2887     {
2888         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2889                                     flags, NULL);
2890         if (!s)
2891             return 0;
2892
2893         if (prog->extflags & RXf_CHECK_ALL) {
2894             /* we can match based purely on the result of INTUIT.
2895              * Set up captures etc just for $& and $-[0]
2896              * (an intuit-only match wont have $1,$2,..) */
2897             assert(!prog->nparens);
2898
2899             /* s/// doesn't like it if $& is earlier than where we asked it to
2900              * start searching (which can happen on something like /.\G/) */
2901             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2902                     && (s < stringarg))
2903             {
2904                 /* this should only be possible under \G */
2905                 assert(prog->intflags & PREGf_GPOS_SEEN);
2906                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2907                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2908                 goto phooey;
2909             }
2910
2911             /* match via INTUIT shouldn't have any captures.
2912              * Let @-, @+, $^N know */
2913             prog->lastparen = prog->lastcloseparen = 0;
2914             RX_MATCH_UTF8_set(rx, utf8_target);
2915             prog->offs[0].start = s - strbeg;
2916             prog->offs[0].end = utf8_target
2917                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2918                 : s - strbeg + prog->minlenret;
2919             if ( !(flags & REXEC_NOT_FIRST) )
2920                 S_reg_set_capture_string(aTHX_ rx,
2921                                         strbeg, strend,
2922                                         sv, flags, utf8_target);
2923
2924             return 1;
2925         }
2926     }
2927
2928     multiline = prog->extflags & RXf_PMf_MULTILINE;
2929
2930     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2931         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2932                               "String too short [regexec_flags]...\n"));
2933         goto phooey;
2934     }
2935
2936     /* Check validity of program. */
2937     if (UCHARAT(progi->program) != REG_MAGIC) {
2938         Perl_croak(aTHX_ "corrupted regexp program");
2939     }
2940
2941     RX_MATCH_TAINTED_off(rx);
2942     RX_MATCH_UTF8_set(rx, utf8_target);
2943
2944     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2945     reginfo->intuit = 0;
2946     reginfo->is_utf8_target = cBOOL(utf8_target);
2947     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2948     reginfo->warned = FALSE;
2949     reginfo->strbeg  = strbeg;
2950     reginfo->sv = sv;
2951     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2952     reginfo->strend = strend;
2953     /* see how far we have to get to not match where we matched before */
2954     reginfo->till = stringarg + minend;
2955
2956     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2957         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2958            S_cleanup_regmatch_info_aux has executed (registered by
2959            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2960            magic belonging to this SV.
2961            Not newSVsv, either, as it does not COW.
2962         */
2963         reginfo->sv = newSV(0);
2964         SvSetSV_nosteal(reginfo->sv, sv);
2965         SAVEFREESV(reginfo->sv);
2966     }
2967
2968     /* reserve next 2 or 3 slots in PL_regmatch_state:
2969      * slot N+0: may currently be in use: skip it
2970      * slot N+1: use for regmatch_info_aux struct
2971      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2972      * slot N+3: ready for use by regmatch()
2973      */
2974
2975     {
2976         regmatch_state *old_regmatch_state;
2977         regmatch_slab  *old_regmatch_slab;
2978         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2979
2980         /* on first ever match, allocate first slab */
2981         if (!PL_regmatch_slab) {
2982             Newx(PL_regmatch_slab, 1, regmatch_slab);
2983             PL_regmatch_slab->prev = NULL;
2984             PL_regmatch_slab->next = NULL;
2985             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2986         }
2987
2988         old_regmatch_state = PL_regmatch_state;
2989         old_regmatch_slab  = PL_regmatch_slab;
2990
2991         for (i=0; i <= max; i++) {
2992             if (i == 1)
2993                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2994             else if (i ==2)
2995                 reginfo->info_aux_eval =
2996                 reginfo->info_aux->info_aux_eval =
2997                             &(PL_regmatch_state->u.info_aux_eval);
2998
2999             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3000                 PL_regmatch_state = S_push_slab(aTHX);
3001         }
3002
3003         /* note initial PL_regmatch_state position; at end of match we'll
3004          * pop back to there and free any higher slabs */
3005
3006         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3007         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3008         reginfo->info_aux->poscache = NULL;
3009
3010         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3011
3012         if ((prog->extflags & RXf_EVAL_SEEN))
3013             S_setup_eval_state(aTHX_ reginfo);
3014         else
3015             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3016     }
3017
3018     /* If there is a "must appear" string, look for it. */
3019
3020     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3021         /* We have to be careful. If the previous successful match
3022            was from this regex we don't want a subsequent partially
3023            successful match to clobber the old results.
3024            So when we detect this possibility we add a swap buffer
3025            to the re, and switch the buffer each match. If we fail,
3026            we switch it back; otherwise we leave it swapped.
3027         */
3028         swap = prog->offs;
3029         /* do we need a save destructor here for eval dies? */
3030         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3031         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3032             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3033             PTR2UV(prog),
3034             PTR2UV(swap),
3035             PTR2UV(prog->offs)
3036         ));
3037     }
3038
3039     /* Simplest case: anchored match need be tried only once, or with
3040      * MBOL, only at the beginning of each line.
3041      *
3042      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3043      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3044      * match at the start of the string then it won't match anywhere else
3045      * either; while with /.*.../, if it doesn't match at the beginning,
3046      * the earliest it could match is at the start of the next line */
3047
3048     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3049         char *end;
3050
3051         if (regtry(reginfo, &s))
3052             goto got_it;
3053
3054         if (!(prog->intflags & PREGf_ANCH_MBOL))
3055             goto phooey;
3056
3057         /* didn't match at start, try at other newline positions */
3058
3059         if (minlen)
3060             dontbother = minlen - 1;
3061         end = HOP3c(strend, -dontbother, strbeg) - 1;
3062
3063         /* skip to next newline */
3064
3065         while (s <= end) { /* note it could be possible to match at the end of the string */
3066             /* NB: newlines are the same in unicode as they are in latin */
3067             if (*s++ != '\n')
3068                 continue;
3069             if (prog->check_substr || prog->check_utf8) {
3070             /* note that with PREGf_IMPLICIT, intuit can only fail
3071              * or return the start position, so it's of limited utility.
3072              * Nevertheless, I made the decision that the potential for
3073              * quick fail was still worth it - DAPM */
3074                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3075                 if (!s)
3076                     goto phooey;
3077             }
3078             if (regtry(reginfo, &s))
3079                 goto got_it;
3080         }
3081         goto phooey;
3082     } /* end anchored search */
3083
3084     if (prog->intflags & PREGf_ANCH_GPOS)
3085     {
3086         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3087         assert(prog->intflags & PREGf_GPOS_SEEN);
3088         /* For anchored \G, the only position it can match from is
3089          * (ganch-gofs); we already set startpos to this above; if intuit
3090          * moved us on from there, we can't possibly succeed */
3091         assert(startpos == reginfo->ganch - prog->gofs);
3092         if (s == startpos && regtry(reginfo, &s))
3093             goto got_it;
3094         goto phooey;
3095     }
3096
3097     /* Messy cases:  unanchored match. */
3098     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3099         /* we have /x+whatever/ */
3100         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3101         char ch;
3102 #ifdef DEBUGGING
3103         int did_match = 0;
3104 #endif
3105         if (utf8_target) {
3106             if (! prog->anchored_utf8) {
3107                 to_utf8_substr(prog);
3108             }
3109             ch = SvPVX_const(prog->anchored_utf8)[0];
3110             REXEC_FBC_SCAN(
3111                 if (*s == ch) {
3112                     DEBUG_EXECUTE_r( did_match = 1 );
3113                     if (regtry(reginfo, &s)) goto got_it;
3114                     s += UTF8SKIP(s);
3115                     while (s < strend && *s == ch)
3116                         s += UTF8SKIP(s);
3117                 }
3118             );
3119
3120         }
3121         else {
3122             if (! prog->anchored_substr) {
3123                 if (! to_byte_substr(prog)) {
3124                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3125                 }
3126             }
3127             ch = SvPVX_const(prog->anchored_substr)[0];
3128             REXEC_FBC_SCAN(
3129                 if (*s == ch) {
3130                     DEBUG_EXECUTE_r( did_match = 1 );
3131                     if (regtry(reginfo, &s)) goto got_it;
3132                     s++;
3133                     while (s < strend && *s == ch)
3134                         s++;
3135                 }
3136             );
3137         }
3138         DEBUG_EXECUTE_r(if (!did_match)
3139                 PerlIO_printf(Perl_debug_log,
3140                                   "Did not find anchored character...\n")
3141                );
3142     }
3143     else if (prog->anchored_substr != NULL
3144               || prog->anchored_utf8 != NULL
3145               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3146                   && prog->float_max_offset < strend - s)) {
3147         SV *must;
3148         SSize_t back_max;
3149         SSize_t back_min;
3150         char *last;
3151         char *last1;            /* Last position checked before */
3152 #ifdef DEBUGGING
3153         int did_match = 0;
3154 #endif
3155         if (prog->anchored_substr || prog->anchored_utf8) {
3156             if (utf8_target) {
3157                 if (! prog->anchored_utf8) {
3158                     to_utf8_substr(prog);
3159                 }
3160                 must = prog->anchored_utf8;
3161             }
3162             else {
3163                 if (! prog->anchored_substr) {
3164                     if (! to_byte_substr(prog)) {
3165                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3166                     }
3167                 }
3168                 must = prog->anchored_substr;
3169             }
3170             back_max = back_min = prog->anchored_offset;
3171         } else {
3172             if (utf8_target) {
3173                 if (! prog->float_utf8) {
3174                     to_utf8_substr(prog);
3175                 }
3176                 must = prog->float_utf8;
3177             }
3178             else {
3179                 if (! prog->float_substr) {
3180                     if (! to_byte_substr(prog)) {
3181                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3182                     }
3183                 }
3184                 must = prog->float_substr;
3185             }
3186             back_max = prog->float_max_offset;
3187             back_min = prog->float_min_offset;
3188         }
3189
3190         if (back_min<0) {
3191             last = strend;
3192         } else {
3193             last = HOP3c(strend,        /* Cannot start after this */
3194                   -(SSize_t)(CHR_SVLEN(must)
3195                          - (SvTAIL(must) != 0) + back_min), strbeg);
3196         }
3197         if (s > reginfo->strbeg)
3198             last1 = HOPc(s, -1);
3199         else
3200             last1 = s - 1;      /* bogus */
3201
3202         /* XXXX check_substr already used to find "s", can optimize if
3203            check_substr==must. */
3204         dontbother = 0;
3205         strend = HOPc(strend, -dontbother);
3206         while ( (s <= last) &&
3207                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3208                                   (unsigned char*)strend, must,
3209                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3210             DEBUG_EXECUTE_r( did_match = 1 );
3211             if (HOPc(s, -back_max) > last1) {
3212                 last1 = HOPc(s, -back_min);
3213                 s = HOPc(s, -back_max);
3214             }
3215             else {
3216                 char * const t = (last1 >= reginfo->strbeg)
3217                                     ? HOPc(last1, 1) : last1 + 1;
3218
3219                 last1 = HOPc(s, -back_min);
3220                 s = t;
3221             }
3222             if (utf8_target) {
3223                 while (s <= last1) {
3224                     if (regtry(reginfo, &s))
3225                         goto got_it;
3226                     if (s >= last1) {
3227                         s++; /* to break out of outer loop */
3228                         break;
3229                     }
3230                     s += UTF8SKIP(s);
3231                 }
3232             }
3233             else {
3234                 while (s <= last1) {
3235                     if (regtry(reginfo, &s))
3236                         goto got_it;
3237                     s++;
3238                 }
3239             }
3240         }
3241         DEBUG_EXECUTE_r(if (!did_match) {
3242             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3243                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3244             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
3245                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3246                                ? "anchored" : "floating"),
3247                 quoted, RE_SV_TAIL(must));
3248         });
3249         goto phooey;
3250     }
3251     else if ( (c = progi->regstclass) ) {
3252         if (minlen) {
3253             const OPCODE op = OP(progi->regstclass);
3254             /* don't bother with what can't match */
3255             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3256                 strend = HOPc(strend, -(minlen - 1));
3257         }
3258         DEBUG_EXECUTE_r({
3259             SV * const prop = sv_newmortal();
3260             regprop(prog, prop, c, reginfo, NULL);
3261             {
3262                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3263                     s,strend-s,60);
3264                 PerlIO_printf(Perl_debug_log,
3265                     "Matching stclass %.*s against %s (%d bytes)\n",
3266                     (int)SvCUR(prop), SvPVX_const(prop),
3267                      quoted, (int)(strend - s));
3268             }
3269         });
3270         if (find_byclass(prog, c, s, strend, reginfo))
3271             goto got_it;
3272         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
3273     }
3274     else {
3275         dontbother = 0;
3276         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3277             /* Trim the end. */
3278             char *last= NULL;
3279             SV* float_real;
3280             STRLEN len;
3281             const char *little;
3282
3283             if (utf8_target) {
3284                 if (! prog->float_utf8) {
3285                     to_utf8_substr(prog);
3286                 }
3287                 float_real = prog->float_utf8;
3288             }
3289             else {
3290                 if (! prog->float_substr) {
3291                     if (! to_byte_substr(prog)) {
3292                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3293                     }
3294                 }
3295                 float_real = prog->float_substr;
3296             }
3297
3298             little = SvPV_const(float_real, len);
3299             if (SvTAIL(float_real)) {
3300                     /* This means that float_real contains an artificial \n on
3301                      * the end due to the presence of something like this:
3302                      * /foo$/ where we can match both "foo" and "foo\n" at the
3303                      * end of the string.  So we have to compare the end of the
3304                      * string first against the float_real without the \n and
3305                      * then against the full float_real with the string.  We
3306                      * have to watch out for cases where the string might be
3307                      * smaller than the float_real or the float_real without
3308                      * the \n. */
3309                     char *checkpos= strend - len;
3310                     DEBUG_OPTIMISE_r(
3311                         PerlIO_printf(Perl_debug_log,
3312                             "%sChecking for float_real.%s\n",
3313                             PL_colors[4], PL_colors[5]));
3314                     if (checkpos + 1 < strbeg) {
3315                         /* can't match, even if we remove the trailing \n
3316                          * string is too short to match */
3317                         DEBUG_EXECUTE_r(
3318                             PerlIO_printf(Perl_debug_log,
3319                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3320                                 PL_colors[4], PL_colors[5]));
3321                         goto phooey;
3322                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3323                         /* can match, the end of the string matches without the
3324                          * "\n" */
3325                         last = checkpos + 1;
3326                     } else if (checkpos < strbeg) {
3327                         /* cant match, string is too short when the "\n" is
3328                          * included */
3329                         DEBUG_EXECUTE_r(
3330                             PerlIO_printf(Perl_debug_log,
3331                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3332                                 PL_colors[4], PL_colors[5]));
3333                         goto phooey;
3334                     } else if (!multiline) {
3335                         /* non multiline match, so compare with the "\n" at the
3336                          * end of the string */
3337                         if (memEQ(checkpos, little, len)) {
3338                             last= checkpos;
3339                         } else {
3340                             DEBUG_EXECUTE_r(
3341                                 PerlIO_printf(Perl_debug_log,
3342                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3343                                     PL_colors[4], PL_colors[5]));
3344                             goto phooey;
3345                         }
3346                     } else {
3347                         /* multiline match, so we have to search for a place
3348                          * where the full string is located */
3349                         goto find_last;
3350                     }
3351             } else {
3352                   find_last:
3353                     if (len)
3354                         last = rninstr(s, strend, little, little + len);
3355                     else
3356                         last = strend;  /* matching "$" */
3357             }
3358             if (!last) {
3359                 /* at one point this block contained a comment which was
3360                  * probably incorrect, which said that this was a "should not
3361                  * happen" case.  Even if it was true when it was written I am
3362                  * pretty sure it is not anymore, so I have removed the comment
3363                  * and replaced it with this one. Yves */
3364                 DEBUG_EXECUTE_r(
3365                     PerlIO_printf(Perl_debug_log,
3366                         "%sString does not contain required substring, cannot match.%s\n",
3367                         PL_colors[4], PL_colors[5]
3368                     ));
3369                 goto phooey;
3370             }
3371             dontbother = strend - last + prog->float_min_offset;
3372         }
3373         if (minlen && (dontbother < minlen))
3374             dontbother = minlen - 1;
3375         strend -= dontbother;              /* this one's always in bytes! */
3376         /* We don't know much -- general case. */
3377         if (utf8_target) {
3378             for (;;) {
3379                 if (regtry(reginfo, &s))
3380                     goto got_it;
3381                 if (s >= strend)
3382                     break;
3383                 s += UTF8SKIP(s);
3384             };
3385         }
3386         else {
3387             do {
3388                 if (regtry(reginfo, &s))
3389                     goto got_it;
3390             } while (s++ < strend);
3391         }
3392     }
3393
3394     /* Failure. */
3395     goto phooey;
3396
3397   got_it:
3398     /* s/// doesn't like it if $& is earlier than where we asked it to
3399      * start searching (which can happen on something like /.\G/) */
3400     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3401             && (prog->offs[0].start < stringarg - strbeg))
3402     {
3403         /* this should only be possible under \G */
3404         assert(prog->intflags & PREGf_GPOS_SEEN);
3405         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3406             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3407         goto phooey;
3408     }
3409
3410     DEBUG_BUFFERS_r(
3411         if (swap)
3412             PerlIO_printf(Perl_debug_log,
3413                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3414                 PTR2UV(prog),
3415                 PTR2UV(swap)
3416             );
3417     );
3418     Safefree(swap);
3419
3420     /* clean up; this will trigger destructors that will free all slabs
3421      * above the current one, and cleanup the regmatch_info_aux
3422      * and regmatch_info_aux_eval sructs */
3423
3424     LEAVE_SCOPE(oldsave);
3425
3426     if (RXp_PAREN_NAMES(prog))
3427         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3428
3429     /* make sure $`, $&, $', and $digit will work later */
3430     if ( !(flags & REXEC_NOT_FIRST) )
3431         S_reg_set_capture_string(aTHX_ rx,
3432                                     strbeg, reginfo->strend,
3433                                     sv, flags, utf8_target);
3434
3435     return 1;
3436
3437   phooey:
3438     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3439                           PL_colors[4], PL_colors[5]));
3440
3441     /* clean up; this will trigger destructors that will free all slabs
3442      * above the current one, and cleanup the regmatch_info_aux
3443      * and regmatch_info_aux_eval sructs */
3444
3445     LEAVE_SCOPE(oldsave);
3446
3447     if (swap) {
3448         /* we failed :-( roll it back */
3449         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3450             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3451             PTR2UV(prog),
3452             PTR2UV(prog->offs),
3453             PTR2UV(swap)
3454         ));
3455         Safefree(prog->offs);
3456         prog->offs = swap;
3457     }
3458     return 0;
3459 }
3460
3461
3462 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3463  * Do inc before dec, in case old and new rex are the same */
3464 #define SET_reg_curpm(Re2)                          \
3465     if (reginfo->info_aux_eval) {                   \
3466         (void)ReREFCNT_inc(Re2);                    \
3467         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3468         PM_SETRE((PL_reg_curpm), (Re2));            \
3469     }
3470
3471
3472 /*
3473  - regtry - try match at specific point
3474  */
3475 STATIC bool                     /* 0 failure, 1 success */
3476 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3477 {
3478     CHECKPOINT lastcp;
3479     REGEXP *const rx = reginfo->prog;
3480     regexp *const prog = ReANY(rx);
3481     SSize_t result;
3482     RXi_GET_DECL(prog,progi);
3483     GET_RE_DEBUG_FLAGS_DECL;
3484
3485     PERL_ARGS_ASSERT_REGTRY;
3486
3487     reginfo->cutpoint=NULL;
3488
3489     prog->offs[0].start = *startposp - reginfo->strbeg;
3490     prog->lastparen = 0;
3491     prog->lastcloseparen = 0;
3492
3493     /* XXXX What this code is doing here?!!!  There should be no need
3494        to do this again and again, prog->lastparen should take care of
3495        this!  --ilya*/
3496
3497     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3498      * Actually, the code in regcppop() (which Ilya may be meaning by
3499      * prog->lastparen), is not needed at all by the test suite
3500      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3501      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3502      * Meanwhile, this code *is* needed for the
3503      * above-mentioned test suite tests to succeed.  The common theme
3504      * on those tests seems to be returning null fields from matches.
3505      * --jhi updated by dapm */
3506 #if 1
3507     if (prog->nparens) {
3508         regexp_paren_pair *pp = prog->offs;
3509         I32 i;
3510         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3511             ++pp;
3512             pp->start = -1;
3513             pp->end = -1;
3514         }
3515     }
3516 #endif
3517     REGCP_SET(lastcp);
3518     result = regmatch(reginfo, *startposp, progi->program + 1);
3519     if (result != -1) {
3520         prog->offs[0].end = result;
3521         return 1;
3522     }
3523     if (reginfo->cutpoint)
3524         *startposp= reginfo->cutpoint;
3525     REGCP_UNWIND(lastcp);
3526     return 0;
3527 }
3528
3529
3530 #define sayYES goto yes
3531 #define sayNO goto no
3532 #define sayNO_SILENT goto no_silent
3533
3534 /* we dont use STMT_START/END here because it leads to
3535    "unreachable code" warnings, which are bogus, but distracting. */
3536 #define CACHEsayNO \
3537     if (ST.cache_mask) \
3538        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3539     sayNO
3540
3541 /* this is used to determine how far from the left messages like
3542    'failed...' are printed. It should be set such that messages
3543    are inline with the regop output that created them.
3544 */
3545 #define REPORT_CODE_OFF 32
3546
3547
3548 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3549 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3550 #define CHRTEST_NOT_A_CP_1 -999
3551 #define CHRTEST_NOT_A_CP_2 -998
3552
3553 /* grab a new slab and return the first slot in it */
3554
3555 STATIC regmatch_state *
3556 S_push_slab(pTHX)
3557 {
3558 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3559     dMY_CXT;
3560 #endif
3561     regmatch_slab *s = PL_regmatch_slab->next;
3562     if (!s) {
3563         Newx(s, 1, regmatch_slab);
3564         s->prev = PL_regmatch_slab;
3565         s->next = NULL;
3566         PL_regmatch_slab->next = s;
3567     }
3568     PL_regmatch_slab = s;
3569     return SLAB_FIRST(s);
3570 }
3571
3572
3573 /* push a new state then goto it */
3574
3575 #define PUSH_STATE_GOTO(state, node, input) \
3576     pushinput = input; \
3577     scan = node; \
3578     st->resume_state = state; \
3579     goto push_state;
3580
3581 /* push a new state with success backtracking, then goto it */
3582
3583 #define PUSH_YES_STATE_GOTO(state, node, input) \
3584     pushinput = input; \
3585     scan = node; \
3586     st->resume_state = state; \
3587     goto push_yes_state;
3588
3589
3590
3591
3592 /*
3593
3594 regmatch() - main matching routine
3595
3596 This is basically one big switch statement in a loop. We execute an op,
3597 set 'next' to point the next op, and continue. If we come to a point which
3598 we may need to backtrack to on failure such as (A|B|C), we push a
3599 backtrack state onto the backtrack stack. On failure, we pop the top
3600 state, and re-enter the loop at the state indicated. If there are no more
3601 states to pop, we return failure.
3602
3603 Sometimes we also need to backtrack on success; for example /A+/, where
3604 after successfully matching one A, we need to go back and try to
3605 match another one; similarly for lookahead assertions: if the assertion
3606 completes successfully, we backtrack to the state just before the assertion
3607 and then carry on.  In these cases, the pushed state is marked as
3608 'backtrack on success too'. This marking is in fact done by a chain of
3609 pointers, each pointing to the previous 'yes' state. On success, we pop to
3610 the nearest yes state, discarding any intermediate failure-only states.
3611 Sometimes a yes state is pushed just to force some cleanup code to be
3612 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3613 it to free the inner regex.
3614
3615 Note that failure backtracking rewinds the cursor position, while
3616 success backtracking leaves it alone.
3617
3618 A pattern is complete when the END op is executed, while a subpattern
3619 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3620 ops trigger the "pop to last yes state if any, otherwise return true"
3621 behaviour.
3622
3623 A common convention in this function is to use A and B to refer to the two
3624 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3625 the subpattern to be matched possibly multiple times, while B is the entire
3626 rest of the pattern. Variable and state names reflect this convention.
3627
3628 The states in the main switch are the union of ops and failure/success of
3629 substates associated with with that op.  For example, IFMATCH is the op
3630 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3631 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3632 successfully matched A and IFMATCH_A_fail is a state saying that we have
3633 just failed to match A. Resume states always come in pairs. The backtrack
3634 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3635 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3636 on success or failure.
3637
3638 The struct that holds a backtracking state is actually a big union, with
3639 one variant for each major type of op. The variable st points to the
3640 top-most backtrack struct. To make the code clearer, within each
3641 block of code we #define ST to alias the relevant union.
3642
3643 Here's a concrete example of a (vastly oversimplified) IFMATCH
3644 implementation:
3645
3646     switch (state) {
3647     ....
3648
3649 #define ST st->u.ifmatch
3650
3651     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3652         ST.foo = ...; // some state we wish to save
3653         ...
3654         // push a yes backtrack state with a resume value of
3655         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3656         // first node of A:
3657         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3658         // NOTREACHED
3659
3660     case IFMATCH_A: // we have successfully executed A; now continue with B
3661         next = B;
3662         bar = ST.foo; // do something with the preserved value
3663         break;
3664
3665     case IFMATCH_A_fail: // A failed, so the assertion failed
3666         ...;   // do some housekeeping, then ...
3667         sayNO; // propagate the failure
3668
3669 #undef ST
3670
3671     ...
3672     }
3673
3674 For any old-timers reading this who are familiar with the old recursive
3675 approach, the code above is equivalent to:
3676
3677     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3678     {
3679         int foo = ...
3680         ...
3681         if (regmatch(A)) {
3682             next = B;
3683             bar = foo;
3684             break;
3685         }
3686         ...;   // do some housekeeping, then ...
3687         sayNO; // propagate the failure
3688     }
3689
3690 The topmost backtrack state, pointed to by st, is usually free. If you
3691 want to claim it, populate any ST.foo fields in it with values you wish to
3692 save, then do one of
3693
3694         PUSH_STATE_GOTO(resume_state, node, newinput);
3695         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3696
3697 which sets that backtrack state's resume value to 'resume_state', pushes a
3698 new free entry to the top of the backtrack stack, then goes to 'node'.
3699 On backtracking, the free slot is popped, and the saved state becomes the
3700 new free state. An ST.foo field in this new top state can be temporarily
3701 accessed to retrieve values, but once the main loop is re-entered, it
3702 becomes available for reuse.
3703
3704 Note that the depth of the backtrack stack constantly increases during the
3705 left-to-right execution of the pattern, rather than going up and down with
3706 the pattern nesting. For example the stack is at its maximum at Z at the
3707 end of the pattern, rather than at X in the following:
3708
3709     /(((X)+)+)+....(Y)+....Z/
3710
3711 The only exceptions to this are lookahead/behind assertions and the cut,
3712 (?>A), which pop all the backtrack states associated with A before
3713 continuing.
3714
3715 Backtrack state structs are allocated in slabs of about 4K in size.
3716 PL_regmatch_state and st always point to the currently active state,
3717 and PL_regmatch_slab points to the slab currently containing
3718 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3719 allocated, and is never freed until interpreter destruction. When the slab
3720 is full, a new one is allocated and chained to the end. At exit from
3721 regmatch(), slabs allocated since entry are freed.
3722
3723 */
3724
3725
3726 #define DEBUG_STATE_pp(pp)                                  \
3727     DEBUG_STATE_r({                                         \
3728         DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3729         PerlIO_printf(Perl_debug_log,                       \
3730             "    %*s"pp" %s%s%s%s%s\n",                     \
3731             depth*2, "",                                    \
3732             PL_reg_name[st->resume_state],                  \
3733             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3734             ((st==yes_state) ? "Y" : ""),                   \
3735             ((st==mark_state) ? "M" : ""),                  \
3736             ((st==yes_state||st==mark_state) ? "]" : "")    \
3737         );                                                  \
3738     });
3739
3740
3741 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3742
3743 #ifdef DEBUGGING
3744
3745 STATIC void
3746 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3747     const char *start, const char *end, const char *blurb)
3748 {
3749     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3750
3751     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3752
3753     if (!PL_colorset)
3754             reginitcolors();
3755     {
3756         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3757             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3758
3759         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3760             start, end - start, 60);
3761
3762         PerlIO_printf(Perl_debug_log,
3763             "%s%s REx%s %s against %s\n",
3764                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3765
3766         if (utf8_target||utf8_pat)
3767             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3768                 utf8_pat ? "pattern" : "",
3769                 utf8_pat && utf8_target ? " and " : "",
3770                 utf8_target ? "string" : ""
3771             );
3772     }
3773 }
3774
3775 STATIC void
3776 S_dump_exec_pos(pTHX_ const char *locinput,
3777                       const regnode *scan,
3778                       const char *loc_regeol,
3779                       const char *loc_bostr,
3780                       const char *loc_reg_starttry,
3781                       const bool utf8_target)
3782 {
3783     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3784     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3785     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3786     /* The part of the string before starttry has one color
3787        (pref0_len chars), between starttry and current
3788        position another one (pref_len - pref0_len chars),
3789        after the current position the third one.
3790        We assume that pref0_len <= pref_len, otherwise we
3791        decrease pref0_len.  */
3792     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3793         ? (5 + taill) - l : locinput - loc_bostr;
3794     int pref0_len;
3795
3796     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3797
3798     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3799         pref_len++;
3800     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3801     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3802         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3803               ? (5 + taill) - pref_len : loc_regeol - locinput);
3804     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3805         l--;
3806     if (pref0_len < 0)
3807         pref0_len = 0;
3808     if (pref0_len > pref_len)
3809         pref0_len = pref_len;
3810     {
3811         const int is_uni = utf8_target ? 1 : 0;
3812
3813         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3814             (locinput - pref_len),pref0_len, 60, 4, 5);
3815
3816         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3817                     (locinput - pref_len + pref0_len),
3818                     pref_len - pref0_len, 60, 2, 3);
3819
3820         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3821                     locinput, loc_regeol - locinput, 10, 0, 1);
3822
3823         const STRLEN tlen=len0+len1+len2;
3824         PerlIO_printf(Perl_debug_log,
3825                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3826                     (IV)(locinput - loc_bostr),
3827                     len0, s0,
3828                     len1, s1,
3829                     (docolor ? "" : "> <"),
3830                     len2, s2,
3831                     (int)(tlen > 19 ? 0 :  19 - tlen),
3832                     "");
3833     }
3834 }
3835
3836 #endif
3837
3838 /* reg_check_named_buff_matched()
3839  * Checks to see if a named buffer has matched. The data array of
3840  * buffer numbers corresponding to the buffer is expected to reside
3841  * in the regexp->data->data array in the slot stored in the ARG() of
3842  * node involved. Note that this routine doesn't actually care about the
3843  * name, that information is not preserved from compilation to execution.
3844  * Returns the index of the leftmost defined buffer with the given name
3845  * or 0 if non of the buffers matched.
3846  */
3847 STATIC I32
3848 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3849 {
3850     I32 n;
3851     RXi_GET_DECL(rex,rexi);
3852     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3853     I32 *nums=(I32*)SvPVX(sv_dat);
3854
3855     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3856
3857     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3858         if ((I32)rex->lastparen >= nums[n] &&
3859             rex->offs[nums[n]].end != -1)
3860         {
3861             return nums[n];
3862         }
3863     }
3864     return 0;
3865 }
3866
3867
3868 static bool
3869 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3870         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3871 {
3872     /* This function determines if there are one or two characters that match
3873      * the first character of the passed-in EXACTish node <text_node>, and if
3874      * so, returns them in the passed-in pointers.
3875      *
3876      * If it determines that no possible character in the target string can
3877      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3878      * the first character in <text_node> requires UTF-8 to represent, and the
3879      * target string isn't in UTF-8.)
3880      *
3881      * If there are more than two characters that could match the beginning of
3882      * <text_node>, or if more context is required to determine a match or not,
3883      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3884      *
3885      * The motiviation behind this function is to allow the caller to set up
3886      * tight loops for matching.  If <text_node> is of type EXACT, there is
3887      * only one possible character that can match its first character, and so
3888      * the situation is quite simple.  But things get much more complicated if
3889      * folding is involved.  It may be that the first character of an EXACTFish
3890      * node doesn't participate in any possible fold, e.g., punctuation, so it
3891      * can be matched only by itself.  The vast majority of characters that are
3892      * in folds match just two things, their lower and upper-case equivalents.
3893      * But not all are like that; some have multiple possible matches, or match
3894      * sequences of more than one character.  This function sorts all that out.
3895      *
3896      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3897      * loop of trying to match A*, we know we can't exit where the thing
3898      * following it isn't a B.  And something can't be a B unless it is the
3899      * beginning of B.  By putting a quick test for that beginning in a tight
3900      * loop, we can rule out things that can't possibly be B without having to
3901      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3902      * character, we can make a tight loop matching A*, using the outputs of
3903      * this function.
3904      *
3905      * If the target string to match isn't in UTF-8, and there aren't
3906      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3907      * the one or two possible octets (which are characters in this situation)
3908      * that can match.  In all cases, if there is only one character that can
3909      * match, *<c1p> and *<c2p> will be identical.
3910      *
3911      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3912      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3913      * can match the beginning of <text_node>.  They should be declared with at
3914      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3915      * undefined what these contain.)  If one or both of the buffers are
3916      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3917      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3918      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3919      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3920      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3921
3922     const bool utf8_target = reginfo->is_utf8_target;
3923
3924     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3925     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3926     bool use_chrtest_void = FALSE;
3927     const bool is_utf8_pat = reginfo->is_utf8_pat;
3928
3929     /* Used when we have both utf8 input and utf8 output, to avoid converting
3930      * to/from code points */
3931     bool utf8_has_been_setup = FALSE;
3932
3933     dVAR;
3934
3935     U8 *pat = (U8*)STRING(text_node);
3936     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3937
3938     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
3939
3940         /* In an exact node, only one thing can be matched, that first
3941          * character.  If both the pat and the target are UTF-8, we can just
3942          * copy the input to the output, avoiding finding the code point of
3943          * that character */
3944         if (!is_utf8_pat) {
3945             c2 = c1 = *pat;
3946         }
3947         else if (utf8_target) {
3948             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3949             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3950             utf8_has_been_setup = TRUE;
3951         }
3952         else {
3953             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3954         }
3955     }
3956     else { /* an EXACTFish node */
3957         U8 *pat_end = pat + STR_LEN(text_node);
3958
3959         /* An EXACTFL node has at least some characters unfolded, because what
3960          * they match is not known until now.  So, now is the time to fold
3961          * the first few of them, as many as are needed to determine 'c1' and
3962          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3963          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3964          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3965          * need to fold as many characters as a single character can fold to,
3966          * so that later we can check if the first ones are such a multi-char
3967          * fold.  But, in such a pattern only locale-problematic characters
3968          * aren't folded, so we can skip this completely if the first character
3969          * in the node isn't one of the tricky ones */
3970         if (OP(text_node) == EXACTFL) {
3971
3972             if (! is_utf8_pat) {
3973                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3974                 {
3975                     folded[0] = folded[1] = 's';
3976                     pat = folded;
3977                     pat_end = folded + 2;
3978                 }
3979             }
3980             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3981                 U8 *s = pat;
3982                 U8 *d = folded;
3983                 int i;
3984
3985                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3986                     if (isASCII(*s)) {
3987                         *(d++) = (U8) toFOLD_LC(*s);
3988                         s++;
3989                     }
3990                     else {
3991                         STRLEN len;
3992                         _to_utf8_fold_flags(s,
3993                                             d,
3994                                             &len,
3995                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3996                         d += len;
3997                         s += UTF8SKIP(s);
3998                     }
3999                 }
4000
4001                 pat = folded;
4002                 pat_end = d;
4003             }
4004         }
4005
4006         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4007              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4008         {
4009             /* Multi-character folds require more context to sort out.  Also
4010              * PL_utf8_foldclosures used below doesn't handle them, so have to
4011              * be handled outside this routine */
4012             use_chrtest_void = TRUE;
4013         }
4014         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4015             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4016             if (c1 > 255) {
4017                 /* Load the folds hash, if not already done */
4018                 SV** listp;
4019                 if (! PL_utf8_foldclosures) {
4020                     _load_PL_utf8_foldclosures();
4021                 }
4022
4023                 /* The fold closures data structure is a hash with the keys
4024                  * being the UTF-8 of every character that is folded to, like
4025                  * 'k', and the values each an array of all code points that
4026                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4027                  * Multi-character folds are not included */
4028                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4029                                         (char *) pat,
4030                                         UTF8SKIP(pat),
4031                                         FALSE))))
4032                 {
4033                     /* Not found in the hash, therefore there are no folds
4034                     * containing it, so there is only a single character that
4035                     * could match */
4036                     c2 = c1;
4037                 }
4038                 else {  /* Does participate in folds */
4039                     AV* list = (AV*) *listp;
4040                     if (av_tindex(list) != 1) {
4041
4042                         /* If there aren't exactly two folds to this, it is
4043                          * outside the scope of this function */
4044                         use_chrtest_void = TRUE;
4045                     }
4046                     else {  /* There are two.  Get them */
4047                         SV** c_p = av_fetch(list, 0, FALSE);
4048                         if (c_p == NULL) {
4049                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4050                         }
4051                         c1 = SvUV(*c_p);
4052
4053                         c_p = av_fetch(list, 1, FALSE);
4054                         if (c_p == NULL) {
4055                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4056                         }
4057                         c2 = SvUV(*c_p);
4058
4059                         /* Folds that cross the 255/256 boundary are forbidden
4060                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4061                          * one is ASCIII.  Since the pattern character is above
4062                          * 255, and its only other match is below 256, the only
4063                          * legal match will be to itself.  We have thrown away
4064                          * the original, so have to compute which is the one
4065                          * above 255. */
4066                         if ((c1 < 256) != (c2 < 256)) {
4067                             if ((OP(text_node) == EXACTFL
4068                                  && ! IN_UTF8_CTYPE_LOCALE)
4069                                 || ((OP(text_node) == EXACTFA
4070                                     || OP(text_node) == EXACTFA_NO_TRIE)
4071                                     && (isASCII(c1) || isASCII(c2))))
4072                             {
4073                                 if (c1 < 256) {
4074                                     c1 = c2;
4075                                 }
4076                                 else {
4077                                     c2 = c1;
4078                                 }
4079                             }
4080                         }
4081                     }
4082                 }
4083             }
4084             else /* Here, c1 is <= 255 */
4085                 if (utf8_target
4086                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4087                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4088                     && ((OP(text_node) != EXACTFA
4089                         && OP(text_node) != EXACTFA_NO_TRIE)
4090                         || ! isASCII(c1)))
4091             {
4092                 /* Here, there could be something above Latin1 in the target
4093                  * which folds to this character in the pattern.  All such
4094                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4095                  * than two characters involved in their folds, so are outside
4096                  * the scope of this function */
4097                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4098                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4099                 }
4100                 else {
4101                     use_chrtest_void = TRUE;
4102                 }
4103             }
4104             else { /* Here nothing above Latin1 can fold to the pattern
4105                       character */
4106                 switch (OP(text_node)) {
4107
4108                     case EXACTFL:   /* /l rules */
4109                         c2 = PL_fold_locale[c1];
4110                         break;
4111
4112                     case EXACTF:   /* This node only generated for non-utf8
4113                                     patterns */
4114                         assert(! is_utf8_pat);
4115                         if (! utf8_target) {    /* /d rules */
4116                             c2 = PL_fold[c1];
4117                             break;
4118                         }
4119                         /* FALLTHROUGH */
4120                         /* /u rules for all these.  This happens to work for
4121                         * EXACTFA as nothing in Latin1 folds to ASCII */
4122                     case EXACTFA_NO_TRIE:   /* This node only generated for
4123                                             non-utf8 patterns */
4124                         assert(! is_utf8_pat);
4125                         /* FALLTHROUGH */
4126                     case EXACTFA:
4127                     case EXACTFU_SS:
4128                     case EXACTFU:
4129                         c2 = PL_fold_latin1[c1];
4130                         break;
4131
4132                     default:
4133                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4134                         NOT_REACHED; /* NOTREACHED */
4135                 }
4136             }
4137         }
4138     }
4139
4140     /* Here have figured things out.  Set up the returns */
4141     if (use_chrtest_void) {
4142         *c2p = *c1p = CHRTEST_VOID;
4143     }
4144     else if (utf8_target) {
4145         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4146             uvchr_to_utf8(c1_utf8, c1);
4147             uvchr_to_utf8(c2_utf8, c2);
4148         }
4149
4150         /* Invariants are stored in both the utf8 and byte outputs; Use
4151          * negative numbers otherwise for the byte ones.  Make sure that the
4152          * byte ones are the same iff the utf8 ones are the same */
4153         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4154         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4155                 ? *c2_utf8
4156                 : (c1 == c2)
4157                   ? CHRTEST_NOT_A_CP_1
4158                   : CHRTEST_NOT_A_CP_2;
4159     }
4160     else if (c1 > 255) {
4161        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4162                            can represent */
4163            return FALSE;
4164        }
4165
4166        *c1p = *c2p = c2;    /* c2 is the only representable value */
4167     }
4168     else {  /* c1 is representable; see about c2 */
4169        *c1p = c1;
4170        *c2p = (c2 < 256) ? c2 : c1;
4171     }
4172
4173     return TRUE;
4174 }
4175
4176 /* This creates a single number by combining two, with 'before' being like the
4177  * 10's digit, but this isn't necessarily base 10; it is base however many
4178  * elements of the enum there are */
4179 #define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
4180
4181 STATIC bool
4182 S_isGCB(const GCB_enum before, const GCB_enum after)
4183 {
4184     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4185      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4186
4187     switch (GCBcase(before, after)) {
4188
4189         /*  Break at the start and end of text.
4190             GB1.   sot ÷
4191             GB2.   ÷ eot
4192
4193             Break before and after controls except between CR and LF
4194             GB4.  ( Control | CR | LF )  ÷
4195             GB5.   ÷  ( Control | CR | LF )
4196
4197             Otherwise, break everywhere.
4198             GB10.  Any  ÷  Any */
4199         default:
4200             return TRUE;
4201
4202         /* Do not break between a CR and LF.
4203             GB3.  CR  ×  LF */
4204         case GCBcase(GCB_CR, GCB_LF):
4205             return FALSE;
4206
4207         /* Do not break Hangul syllable sequences.
4208             GB6.  L  ×  ( L | V | LV | LVT ) */
4209         case GCBcase(GCB_L, GCB_L):
4210         case GCBcase(GCB_L, GCB_V):
4211         case GCBcase(GCB_L, GCB_LV):
4212         case GCBcase(GCB_L, GCB_LVT):
4213             return FALSE;
4214
4215         /*  GB7.  ( LV | V )  ×  ( V | T ) */
4216         case GCBcase(GCB_LV, GCB_V):
4217         case GCBcase(GCB_LV, GCB_T):
4218         case GCBcase(GCB_V, GCB_V):
4219         case GCBcase(GCB_V, GCB_T):
4220             return FALSE;
4221
4222         /*  GB8.  ( LVT | T)  ×  T */
4223         case GCBcase(GCB_LVT, GCB_T):
4224         case GCBcase(GCB_T, GCB_T):
4225             return FALSE;
4226
4227         /* Do not break between regional indicator symbols.
4228             GB8a.  Regional_Indicator  ×  Regional_Indicator */
4229         case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
4230             return FALSE;
4231
4232         /* Do not break before extending characters.
4233             GB9.     ×  Extend */
4234         case GCBcase(GCB_Other, GCB_Extend):
4235         case GCBcase(GCB_Extend, GCB_Extend):
4236         case GCBcase(GCB_L, GCB_Extend):
4237         case GCBcase(GCB_LV, GCB_Extend):
4238         case GCBcase(GCB_LVT, GCB_Extend):
4239         case GCBcase(GCB_Prepend, GCB_Extend):
4240         case GCBcase(GCB_Regional_Indicator, GCB_Extend):
4241         case GCBcase(GCB_SpacingMark, GCB_Extend):
4242         case GCBcase(GCB_T, GCB_Extend):
4243         case GCBcase(GCB_V, GCB_Extend):
4244             return FALSE;
4245
4246         /* Do not break before SpacingMarks, or after Prepend characters.
4247             GB9a.     ×  SpacingMark */
4248         case GCBcase(GCB_Other, GCB_SpacingMark):
4249         case GCBcase(GCB_Extend, GCB_SpacingMark):
4250         case GCBcase(GCB_L, GCB_SpacingMark):
4251         case GCBcase(GCB_LV, GCB_SpacingMark):
4252         case GCBcase(GCB_LVT, GCB_SpacingMark):
4253         case GCBcase(GCB_Prepend, GCB_SpacingMark):
4254         case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
4255         case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
4256         case GCBcase(GCB_T, GCB_SpacingMark):
4257         case GCBcase(GCB_V, GCB_SpacingMark):
4258             return FALSE;
4259
4260         /* GB9b.  Prepend  ×   */
4261         case GCBcase(GCB_Prepend, GCB_Other):
4262         case GCBcase(GCB_Prepend, GCB_L):
4263         case GCBcase(GCB_Prepend, GCB_LV):
4264         case GCBcase(GCB_Prepend, GCB_LVT):
4265         case GCBcase(GCB_Prepend, GCB_Prepend):
4266         case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
4267         case GCBcase(GCB_Prepend, GCB_T):
4268         case GCBcase(GCB_Prepend, GCB_V):
4269             return FALSE;
4270     }
4271
4272     NOT_REACHED; /* NOTREACHED */
4273 }
4274
4275 #define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
4276
4277 STATIC bool
4278 S_isSB(pTHX_ SB_enum before,
4279              SB_enum after,
4280              const U8 * const strbeg,
4281              const U8 * const curpos,
4282              const U8 * const strend,
4283              const bool utf8_target)
4284 {
4285     /* returns a boolean indicating if there is a Sentence Boundary Break
4286      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4287
4288     U8 * lpos = (U8 *) curpos;
4289     U8 * temp_pos;
4290     SB_enum backup;
4291
4292     PERL_ARGS_ASSERT_ISSB;
4293
4294     /* Break at the start and end of text.
4295         SB1.  sot  ÷
4296         SB2.  ÷  eot */
4297     if (before == SB_EDGE || after == SB_EDGE) {
4298         return TRUE;
4299     }
4300
4301     /* SB 3: Do not break within CRLF. */
4302     if (before == SB_CR && after == SB_LF) {
4303         return FALSE;
4304     }
4305
4306     /* Break after paragraph separators.  (though why CR and LF are considered
4307      * so is beyond me (khw)
4308        SB4.  Sep | CR | LF  ÷ */
4309     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4310         return TRUE;
4311     }
4312
4313     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4314      * (See Section 6.2, Replacing Ignore Rules.)
4315         SB5.  X (Extend | Format)*  →  X */
4316     if (after == SB_Extend || after == SB_Format) {
4317         return FALSE;
4318     }
4319
4320     if (before == SB_Extend || before == SB_Format) {
4321         before = backup_one_SB(strbeg, &lpos, utf8_target);
4322     }
4323
4324     /* Do not break after ambiguous terminators like period, if they are
4325      * immediately followed by a number or lowercase letter, if they are
4326      * between uppercase letters, if the first following letter (optionally
4327      * after certain punctuation) is lowercase, or if they are followed by
4328      * "continuation" punctuation such as comma, colon, or semicolon. For
4329      * example, a period may be an abbreviation or numeric period, and thus may
4330      * not mark the end of a sentence.
4331
4332      * SB6. ATerm  ×  Numeric */
4333     if (before == SB_ATerm && after == SB_Numeric) {
4334         return FALSE;
4335     }
4336
4337     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4338     if (before == SB_ATerm && after == SB_Upper) {
4339         temp_pos = lpos;
4340         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4341         if (backup == SB_Upper || backup == SB_Lower) {
4342             return FALSE;
4343         }
4344     }
4345
4346     /* SB8a.  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4347      * SB10.  (STerm | ATerm) Close* Sp*  ×  ( Sp | Sep | CR | LF )      */
4348     backup = before;
4349     temp_pos = lpos;
4350     while (backup == SB_Sp) {
4351         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4352     }
4353     while (backup == SB_Close) {
4354         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4355     }
4356     if ((backup == SB_STerm || backup == SB_ATerm)
4357         && (   after == SB_SContinue
4358             || after == SB_STerm
4359             || after == SB_ATerm
4360             || after == SB_Sp
4361             || after == SB_Sep
4362             || after == SB_CR
4363             || after == SB_LF))
4364     {
4365         return FALSE;
4366     }
4367
4368     /* SB8.  ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
4369      *                                              STerm | ATerm) )* Lower */
4370     if (backup == SB_ATerm) {
4371         U8 * rpos = (U8 *) curpos;
4372         SB_enum later = after;
4373
4374         while (    later != SB_OLetter
4375                 && later != SB_Upper
4376                 && later != SB_Lower
4377                 && later != SB_Sep
4378                 && later != SB_CR
4379                 && later != SB_LF
4380                 && later != SB_STerm
4381                 && later != SB_ATerm
4382                 && later != SB_EDGE)
4383         {
4384             later = advance_one_SB(&rpos, strend, utf8_target);
4385         }
4386         if (later == SB_Lower) {
4387             return FALSE;
4388         }
4389     }
4390
4391     /* Break after sentence terminators, but include closing punctuation,
4392      * trailing spaces, and a paragraph separator (if present). [See note
4393      * below.]
4394      * SB9.  ( STerm | ATerm ) Close*  ×  ( Close | Sp | Sep | CR | LF ) */
4395     backup = before;
4396     temp_pos = lpos;
4397     while (backup == SB_Close) {
4398         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4399     }
4400     if ((backup == SB_STerm || backup == SB_ATerm)
4401         && (   after == SB_Close
4402             || after == SB_Sp
4403             || after == SB_Sep
4404             || after == SB_CR
4405             || after == SB_LF))
4406     {
4407         return FALSE;
4408     }
4409
4410
4411     /* SB11.  ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )?  ÷ */
4412     temp_pos = lpos;
4413     backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4414     if (   backup == SB_Sep
4415         || backup == SB_CR
4416         || backup == SB_LF)
4417     {
4418         lpos = temp_pos;
4419     }
4420     else {
4421         backup = before;
4422     }
4423     while (backup == SB_Sp) {
4424         backup = backup_one_SB(strbeg, &lpos, utf8_target);
4425     }
4426     while (backup == SB_Close) {
4427         backup = backup_one_SB(strbeg, &lpos, utf8_target);
4428     }
4429     if (backup == SB_STerm || backup == SB_ATerm) {
4430         return TRUE;
4431     }
4432
4433     /* Otherwise, do not break.
4434     SB12.  Any  ×  Any */
4435
4436     return FALSE;
4437 }
4438
4439 STATIC SB_enum
4440 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4441 {
4442     SB_enum sb;
4443
4444     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4445
4446     if (*curpos >= strend) {
4447         return SB_EDGE;
4448     }
4449
4450     if (utf8_target) {
4451         do {
4452             *curpos += UTF8SKIP(*curpos);
4453             if (*curpos >= strend) {
4454                 return SB_EDGE;
4455             }
4456             sb = getSB_VAL_UTF8(*curpos, strend);
4457         } while (sb == SB_Extend || sb == SB_Format);
4458     }
4459     else {
4460         do {
4461             (*curpos)++;
4462             if (*curpos >= strend) {
4463                 return SB_EDGE;
4464             }
4465             sb = getSB_VAL_CP(**curpos);
4466         } while (sb == SB_Extend || sb == SB_Format);
4467     }
4468
4469     return sb;
4470 }
4471
4472 STATIC SB_enum
4473 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4474 {
4475     SB_enum sb;
4476
4477     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4478
4479     if (*curpos < strbeg) {
4480         return SB_EDGE;
4481     }
4482
4483     if (utf8_target) {
4484         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4485         if (! prev_char_pos) {
4486             return SB_EDGE;
4487         }
4488
4489         /* Back up over Extend and Format.  curpos is always just to the right
4490          * of the characater whose value we are getting */
4491         do {
4492             U8 * prev_prev_char_pos;
4493             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4494                                                                       strbeg)))
4495             {
4496                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4497                 *curpos = prev_char_pos;
4498                 prev_char_pos = prev_prev_char_pos;
4499             }
4500             else {
4501                 *curpos = (U8 *) strbeg;
4502                 return SB_EDGE;
4503             }
4504         } while (sb == SB_Extend || sb == SB_Format);
4505     }
4506     else {
4507         do {
4508             if (*curpos - 2 < strbeg) {
4509                 *curpos = (U8 *) strbeg;
4510                 return SB_EDGE;
4511             }
4512             (*curpos)--;
4513             sb = getSB_VAL_CP(*(*curpos - 1));
4514         } while (sb == SB_Extend || sb == SB_Format);
4515     }
4516
4517     return sb;
4518 }
4519
4520 #define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
4521
4522 STATIC bool
4523 S_isWB(pTHX_ WB_enum previous,
4524              WB_enum before,
4525              WB_enum after,
4526              const U8 * const strbeg,
4527              const U8 * const curpos,
4528              const U8 * const strend,
4529              const bool utf8_target)
4530 {
4531     /*  Return a boolean as to if the boundary between 'before' and 'after' is
4532      *  a Unicode word break, using their published algorithm.  Context may be
4533      *  needed to make this determination.  If the value for the character
4534      *  before 'before' is known, it is passed as 'previous'; otherwise that
4535      *  should be set to WB_UNKNOWN.  The other input parameters give the
4536      *  boundaries and current position in the matching of the string.  That
4537      *  is, 'curpos' marks the position where the character whose wb value is
4538      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
4539
4540     U8 * before_pos = (U8 *) curpos;
4541     U8 * after_pos = (U8 *) curpos;
4542
4543     PERL_ARGS_ASSERT_ISWB;
4544
4545     /* WB1 and WB2: Break at the start and end of text. */
4546     if (before == WB_EDGE || after == WB_EDGE) {
4547         return TRUE;
4548     }
4549
4550     /* WB 3: Do not break within CRLF. */
4551     if (before == WB_CR && after == WB_LF) {
4552         return FALSE;
4553     }
4554
4555     /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
4556      * and LF) */
4557     if (   before == WB_CR || before == WB_LF || before == WB_Newline
4558         || after ==  WB_CR || after ==  WB_LF || after ==  WB_Newline)
4559     {
4560         return TRUE;
4561     }
4562
4563     /* Ignore Format and Extend characters, except when they appear at the
4564      * beginning of a region of text.
4565      * WB4.  X (Extend | Format)*  →  X. */
4566
4567     if (after == WB_Extend || after == WB_Format) {
4568         return FALSE;
4569     }
4570
4571     if (before == WB_Extend || before == WB_Format) {
4572         before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4573     }
4574
4575     switch (WBcase(before, after)) {
4576             /* Otherwise, break everywhere (including around ideographs).
4577                 WB14.  Any  ÷  Any */
4578             default:
4579                 return TRUE;
4580
4581             /* Do not break between most letters.
4582                 WB5.  (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
4583             case WBcase(WB_ALetter, WB_ALetter):
4584             case WBcase(WB_ALetter, WB_Hebrew_Letter):
4585             case WBcase(WB_Hebrew_Letter, WB_ALetter):
4586             case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
4587                 return FALSE;
4588
4589             /* Do not break letters across certain punctuation.
4590                 WB6.  (ALetter | Hebrew_Letter)
4591                         × (MidLetter | MidNumLet | Single_Quote) (ALetter
4592                                                             | Hebrew_Letter) */
4593             case WBcase(WB_ALetter, WB_MidLetter):
4594             case WBcase(WB_ALetter, WB_MidNumLet):
4595             case WBcase(WB_ALetter, WB_Single_Quote):
4596             case WBcase(WB_Hebrew_Letter, WB_MidLetter):
4597             case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
4598             /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
4599                 after = advance_one_WB(&after_pos, strend, utf8_target);
4600                 return after != WB_ALetter && after != WB_Hebrew_Letter;
4601
4602             /* WB7.  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
4603              *                    Single_Quote) ×  (ALetter | Hebrew_Letter) */
4604             case WBcase(WB_MidLetter, WB_ALetter):
4605             case WBcase(WB_MidLetter, WB_Hebrew_Letter):
4606             case WBcase(WB_MidNumLet, WB_ALetter):
4607             case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
4608             case WBcase(WB_Single_Quote, WB_ALetter):
4609             case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
4610                 before
4611                   = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4612                 return before != WB_ALetter && before != WB_Hebrew_Letter;
4613
4614             /* WB7a.  Hebrew_Letter  ×  Single_Quote */
4615             case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
4616                 return FALSE;
4617
4618             /* WB7b.  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
4619             case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
4620                 return advance_one_WB(&after_pos, strend, utf8_target)
4621                                                         != WB_Hebrew_Letter;
4622
4623             /* WB7c.  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
4624             case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
4625                 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4626                                                         != WB_Hebrew_Letter;
4627
4628             /* Do not break within sequences of digits, or digits adjacent to
4629              * letters (“3a”, or “A3”).
4630                 WB8.  Numeric  ×  Numeric */
4631             case WBcase(WB_Numeric, WB_Numeric):
4632                 return FALSE;
4633
4634             /* WB9.  (ALetter | Hebrew_Letter)  ×  Numeric */
4635             case WBcase(WB_ALetter, WB_Numeric):
4636             case WBcase(WB_Hebrew_Letter, WB_Numeric):
4637                 return FALSE;
4638
4639             /* WB10.  Numeric  ×  (ALetter | Hebrew_Letter) */
4640             case WBcase(WB_Numeric, WB_ALetter):
4641             case WBcase(WB_Numeric, WB_Hebrew_Letter):
4642                 return FALSE;
4643
4644             /* Do not break within sequences, such as “3.2” or “3,456.789”.
4645                 WB11.   Numeric (MidNum | MidNumLet | Single_Quote)  ×  Numeric
4646              */
4647             case WBcase(WB_MidNum, WB_Numeric):
4648             case WBcase(WB_MidNumLet, WB_Numeric):
4649             case WBcase(WB_Single_Quote, WB_Numeric):
4650                 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4651                                                                != WB_Numeric;
4652
4653             /*  WB12.   Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
4654              *  */
4655             case WBcase(WB_Numeric, WB_MidNum):
4656             case WBcase(WB_Numeric, WB_MidNumLet):
4657             case WBcase(WB_Numeric, WB_Single_Quote):
4658                 return advance_one_WB(&after_pos, strend, utf8_target)
4659                                                                != WB_Numeric;
4660
4661             /* Do not break between Katakana.
4662                WB13.  Katakana  ×  Katakana */
4663             case WBcase(WB_Katakana, WB_Katakana):
4664                 return FALSE;
4665
4666             /* Do not break from extenders.
4667                WB13a.  (ALetter | Hebrew_Letter | Numeric | Katakana |
4668                                             ExtendNumLet)  ×  ExtendNumLet */
4669             case WBcase(WB_ALetter, WB_ExtendNumLet):
4670             case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
4671             case WBcase(WB_Numeric, WB_ExtendNumLet):
4672             case WBcase(WB_Katakana, WB_ExtendNumLet):
4673             case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
4674                 return FALSE;
4675
4676             /* WB13b.  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric
4677              *                                                 | Katakana) */
4678             case WBcase(WB_ExtendNumLet, WB_ALetter):
4679             case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
4680             case WBcase(WB_ExtendNumLet, WB_Numeric):
4681             case WBcase(WB_ExtendNumLet, WB_Katakana):
4682                 return FALSE;
4683
4684             /* Do not break between regional indicator symbols.
4685                WB13c.  Regional_Indicator  ×  Regional_Indicator */
4686             case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
4687                 return FALSE;
4688
4689     }
4690
4691     NOT_REACHED; /* NOTREACHED */
4692 }
4693
4694 STATIC WB_enum
4695 S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4696 {
4697     WB_enum wb;
4698
4699     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
4700
4701     if (*curpos >= strend) {
4702         return WB_EDGE;
4703     }
4704
4705     if (utf8_target) {
4706
4707         /* Advance over Extend and Format */
4708         do {
4709             *curpos += UTF8SKIP(*curpos);
4710             if (*curpos >= strend) {
4711                 return WB_EDGE;
4712             }
4713             wb = getWB_VAL_UTF8(*curpos, strend);
4714         } while (wb == WB_Extend || wb == WB_Format);
4715     }
4716     else {
4717         do {
4718             (*curpos)++;
4719             if (*curpos >= strend) {
4720                 return WB_EDGE;
4721             }
4722             wb = getWB_VAL_CP(**curpos);
4723         } while (wb == WB_Extend || wb == WB_Format);
4724     }
4725
4726     return wb;
4727 }
4728
4729 STATIC WB_enum
4730 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4731 {
4732     WB_enum wb;
4733
4734     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
4735
4736     /* If we know what the previous character's break value is, don't have
4737         * to look it up */
4738     if (*previous != WB_UNKNOWN) {
4739         wb = *previous;
4740
4741         /* But we need to move backwards by one */
4742         if (utf8_target) {
4743             *curpos = reghopmaybe3(*curpos, -1, strbeg);
4744             if (! *curpos) {
4745                 *previous = WB_EDGE;
4746                 *curpos = (U8 *) strbeg;
4747             }
4748             else {
4749                 *previous = WB_UNKNOWN;
4750             }
4751         }
4752         else {
4753             (*curpos)--;
4754             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
4755         }
4756
4757         /* And we always back up over these two types */
4758         if (wb != WB_Extend && wb != WB_Format) {
4759             return wb;
4760         }
4761     }
4762
4763     if (*curpos < strbeg) {
4764         return WB_EDGE;
4765     }
4766
4767     if (utf8_target) {
4768         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4769         if (! prev_char_pos) {
4770             return WB_EDGE;
4771         }
4772
4773         /* Back up over Extend and Format.  curpos is always just to the right
4774          * of the characater whose value we are getting */
4775         do {
4776             U8 * prev_prev_char_pos;
4777             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
4778                                                    -1,
4779                                                    strbeg)))
4780             {
4781                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4782                 *curpos = prev_char_pos;
4783                 prev_char_pos = prev_prev_char_pos;
4784             }
4785             else {
4786                 *curpos = (U8 *) strbeg;
4787                 return WB_EDGE;
4788             }
4789         } while (wb == WB_Extend || wb == WB_Format);
4790     }
4791     else {
4792         do {
4793             if (*curpos - 2 < strbeg) {
4794                 *curpos = (U8 *) strbeg;
4795                 return WB_EDGE;
4796             }
4797             (*curpos)--;
4798             wb = getWB_VAL_CP(*(*curpos - 1));
4799         } while (wb == WB_Extend || wb == WB_Format);
4800     }
4801
4802     return wb;
4803 }
4804
4805 /* returns -1 on failure, $+[0] on success */
4806 STATIC SSize_t
4807 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
4808 {
4809 #if PERL_VERSION < 9 && !defined(PERL_CORE)
4810     dMY_CXT;
4811 #endif
4812     dVAR;
4813     const bool utf8_target = reginfo->is_utf8_target;
4814     const U32 uniflags = UTF8_ALLOW_DEFAULT;
4815     REGEXP *rex_sv = reginfo->prog;
4816     regexp *rex = ReANY(rex_sv);
4817     RXi_GET_DECL(rex,rexi);
4818     /* the current state. This is a cached copy of PL_regmatch_state */
4819     regmatch_state *st;
4820     /* cache heavy used fields of st in registers */
4821     regnode *scan;
4822     regnode *next;
4823     U32 n = 0;  /* general value; init to avoid compiler warning */
4824     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
4825     char *locinput = startpos;
4826     char *pushinput; /* where to continue after a PUSH */
4827     I32 nextchr;   /* is always set to UCHARAT(locinput) */
4828
4829     bool result = 0;        /* return value of S_regmatch */
4830     int depth = 0;          /* depth of backtrack stack */
4831     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
4832     const U32 max_nochange_depth =
4833         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
4834         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
4835     regmatch_state *yes_state = NULL; /* state to pop to on success of
4836                                                             subpattern */
4837     /* mark_state piggy backs on the yes_state logic so that when we unwind
4838        the stack on success we can update the mark_state as we go */
4839     regmatch_state *mark_state = NULL; /* last mark state we have seen */
4840     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
4841     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
4842     U32 state_num;
4843     bool no_final = 0;      /* prevent failure from backtracking? */
4844     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
4845     char *startpoint = locinput;
4846     SV *popmark = NULL;     /* are we looking for a mark? */
4847     SV *sv_commit = NULL;   /* last mark name seen in failure */
4848     SV *sv_yes_mark = NULL; /* last mark name we have seen
4849                                during a successful match */
4850     U32 lastopen = 0;       /* last open we saw */
4851     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4852     SV* const oreplsv = GvSVn(PL_replgv);
4853     /* these three flags are set by various ops to signal information to
4854      * the very next op. They have a useful lifetime of exactly one loop
4855      * iteration, and are not preserved or restored by state pushes/pops
4856      */
4857     bool sw = 0;            /* the condition value in (?(cond)a|b) */
4858     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
4859     int logical = 0;        /* the following EVAL is:
4860                                 0: (?{...})
4861                                 1: (?(?{...})X|Y)
4862                                 2: (??{...})
4863                                or the following IFMATCH/UNLESSM is:
4864                                 false: plain (?=foo)
4865                                 true:  used as a condition: (?(?=foo))
4866                             */
4867     PAD* last_pad = NULL;
4868     dMULTICALL;
4869     I32 gimme = G_SCALAR;
4870     CV *caller_cv = NULL;       /* who called us */
4871     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
4872     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
4873     U32 maxopenparen = 0;       /* max '(' index seen so far */
4874     int to_complement;  /* Invert the result? */
4875     _char_class_number classnum;
4876     bool is_utf8_pat = reginfo->is_utf8_pat;
4877     bool match = FALSE;
4878
4879
4880 #ifdef DEBUGGING
4881     GET_RE_DEBUG_FLAGS_DECL;
4882 #endif
4883
4884     /* protect against undef(*^R) */
4885     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4886
4887     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4888     multicall_oldcatch = 0;
4889     multicall_cv = NULL;
4890     cx = NULL;
4891     PERL_UNUSED_VAR(multicall_cop);
4892     PERL_UNUSED_VAR(newsp);
4893
4894
4895     PERL_ARGS_ASSERT_REGMATCH;
4896
4897     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
4898             PerlIO_printf(Perl_debug_log,"regmatch start\n");
4899     }));
4900
4901     st = PL_regmatch_state;
4902
4903     /* Note that nextchr is a byte even in UTF */
4904     SET_nextchr;
4905     scan = prog;
4906     while (scan != NULL) {
4907
4908         DEBUG_EXECUTE_r( {
4909             SV * const prop = sv_newmortal();
4910             regnode *rnext=regnext(scan);
4911             DUMP_EXEC_POS( locinput, scan, utf8_target );
4912             regprop(rex, prop, scan, reginfo, NULL);
4913
4914             PerlIO_printf(Perl_debug_log,
4915                     "%3"IVdf":%*s%s(%"IVdf")\n",
4916                     (IV)(scan - rexi->program), depth*2, "",
4917                     SvPVX_const(prop),
4918                     (PL_regkind[OP(scan)] == END || !rnext) ?
4919                         0 : (IV)(rnext - rexi->program));
4920         });
4921
4922         next = scan + NEXT_OFF(scan);
4923         if (next == scan)
4924             next = NULL;
4925         state_num = OP(scan);
4926
4927       reenter_switch:
4928         to_complement = 0;
4929
4930         SET_nextchr;
4931         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4932
4933         switch (state_num) {
4934         case SBOL: /*  /^../ and /\A../  */
4935             if (locinput == reginfo->strbeg)
4936                 break;
4937             sayNO;
4938
4939         case MBOL: /*  /^../m  */
4940             if (locinput == reginfo->strbeg ||
4941                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4942             {
4943                 break;
4944             }
4945             sayNO;
4946
4947         case GPOS: /*  \G  */
4948             if (locinput == reginfo->ganch)
4949                 break;
4950             sayNO;
4951
4952         case KEEPS: /*   \K  */
4953             /* update the startpoint */
4954             st->u.keeper.val = rex->offs[0].start;
4955             rex->offs[0].start = locinput - reginfo->strbeg;
4956             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4957             /* NOTREACHED */
4958             NOT_REACHED; /* NOTREACHED */
4959
4960         case KEEPS_next_fail:
4961             /* rollback the start point change */
4962             rex->offs[0].start = st->u.keeper.val;
4963             sayNO_SILENT;
4964             /* NOTREACHED */
4965             NOT_REACHED; /* NOTREACHED */
4966
4967         case MEOL: /* /..$/m  */
4968             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4969                 sayNO;
4970             break;
4971
4972         case SEOL: /* /..$/  */
4973             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4974                 sayNO;
4975             if (reginfo->strend - locinput > 1)
4976                 sayNO;
4977             break;
4978
4979         case EOS: /*  \z  */
4980             if (!NEXTCHR_IS_EOS)
4981                 sayNO;
4982             break;
4983
4984         case SANY: /*  /./s  */
4985             if (NEXTCHR_IS_EOS)
4986                 sayNO;
4987             goto increment_locinput;
4988
4989         case REG_ANY: /*  /./  */
4990             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4991                 sayNO;
4992             goto increment_locinput;
4993
4994
4995 #undef  ST
4996 #define ST st->u.trie
4997         case TRIEC: /* (ab|cd) with known charclass */
4998             /* In this case the charclass data is available inline so
4999                we can fail fast without a lot of extra overhead.
5000              */
5001             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5002                 DEBUG_EXECUTE_r(
5003                     PerlIO_printf(Perl_debug_log,
5004                               "%*s  %sfailed to match trie start class...%s\n",
5005                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5006                 );
5007                 sayNO_SILENT;
5008                 /* NOTREACHED */
5009                 NOT_REACHED; /* NOTREACHED */
5010             }
5011             /* FALLTHROUGH */
5012         case TRIE:  /* (ab|cd)  */
5013             /* the basic plan of execution of the trie is:
5014              * At the beginning, run though all the states, and
5015              * find the longest-matching word. Also remember the position
5016              * of the shortest matching word. For example, this pattern:
5017              *    1  2 3 4    5
5018              *    ab|a|x|abcd|abc
5019              * when matched against the string "abcde", will generate
5020              * accept states for all words except 3, with the longest
5021              * matching word being 4, and the shortest being 2 (with
5022              * the position being after char 1 of the string).
5023              *
5024              * Then for each matching word, in word order (i.e. 1,2,4,5),
5025              * we run the remainder of the pattern; on each try setting
5026              * the current position to the character following the word,
5027              * returning to try the next word on failure.
5028              *
5029              * We avoid having to build a list of words at runtime by
5030              * using a compile-time structure, wordinfo[].prev, which
5031              * gives, for each word, the previous accepting word (if any).
5032              * In the case above it would contain the mappings 1->2, 2->0,
5033              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5034              * the longest word (4 above), a list of all words, by
5035              * following the list of prev pointers; this gives us the
5036              * unordered list 4,5,1,2. Then given the current word we have
5037              * just tried, we can go through the list and find the
5038              * next-biggest word to try (so if we just failed on word 2,
5039              * the next in the list is 4).
5040              *
5041              * Since at runtime we don't record the matching position in
5042              * the string for each word, we have to work that out for
5043              * each word we're about to process. The wordinfo table holds
5044              * the character length of each word; given that we recorded
5045              * at the start: the position of the shortest word and its
5046              * length in chars, we just need to move the pointer the
5047              * difference between the two char lengths. Depending on
5048              * Unicode status and folding, that's cheap or expensive.
5049              *
5050              * This algorithm is optimised for the case where are only a
5051              * small number of accept states, i.e. 0,1, or maybe 2.
5052              * With lots of accepts states, and having to try all of them,
5053              * it becomes quadratic on number of accept states to find all
5054              * the next words.
5055              */
5056
5057             {
5058                 /* what type of TRIE am I? (utf8 makes this contextual) */
5059                 DECL_TRIE_TYPE(scan);
5060
5061                 /* what trie are we using right now */
5062                 reg_trie_data * const trie
5063                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5064                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5065                 U32 state = trie->startstate;
5066
5067                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5068                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5069                     if (utf8_target
5070                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5071                         && scan->flags == EXACTL)
5072                     {
5073                         /* We only output for EXACTL, as we let the folder
5074                          * output this message for EXACTFLU8 to avoid
5075                          * duplication */
5076                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5077                                                                reginfo->strend);
5078                     }
5079                 }
5080                 if (   trie->bitmap
5081                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5082                 {
5083                     if (trie->states[ state ].wordnum) {
5084                          DEBUG_EXECUTE_r(
5085                             PerlIO_printf(Perl_debug_log,
5086                                           "%*s  %smatched empty string...%s\n",
5087                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5088                         );
5089                         if (!trie->jump)
5090                             break;
5091                     } else {
5092                         DEBUG_EXECUTE_r(
5093                             PerlIO_printf(Perl_debug_log,
5094                                           "%*s  %sfailed to match trie start class...%s\n",
5095                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5096                         );
5097                         sayNO_SILENT;
5098                    }
5099                 }
5100
5101             {
5102                 U8 *uc = ( U8* )locinput;
5103
5104                 STRLEN len = 0;
5105                 STRLEN foldlen = 0;
5106                 U8 *uscan = (U8*)NULL;
5107                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5108                 U32 charcount = 0; /* how many input chars we have matched */
5109                 U32 accepted = 0; /* have we seen any accepting states? */
5110
5111                 ST.jump = trie->jump;
5112                 ST.me = scan;
5113                 ST.firstpos = NULL;
5114                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5115                 ST.nextword = 0;
5116
5117                 /* fully traverse the TRIE; note the position of the
5118                    shortest accept state and the wordnum of the longest
5119                    accept state */
5120
5121                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5122                     U32 base = trie->states[ state ].trans.base;
5123                     UV uvc = 0;
5124                     U16 charid = 0;
5125                     U16 wordnum;
5126                     wordnum = trie->states[ state ].wordnum;
5127
5128                     if (wordnum) { /* it's an accept state */
5129                         if (!accepted) {
5130                             accepted = 1;
5131                             /* record first match position */
5132                             if (ST.longfold) {
5133                                 ST.firstpos = (U8*)locinput;
5134                                 ST.firstchars = 0;
5135                             }
5136                             else {
5137                                 ST.firstpos = uc;
5138                                 ST.firstchars = charcount;
5139                             }
5140                         }
5141                         if (!ST.nextword || wordnum < ST.nextword)
5142                             ST.nextword = wordnum;
5143                         ST.topword = wordnum;
5144                     }
5145
5146                     DEBUG_TRIE_EXECUTE_r({
5147                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
5148                                 PerlIO_printf( Perl_debug_log,
5149                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
5150                                     2+depth * 2, "", PL_colors[4],
5151                                     (UV)state, (accepted ? 'Y' : 'N'));
5152                     });
5153
5154                     /* read a char and goto next state */
5155                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5156                         I32 offset;
5157                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5158                                              uscan, len, uvc, charid, foldlen,
5159                                              foldbuf, uniflags);
5160                         charcount++;
5161                         if (foldlen>0)
5162                             ST.longfold = TRUE;
5163                         if (charid &&
5164                              ( ((offset =
5165                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5166
5167                              && ((U32)offset < trie->lasttrans)
5168                              && trie->trans[offset].check == state)
5169                         {
5170                             state = trie->trans[offset].next;
5171                         }
5172                         else {
5173                             state = 0;
5174                         }
5175                         uc += len;
5176
5177                     }
5178                     else {
5179                         state = 0;
5180                     }
5181                     DEBUG_TRIE_EXECUTE_r(
5182                         PerlIO_printf( Perl_debug_log,
5183                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5184                             charid, uvc, (UV)state, PL_colors[5] );
5185                     );
5186                 }
5187                 if (!accepted)
5188                    sayNO;
5189
5190                 /* calculate total number of accept states */
5191                 {
5192                     U16 w = ST.topword;
5193                     accepted = 0;
5194                     while (w) {
5195                         w = trie->wordinfo[w].prev;
5196                         accepted++;
5197                     }
5198                     ST.accepted = accepted;
5199                 }
5200
5201                 DEBUG_EXECUTE_r(
5202                     PerlIO_printf( Perl_debug_log,
5203                         "%*s  %sgot %"IVdf" possible matches%s\n",
5204                         REPORT_CODE_OFF + depth * 2, "",
5205                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5206                 );
5207                 goto trie_first_try; /* jump into the fail handler */
5208             }}
5209             /* NOTREACHED */
5210             NOT_REACHED; /* NOTREACHED */
5211
5212         case TRIE_next_fail: /* we failed - try next alternative */
5213         {
5214             U8 *uc;
5215             if ( ST.jump) {
5216                 REGCP_UNWIND(ST.cp);
5217                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5218             }
5219             if (!--ST.accepted) {
5220                 DEBUG_EXECUTE_r({
5221                     PerlIO_printf( Perl_debug_log,
5222                         "%*s  %sTRIE failed...%s\n",
5223                         REPORT_CODE_OFF+depth*2, "",
5224                         PL_colors[4],
5225                         PL_colors[5] );
5226                 });
5227                 sayNO_SILENT;
5228             }
5229             {
5230                 /* Find next-highest word to process.  Note that this code
5231                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5232                 U16 min = 0;
5233                 U16 word;
5234                 U16 const nextword = ST.nextword;
5235                 reg_trie_wordinfo * const wordinfo
5236                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5237                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5238                     if (word > nextword && (!min || word < min))
5239                         min = word;
5240                 }
5241                 ST.nextword = min;
5242             }
5243
5244           trie_first_try:
5245             if (do_cutgroup) {
5246                 do_cutgroup = 0;
5247                 no_final = 0;
5248             }
5249
5250             if ( ST.jump) {
5251                 ST.lastparen = rex->lastparen;
5252                 ST.lastcloseparen = rex->lastcloseparen;
5253                 REGCP_SET(ST.cp);
5254             }
5255
5256             /* find start char of end of current word */
5257             {
5258                 U32 chars; /* how many chars to skip */
5259                 reg_trie_data * const trie
5260                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5261
5262                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5263                             >=  ST.firstchars);
5264                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5265                             - ST.firstchars;
5266                 uc = ST.firstpos;
5267
5268                 if (ST.longfold) {
5269                     /* the hard option - fold each char in turn and find
5270                      * its folded length (which may be different */
5271                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5272                     STRLEN foldlen;
5273                     STRLEN len;
5274                     UV uvc;
5275                     U8 *uscan;
5276
5277                     while (chars) {
5278                         if (utf8_target) {
5279                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5280                                                     uniflags);
5281                             uc += len;
5282                         }
5283                         else {
5284                             uvc = *uc;
5285                             uc++;
5286                         }
5287                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5288                         uscan = foldbuf;
5289                         while (foldlen) {
5290                             if (!--chars)
5291                                 break;
5292                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5293                                             uniflags);
5294                             uscan += len;
5295                             foldlen -= len;
5296                         }
5297                     }
5298                 }
5299                 else {
5300                     if (utf8_target)
5301                         while (chars--)
5302                             uc += UTF8SKIP(uc);
5303                     else
5304                         uc += chars;
5305                 }
5306             }
5307
5308             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5309                             ? ST.jump[ST.nextword]
5310                             : NEXT_OFF(ST.me));
5311
5312             DEBUG_EXECUTE_r({
5313                 PerlIO_printf( Perl_debug_log,
5314                     "%*s  %sTRIE matched word #%d, continuing%s\n",
5315                     REPORT_CODE_OFF+depth*2, "",
5316                     PL_colors[4],
5317                     ST.nextword,
5318                     PL_colors[5]
5319                     );
5320             });
5321
5322             if (ST.accepted > 1 || has_cutgroup) {
5323                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5324                 /* NOTREACHED */
5325                 NOT_REACHED; /* NOTREACHED */
5326             }
5327             /* only one choice left - just continue */
5328             DEBUG_EXECUTE_r({
5329                 AV *const trie_words
5330                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5331                 SV ** const tmp = trie_words
5332                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5333                 SV *sv= tmp ? sv_newmortal() : NULL;
5334
5335                 PerlIO_printf( Perl_debug_log,
5336                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
5337                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
5338                     ST.nextword,
5339                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5340                             PL_colors[0], PL_colors[1],
5341                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5342                         )
5343                     : "not compiled under -Dr",
5344                     PL_colors[5] );
5345             });
5346
5347             locinput = (char*)uc;
5348             continue; /* execute rest of RE */
5349             /* NOTREACHED */
5350         }
5351 #undef  ST
5352
5353         case EXACTL:             /*  /abc/l       */
5354             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5355
5356             /* Complete checking would involve going through every character
5357              * matched by the string to see if any is above latin1.  But the
5358              * comparision otherwise might very well be a fast assembly
5359              * language routine, and I (khw) don't think slowing things down
5360              * just to check for this warning is worth it.  So this just checks
5361              * the first character */
5362             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5363                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5364             }
5365             /* FALLTHROUGH */
5366         case EXACT: {            /*  /abc/        */
5367             char *s = STRING(scan);
5368             ln = STR_LEN(scan);
5369             if (utf8_target != is_utf8_pat) {
5370                 /* The target and the pattern have differing utf8ness. */
5371                 char *l = locinput;
5372                 const char * const e = s + ln;
5373
5374                 if (utf8_target) {
5375                     /* The target is utf8, the pattern is not utf8.
5376                      * Above-Latin1 code points can't match the pattern;
5377                      * invariants match exactly, and the other Latin1 ones need
5378                      * to be downgraded to a single byte in order to do the
5379                      * comparison.  (If we could be confident that the target
5380                      * is not malformed, this could be refactored to have fewer
5381                      * tests by just assuming that if the first bytes match, it
5382                      * is an invariant, but there are tests in the test suite
5383                      * dealing with (??{...}) which violate this) */
5384                     while (s < e) {
5385                         if (l >= reginfo->strend
5386                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5387                         {
5388                             sayNO;
5389                         }
5390                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5391                             if (*l != *s) {
5392                                 sayNO;
5393                             }
5394                             l++;
5395                         }
5396                         else {
5397                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5398                             {
5399                                 sayNO;
5400                             }
5401                             l += 2;
5402                         }
5403                         s++;
5404                     }
5405                 }
5406                 else {
5407                     /* The target is not utf8, the pattern is utf8. */
5408                     while (s < e) {
5409                         if (l >= reginfo->strend
5410                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5411                         {
5412                             sayNO;
5413                         }
5414                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5415                             if (*s != *l) {
5416                                 sayNO;
5417                             }
5418                             s++;
5419                         }
5420                         else {
5421                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5422                             {
5423                                 sayNO;
5424                             }
5425                             s += 2;
5426                         }
5427                         l++;
5428                     }
5429                 }
5430                 locinput = l;
5431             }
5432             else {
5433                 /* The target and the pattern have the same utf8ness. */
5434                 /* Inline the first character, for speed. */
5435                 if (reginfo->strend - locinput < ln
5436                     || UCHARAT(s) != nextchr
5437                     || (ln > 1 && memNE(s, locinput, ln)))
5438                 {
5439                     sayNO;
5440                 }
5441                 locinput += ln;
5442             }
5443             break;
5444             }
5445
5446         case EXACTFL: {          /*  /abc/il      */
5447             re_fold_t folder;
5448             const U8 * fold_array;
5449             const char * s;
5450             U32 fold_utf8_flags;
5451
5452             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5453             folder = foldEQ_locale;
5454             fold_array = PL_fold_locale;
5455             fold_utf8_flags = FOLDEQ_LOCALE;
5456             goto do_exactf;
5457
5458         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5459                                       is effectively /u; hence to match, target
5460                                       must be UTF-8. */
5461             if (! utf8_target) {
5462                 sayNO;
5463             }
5464             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5465                                              | FOLDEQ_S1_FOLDS_SANE;
5466             folder = foldEQ_latin1;
5467             fold_array = PL_fold_latin1;
5468             goto do_exactf;
5469
5470         case EXACTFU_SS:         /*  /\x{df}/iu   */
5471         case EXACTFU:            /*  /abc/iu      */
5472             folder = foldEQ_latin1;
5473             fold_array = PL_fold_latin1;
5474             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5475             goto do_exactf;
5476
5477         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5478                                    patterns */
5479             assert(! is_utf8_pat);
5480             /* FALLTHROUGH */
5481         case EXACTFA:            /*  /abc/iaa     */
5482             folder = foldEQ_latin1;
5483             fold_array = PL_fold_latin1;
5484             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5485             goto do_exactf;
5486
5487         case EXACTF:             /*  /abc/i    This node only generated for
5488                                                non-utf8 patterns */
5489             assert(! is_utf8_pat);
5490             folder = foldEQ;
5491             fold_array = PL_fold;
5492             fold_utf8_flags = 0;
5493
5494           do_exactf:
5495             s = STRING(scan);
5496             ln = STR_LEN(scan);
5497
5498             if (utf8_target
5499                 || is_utf8_pat
5500                 || state_num == EXACTFU_SS
5501                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
5502             {
5503               /* Either target or the pattern are utf8, or has the issue where
5504                * the fold lengths may differ. */
5505                 const char * const l = locinput;
5506                 char *e = reginfo->strend;
5507
5508                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
5509                                         l, &e, 0,  utf8_target, fold_utf8_flags))
5510                 {
5511                     sayNO;
5512                 }
5513                 locinput = e;
5514                 break;
5515             }
5516
5517             /* Neither the target nor the pattern are utf8 */
5518             if (UCHARAT(s) != nextchr
5519                 && !NEXTCHR_IS_EOS
5520                 && UCHARAT(s) != fold_array[nextchr])
5521             {
5522                 sayNO;
5523             }
5524             if (reginfo->strend - locinput < ln)
5525                 sayNO;
5526             if (ln > 1 && ! folder(s, locinput, ln))
5527                 sayNO;
5528             locinput += ln;
5529             break;
5530         }
5531
5532         case NBOUNDL: /*  /\B/l  */
5533             to_complement = 1;
5534             /* FALLTHROUGH */
5535
5536         case BOUNDL:  /*  /\b/l  */
5537         {
5538             bool b1, b2;
5539             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5540
5541             if (FLAGS(scan) != TRADITIONAL_BOUND) {
5542                 if (! IN_UTF8_CTYPE_LOCALE) {
5543                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
5544                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
5545                 }
5546                 goto boundu;
5547             }
5548
5549             if (utf8_target) {
5550                 if (locinput == reginfo->strbeg)
5551                     b1 = isWORDCHAR_LC('\n');
5552                 else {
5553                     b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
5554                                                         (U8*)(reginfo->strbeg)));
5555                 }
5556                 b2 = (NEXTCHR_IS_EOS)
5557                     ? isWORDCHAR_LC('\n')
5558                     : isWORDCHAR_LC_utf8((U8*)locinput);
5559             }
5560             else { /* Here the string isn't utf8 */
5561                 b1 = (locinput == reginfo->strbeg)
5562                      ? isWORDCHAR_LC('\n')
5563                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
5564                 b2 = (NEXTCHR_IS_EOS)
5565                     ? isWORDCHAR_LC('\n')
5566                     : isWORDCHAR_LC(nextchr);
5567             }
5568             if (to_complement ^ (b1 == b2)) {
5569                 sayNO;
5570             }
5571             break;
5572         }
5573
5574         case NBOUND:  /*  /\B/   */
5575             to_complement = 1;
5576             /* FALLTHROUGH */
5577
5578         case BOUND:   /*  /\b/   */
5579             if (utf8_target) {
5580                 goto bound_utf8;
5581             }
5582             goto bound_ascii_match_only;
5583
5584         case NBOUNDA: /*  /\B/a  */
5585             to_complement = 1;
5586             /* FALLTHROUGH */
5587
5588         case BOUNDA:  /*  /\b/a  */
5589         {
5590             bool b1, b2;
5591
5592           bound_ascii_match_only:
5593             /* Here the string isn't utf8, or is utf8 and only ascii characters
5594              * are to match \w.  In the latter case looking at the byte just
5595              * prior to the current one may be just the final byte of a
5596              * multi-byte character.  This is ok.  There are two cases:
5597              * 1) it is a single byte character, and then the test is doing
5598              *    just what it's supposed to.
5599              * 2) it is a multi-byte character, in which case the final byte is
5600              *    never mistakable for ASCII, and so the test will say it is
5601              *    not a word character, which is the correct answer. */
5602             b1 = (locinput == reginfo->strbeg)
5603                  ? isWORDCHAR_A('\n')
5604                  : isWORDCHAR_A(UCHARAT(locinput - 1));
5605             b2 = (NEXTCHR_IS_EOS)
5606                 ? isWORDCHAR_A('\n')
5607                 : isWORDCHAR_A(nextchr);
5608             if (to_complement ^ (b1 == b2)) {
5609                 sayNO;
5610             }
5611             break;
5612         }
5613
5614         case NBOUNDU: /*  /\B/u  */
5615             to_complement = 1;
5616             /* FALLTHROUGH */
5617
5618         case BOUNDU:  /*  /\b/u  */
5619
5620           boundu:
5621             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
5622                 match = FALSE;
5623             }
5624             else if (utf8_target) {
5625               bound_utf8:
5626                 switch((bound_type) FLAGS(scan)) {
5627                     case TRADITIONAL_BOUND:
5628                     {
5629                         bool b1, b2;
5630                         b1 = (locinput == reginfo->strbeg)
5631                              ? 0 /* isWORDCHAR_L1('\n') */
5632                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
5633                                                                 (U8*)(reginfo->strbeg)));
5634                         b2 = (NEXTCHR_IS_EOS)
5635                             ? 0 /* isWORDCHAR_L1('\n') */
5636                             : isWORDCHAR_utf8((U8*)locinput);
5637                         match = cBOOL(b1 != b2);
5638                         break;
5639                     }
5640                     case GCB_BOUND:
5641                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5642                             match = TRUE; /* GCB always matches at begin and
5643                                              end */
5644                         }
5645                         else {
5646                             /* Find the gcb values of previous and current
5647                              * chars, then see if is a break point */
5648                             match = isGCB(getGCB_VAL_UTF8(
5649                                                 reghop3((U8*)locinput,
5650                                                         -1,
5651                                                         (U8*)(reginfo->strbeg)),
5652                                                 (U8*) reginfo->strend),
5653                                           getGCB_VAL_UTF8((U8*) locinput,
5654                                                         (U8*) reginfo->strend));
5655                         }
5656                         break;
5657
5658                     case SB_BOUND: /* Always matches at begin and end */
5659                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5660                             match = TRUE;
5661                         }
5662                         else {
5663                             match = isSB(getSB_VAL_UTF8(
5664                                                 reghop3((U8*)locinput,
5665                                                         -1,
5666                                                         (U8*)(reginfo->strbeg)),
5667                                                 (U8*) reginfo->strend),
5668                                           getSB_VAL_UTF8((U8*) locinput,
5669                                                         (U8*) reginfo->strend),
5670                                           (U8*) reginfo->strbeg,
5671                                           (U8*) locinput,
5672                                           (U8*) reginfo->strend,
5673                                           utf8_target);
5674                         }
5675                         break;
5676
5677                     case WB_BOUND:
5678                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5679                             match = TRUE;
5680                         }
5681                         else {
5682                             match = isWB(WB_UNKNOWN,
5683                                          getWB_VAL_UTF8(
5684                                                 reghop3((U8*)locinput,
5685                                                         -1,
5686                                                         (U8*)(reginfo->strbeg)),
5687                                                 (U8*) reginfo->strend),
5688                                           getWB_VAL_UTF8((U8*) locinput,
5689                                                         (U8*) reginfo->strend),
5690                                           (U8*) reginfo->strbeg,
5691                                           (U8*) locinput,
5692                                           (U8*) reginfo->strend,
5693                                           utf8_target);
5694                         }
5695                         break;
5696                 }
5697             }
5698             else {  /* Not utf8 target */
5699                 switch((bound_type) FLAGS(scan)) {
5700                     case TRADITIONAL_BOUND:
5701                     {
5702                         bool b1, b2;
5703                         b1 = (locinput == reginfo->strbeg)
5704                             ? 0 /* isWORDCHAR_L1('\n') */
5705                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
5706                         b2 = (NEXTCHR_IS_EOS)
5707                             ? 0 /* isWORDCHAR_L1('\n') */
5708                             : isWORDCHAR_L1(nextchr);
5709                         match = cBOOL(b1 != b2);
5710                         break;
5711                     }
5712
5713                     case GCB_BOUND:
5714                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5715                             match = TRUE; /* GCB always matches at begin and
5716                                              end */
5717                         }
5718                         else {  /* Only CR-LF combo isn't a GCB in 0-255
5719                                    range */
5720                             match =    UCHARAT(locinput - 1) != '\r'
5721                                     || UCHARAT(locinput) != '\n';
5722                         }
5723                         break;
5724
5725                     case SB_BOUND: /* Always matches at begin and end */
5726                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5727                             match = TRUE;
5728                         }
5729                         else {
5730                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
5731                                          getSB_VAL_CP(UCHARAT(locinput)),
5732                                          (U8*) reginfo->strbeg,
5733                                          (U8*) locinput,
5734                                          (U8*) reginfo->strend,
5735                                          utf8_target);
5736                         }
5737                         break;
5738
5739                     case WB_BOUND:
5740                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5741                             match = TRUE;
5742                         }
5743                         else {
5744                             match = isWB(WB_UNKNOWN,
5745                                          getWB_VAL_CP(UCHARAT(locinput -1)),
5746                                          getWB_VAL_CP(UCHARAT(locinput)),
5747                                          (U8*) reginfo->strbeg,
5748                                          (U8*) locinput,
5749                                          (U8*) reginfo->strend,
5750                                          utf8_target);
5751                         }
5752                         break;
5753                 }
5754             }
5755
5756             if (to_complement ^ ! match) {
5757                 sayNO;
5758             }
5759             break;
5760
5761         case ANYOFL:  /*  /[abc]/l      */
5762             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5763
5764             if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE)
5765             {
5766               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
5767             }
5768             /* FALLTHROUGH */
5769         case ANYOFD:  /*   /[abc]/d       */
5770         case ANYOF:  /*   /[abc]/       */
5771             if (NEXTCHR_IS_EOS)
5772                 sayNO;
5773             if (utf8_target) {
5774                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
5775                                                                    utf8_target))
5776                     sayNO;
5777                 locinput += UTF8SKIP(locinput);
5778             }
5779             else {
5780                 if (!REGINCLASS(rex, scan, (U8*)locinput))
5781                     sayNO;
5782                 locinput++;
5783             }
5784             break;
5785
5786         /* The argument (FLAGS) to all the POSIX node types is the class number
5787          * */
5788
5789         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
5790             to_complement = 1;
5791             /* FALLTHROUGH */
5792
5793         case POSIXL:    /* \w or [:punct:] etc. under /l */
5794             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5795             if (NEXTCHR_IS_EOS)
5796                 sayNO;
5797
5798             /* Use isFOO_lc() for characters within Latin1.  (Note that
5799              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5800              * wouldn't be invariant) */
5801             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5802                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
5803                     sayNO;
5804                 }
5805             }
5806             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5807                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
5808                                            (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
5809                                                             *(locinput + 1))))))
5810                 {
5811                     sayNO;
5812                 }
5813             }
5814             else { /* Here, must be an above Latin-1 code point */
5815                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5816                 goto utf8_posix_above_latin1;
5817             }
5818
5819             /* Here, must be utf8 */
5820             locinput += UTF8SKIP(locinput);
5821             break;
5822
5823         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
5824             to_complement = 1;
5825             /* FALLTHROUGH */
5826
5827         case POSIXD:    /* \w or [:punct:] etc. under /d */
5828             if (utf8_target) {
5829                 goto utf8_posix;
5830             }
5831             goto posixa;
5832
5833         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
5834
5835             if (NEXTCHR_IS_EOS) {
5836                 sayNO;
5837             }
5838
5839             /* All UTF-8 variants match */
5840             if (! UTF8_IS_INVARIANT(nextchr)) {
5841                 goto increment_locinput;
5842             }
5843
5844             to_complement = 1;
5845             /* FALLTHROUGH */
5846
5847         case POSIXA:    /* \w or [:punct:] etc. under /a */
5848
5849           posixa:
5850             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
5851              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
5852              * character is a single byte */
5853
5854             if (NEXTCHR_IS_EOS
5855                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
5856                                                             FLAGS(scan)))))
5857             {
5858                 sayNO;
5859             }
5860
5861             /* Here we are either not in utf8, or we matched a utf8-invariant,
5862              * so the next char is the next byte */
5863             locinput++;
5864             break;
5865
5866         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
5867             to_complement = 1;
5868             /* FALLTHROUGH */
5869
5870         case POSIXU:    /* \w or [:punct:] etc. under /u */
5871           utf8_posix:
5872             if (NEXTCHR_IS_EOS) {
5873                 sayNO;
5874             }
5875
5876             /* Use _generic_isCC() for characters within Latin1.  (Note that
5877              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5878              * wouldn't be invariant) */
5879             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5880                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
5881                                                            FLAGS(scan)))))
5882                 {
5883                     sayNO;
5884                 }
5885                 locinput++;
5886             }
5887             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5888                 if (! (to_complement
5889                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
5890                                                                *(locinput + 1)),
5891                                              FLAGS(scan)))))
5892                 {
5893                     sayNO;
5894                 }
5895                 locinput += 2;
5896             }
5897             else {  /* Handle above Latin-1 code points */
5898               utf8_posix_above_latin1:
5899                 classnum = (_char_class_number) FLAGS(scan);
5900                 if (classnum < _FIRST_NON_SWASH_CC) {
5901
5902                     /* Here, uses a swash to find such code points.  Load if if
5903                      * not done already */
5904                     if (! PL_utf8_swash_ptrs[classnum]) {
5905                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
5906                         PL_utf8_swash_ptrs[classnum]
5907                                 = _core_swash_init("utf8",
5908                                         "",
5909                                         &PL_sv_undef, 1, 0,
5910                                         PL_XPosix_ptrs[classnum], &flags);
5911                     }
5912                     if (! (to_complement
5913                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
5914                                                (U8 *) locinput, TRUE))))
5915                     {
5916                         sayNO;
5917                     }
5918                 }
5919                 else {  /* Here, uses macros to find above Latin-1 code points */
5920                     switch (classnum) {
5921                         case _CC_ENUM_SPACE:
5922                             if (! (to_complement
5923                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
5924                             {
5925                                 sayNO;
5926                             }
5927                             break;
5928                         case _CC_ENUM_BLANK:
5929                             if (! (to_complement
5930                                             ^ cBOOL(is_HORIZWS_high(locinput))))
5931                             {
5932                                 sayNO;
5933                             }
5934                             break;
5935                         case _CC_ENUM_XDIGIT:
5936                             if (! (to_complement
5937                                             ^ cBOOL(is_XDIGIT_high(locinput))))
5938                             {
5939                                 sayNO;
5940                             }
5941                             break;
5942                         case _CC_ENUM_VERTSPACE:
5943                             if (! (to_complement
5944                                             ^ cBOOL(is_VERTWS_high(locinput))))
5945                             {
5946                                 sayNO;
5947                             }
5948                             break;
5949                         default:    /* The rest, e.g. [:cntrl:], can't match
5950                                        above Latin1 */
5951                             if (! to_complement) {
5952                                 sayNO;
5953                             }
5954                             break;
5955                     }
5956                 }
5957                 locinput += UTF8SKIP(locinput);
5958             }
5959             break;
5960
5961         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
5962                        a Unicode extended Grapheme Cluster */
5963             if (NEXTCHR_IS_EOS)
5964                 sayNO;
5965             if  (! utf8_target) {
5966
5967                 /* Match either CR LF  or '.', as all the other possibilities
5968                  * require utf8 */
5969                 locinput++;         /* Match the . or CR */
5970                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
5971                                        match the LF */
5972                     && locinput < reginfo->strend
5973                     && UCHARAT(locinput) == '\n')
5974                 {
5975                     locinput++;
5976                 }
5977             }
5978             else {
5979
5980                 /* Get the gcb type for the current character */
5981                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
5982                                                        (U8*) reginfo->strend);
5983
5984                 /* Then scan through the input until we get to the first
5985                  * character whose type is supposed to be a gcb with the
5986                  * current character.  (There is always a break at the
5987                  * end-of-input) */
5988                 locinput += UTF8SKIP(locinput);
5989                 while (locinput < reginfo->strend) {
5990                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
5991                                                          (U8*) reginfo->strend);
5992                     if (isGCB(prev_gcb, cur_gcb)) {
5993                         break;
5994                     }
5995
5996                     prev_gcb = cur_gcb;
5997                     locinput += UTF8SKIP(locinput);
5998                 }
5999
6000
6001             }
6002             break;
6003
6004         case NREFFL:  /*  /\g{name}/il  */
6005         {   /* The capture buffer cases.  The ones beginning with N for the
6006                named buffers just convert to the equivalent numbered and
6007                pretend they were called as the corresponding numbered buffer
6008                op.  */
6009             /* don't initialize these in the declaration, it makes C++
6010                unhappy */
6011             const char *s;
6012             char type;
6013             re_fold_t folder;
6014             const U8 *fold_array;
6015             UV utf8_fold_flags;
6016
6017             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6018             folder = foldEQ_locale;
6019             fold_array = PL_fold_locale;
6020             type = REFFL;
6021             utf8_fold_flags = FOLDEQ_LOCALE;
6022             goto do_nref;
6023
6024         case NREFFA:  /*  /\g{name}/iaa  */
6025             folder = foldEQ_latin1;
6026             fold_array = PL_fold_latin1;
6027             type = REFFA;
6028             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6029             goto do_nref;
6030
6031         case NREFFU:  /*  /\g{name}/iu  */
6032             folder = foldEQ_latin1;
6033             fold_array = PL_fold_latin1;
6034             type = REFFU;
6035             utf8_fold_flags = 0;
6036             goto do_nref;
6037
6038         case NREFF:  /*  /\g{name}/i  */
6039             folder = foldEQ;
6040             fold_array = PL_fold;
6041             type = REFF;
6042             utf8_fold_flags = 0;
6043             goto do_nref;
6044
6045         case NREF:  /*  /\g{name}/   */
6046             type = REF;
6047             folder = NULL;
6048             fold_array = NULL;
6049             utf8_fold_flags = 0;
6050           do_nref:
6051
6052             /* For the named back references, find the corresponding buffer
6053              * number */
6054             n = reg_check_named_buff_matched(rex,scan);
6055
6056             if ( ! n ) {
6057                 sayNO;
6058             }
6059             goto do_nref_ref_common;
6060
6061         case REFFL:  /*  /\1/il  */
6062             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6063             folder = foldEQ_locale;
6064             fold_array = PL_fold_locale;
6065             utf8_fold_flags = FOLDEQ_LOCALE;
6066             goto do_ref;
6067
6068         case REFFA:  /*  /\1/iaa  */
6069             folder = foldEQ_latin1;
6070             fold_array = PL_fold_latin1;
6071             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6072             goto do_ref;
6073
6074         case REFFU:  /*  /\1/iu  */
6075             folder = foldEQ_latin1;
6076             fold_array = PL_fold_latin1;
6077             utf8_fold_flags = 0;
6078             goto do_ref;
6079
6080         case REFF:  /*  /\1/i  */
6081             folder = foldEQ;
6082             fold_array = PL_fold;
6083             utf8_fold_flags = 0;
6084             goto do_ref;
6085
6086         case REF:  /*  /\1/    */
6087             folder = NULL;
6088             fold_array = NULL;
6089             utf8_fold_flags = 0;
6090
6091           do_ref:
6092             type = OP(scan);
6093             n = ARG(scan);  /* which paren pair */
6094
6095           do_nref_ref_common:
6096             ln = rex->offs[n].start;
6097             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6098             if (rex->lastparen < n || ln == -1)
6099                 sayNO;                  /* Do not match unless seen CLOSEn. */
6100             if (ln == rex->offs[n].end)
6101                 break;
6102
6103             s = reginfo->strbeg + ln;
6104             if (type != REF     /* REF can do byte comparison */
6105                 && (utf8_target || type == REFFU || type == REFFL))
6106             {
6107                 char * limit = reginfo->strend;
6108
6109                 /* This call case insensitively compares the entire buffer
6110                     * at s, with the current input starting at locinput, but
6111                     * not going off the end given by reginfo->strend, and
6112                     * returns in <limit> upon success, how much of the
6113                     * current input was matched */
6114                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6115                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6116                 {
6117                     sayNO;
6118                 }
6119                 locinput = limit;
6120                 break;
6121             }
6122
6123             /* Not utf8:  Inline the first character, for speed. */
6124             if (!NEXTCHR_IS_EOS &&
6125                 UCHARAT(s) != nextchr &&
6126                 (type == REF ||
6127                  UCHARAT(s) != fold_array[nextchr]))
6128                 sayNO;
6129             ln = rex->offs[n].end - ln;
6130             if (locinput + ln > reginfo->strend)
6131                 sayNO;
6132             if (ln > 1 && (type == REF
6133                            ? memNE(s, locinput, ln)
6134                            : ! folder(s, locinput, ln)))
6135                 sayNO;
6136             locinput += ln;
6137             break;
6138         }
6139
6140         case NOTHING: /* null op; e.g. the 'nothing' following
6141                        * the '*' in m{(a+|b)*}' */
6142             break;
6143         case TAIL: /* placeholder while compiling (A|B|C) */
6144             break;
6145
6146 #undef  ST
6147 #define ST st->u.eval
6148         {
6149             SV *ret;
6150             REGEXP *re_sv;
6151             regexp *re;
6152             regexp_internal *rei;
6153             regnode *startpoint;
6154
6155         case GOSTART: /*  (?R)  */
6156         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6157             if (cur_eval && cur_eval->locinput==locinput) {
6158                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
6159                     Perl_croak(aTHX_ "Infinite recursion in regex");
6160                 if ( ++nochange_depth > max_nochange_depth )
6161                     Perl_croak(aTHX_
6162                         "Pattern subroutine nesting without pos change"
6163                         " exceeded limit in regex");
6164             } else {
6165                 nochange_depth = 0;
6166             }
6167             re_sv = rex_sv;
6168             re = rex;
6169             rei = rexi;
6170             if (OP(scan)==GOSUB) {
6171                 startpoint = scan + ARG2L(scan);
6172                 ST.close_paren = ARG(scan);
6173             } else {
6174                 startpoint = rei->program+1;
6175                 ST.close_paren = 0;
6176             }
6177
6178             /* Save all the positions seen so far. */
6179             ST.cp = regcppush(rex, 0, maxopenparen);
6180             REGCP_SET(ST.lastcp);
6181
6182             /* and then jump to the code we share with EVAL */
6183             goto eval_recurse_doit;
6184             /* NOTREACHED */
6185
6186         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6187             if (cur_eval && cur_eval->locinput==locinput) {
6188                 if ( ++nochange_depth > max_nochange_depth )
6189                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6190             } else {
6191                 nochange_depth = 0;
6192             }
6193             {
6194                 /* execute the code in the {...} */
6195
6196                 dSP;
6197                 IV before;
6198                 OP * const oop = PL_op;
6199                 COP * const ocurcop = PL_curcop;
6200                 OP *nop;
6201                 CV *newcv;
6202
6203                 /* save *all* paren positions */
6204                 regcppush(rex, 0, maxopenparen);
6205                 REGCP_SET(runops_cp);
6206
6207                 if (!caller_cv)
6208                     caller_cv = find_runcv(NULL);
6209
6210                 n = ARG(scan);
6211
6212                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6213                     newcv = (ReANY(
6214                                                 (REGEXP*)(rexi->data->data[n])
6215                                             ))->qr_anoncv
6216                                         ;
6217                     nop = (OP*)rexi->data->data[n+1];
6218                 }
6219                 else if (rexi->data->what[n] == 'l') { /* literal code */
6220                     newcv = caller_cv;
6221                     nop = (OP*)rexi->data->data[n];
6222                     assert(CvDEPTH(newcv));
6223                 }
6224                 else {
6225                     /* literal with own CV */
6226                     assert(rexi->data->what[n] == 'L');
6227                     newcv = rex->qr_anoncv;
6228                     nop = (OP*)rexi->data->data[n];
6229                 }
6230
6231                 /* normally if we're about to execute code from the same
6232                  * CV that we used previously, we just use the existing
6233                  * CX stack entry. However, its possible that in the
6234                  * meantime we may have backtracked, popped from the save
6235                  * stack, and undone the SAVECOMPPAD(s) associated with
6236                  * PUSH_MULTICALL; in which case PL_comppad no longer
6237                  * points to newcv's pad. */
6238                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6239                 {
6240                     U8 flags = (CXp_SUB_RE |
6241                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6242                     if (last_pushed_cv) {
6243                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6244                     }
6245                     else {
6246                         PUSH_MULTICALL_FLAGS(newcv, flags);
6247                     }
6248                     last_pushed_cv = newcv;
6249                 }
6250                 else {
6251                     /* these assignments are just to silence compiler
6252                      * warnings */
6253                     multicall_cop = NULL;
6254                     newsp = NULL;
6255                 }
6256                 last_pad = PL_comppad;
6257
6258                 /* the initial nextstate you would normally execute
6259                  * at the start of an eval (which would cause error
6260                  * messages to come from the eval), may be optimised
6261                  * away from the execution path in the regex code blocks;
6262                  * so manually set PL_curcop to it initially */
6263                 {
6264                     OP *o = cUNOPx(nop)->op_first;
6265                     assert(o->op_type == OP_NULL);
6266                     if (o->op_targ == OP_SCOPE) {
6267                         o = cUNOPo->op_first;
6268                     }
6269                     else {
6270                         assert(o->op_targ == OP_LEAVE);
6271                         o = cUNOPo->op_first;
6272                         assert(o->op_type == OP_ENTER);
6273                         o = OpSIBLING(o);
6274                     }
6275
6276                     if (o->op_type != OP_STUB) {
6277                         assert(    o->op_type == OP_NEXTSTATE
6278                                 || o->op_type == OP_DBSTATE
6279                                 || (o->op_type == OP_NULL
6280                                     &&  (  o->op_targ == OP_NEXTSTATE
6281                                         || o->op_targ == OP_DBSTATE
6282                                         )
6283                                     )
6284                         );
6285                         PL_curcop = (COP*)o;
6286                     }
6287                 }
6288                 nop = nop->op_next;
6289
6290                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
6291                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6292
6293                 rex->offs[0].end = locinput - reginfo->strbeg;
6294                 if (reginfo->info_aux_eval->pos_magic)
6295                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6296                                   reginfo->sv, reginfo->strbeg,
6297                                   locinput - reginfo->strbeg);
6298
6299                 if (sv_yes_mark) {
6300                     SV *sv_mrk = get_sv("REGMARK", 1);
6301                     sv_setsv(sv_mrk, sv_yes_mark);
6302                 }
6303
6304                 /* we don't use MULTICALL here as we want to call the
6305                  * first op of the block of interest, rather than the
6306                  * first op of the sub */
6307                 before = (IV)(SP-PL_stack_base);
6308                 PL_op = nop;
6309                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6310                 SPAGAIN;
6311                 if ((IV)(SP-PL_stack_base) == before)
6312                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6313                 else {
6314                     ret = POPs;
6315                     PUTBACK;
6316                 }
6317
6318                 /* before restoring everything, evaluate the returned
6319                  * value, so that 'uninit' warnings don't use the wrong
6320                  * PL_op or pad. Also need to process any magic vars
6321                  * (e.g. $1) *before* parentheses are restored */
6322
6323                 PL_op = NULL;
6324
6325                 re_sv = NULL;
6326                 if (logical == 0)        /*   (?{})/   */
6327                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6328                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6329                     sw = cBOOL(SvTRUE(ret));
6330                     logical = 0;
6331                 }
6332                 else {                   /*  /(??{})  */
6333                     /*  if its overloaded, let the regex compiler handle
6334                      *  it; otherwise extract regex, or stringify  */
6335                     if (SvGMAGICAL(ret))
6336                         ret = sv_mortalcopy(ret);
6337                     if (!SvAMAGIC(ret)) {
6338                         SV *sv = ret;
6339                         if (SvROK(sv))
6340                             sv = SvRV(sv);
6341                         if (SvTYPE(sv) == SVt_REGEXP)
6342                             re_sv = (REGEXP*) sv;
6343                         else if (SvSMAGICAL(ret)) {
6344                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6345                             if (mg)
6346                                 re_sv = (REGEXP *) mg->mg_obj;
6347                         }
6348
6349                         /* force any undef warnings here */
6350                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6351                             ret = sv_mortalcopy(ret);
6352                             (void) SvPV_force_nolen(ret);
6353                         }
6354                     }
6355
6356                 }
6357
6358                 /* *** Note that at this point we don't restore
6359                  * PL_comppad, (or pop the CxSUB) on the assumption it may
6360                  * be used again soon. This is safe as long as nothing
6361                  * in the regexp code uses the pad ! */
6362                 PL_op = oop;
6363                 PL_curcop = ocurcop;
6364                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6365                 PL_curpm = PL_reg_curpm;
6366
6367                 if (logical != 2)
6368                     break;
6369             }
6370
6371                 /* only /(??{})/  from now on */
6372                 logical = 0;
6373                 {
6374                     /* extract RE object from returned value; compiling if
6375                      * necessary */
6376
6377                     if (re_sv) {
6378                         re_sv = reg_temp_copy(NULL, re_sv);
6379                     }
6380                     else {
6381                         U32 pm_flags = 0;
6382
6383                         if (SvUTF8(ret) && IN_BYTES) {
6384                             /* In use 'bytes': make a copy of the octet
6385                              * sequence, but without the flag on */
6386                             STRLEN len;
6387                             const char *const p = SvPV(ret, len);
6388                             ret = newSVpvn_flags(p, len, SVs_TEMP);
6389                         }
6390                         if (rex->intflags & PREGf_USE_RE_EVAL)
6391                             pm_flags |= PMf_USE_RE_EVAL;
6392
6393                         /* if we got here, it should be an engine which
6394                          * supports compiling code blocks and stuff */
6395                         assert(rex->engine && rex->engine->op_comp);
6396                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6397                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6398                                     rex->engine, NULL, NULL,
6399                                     /* copy /msixn etc to inner pattern */
6400                                     ARG2L(scan),
6401                                     pm_flags);
6402
6403                         if (!(SvFLAGS(ret)
6404                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
6405                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6406                             /* This isn't a first class regexp. Instead, it's
6407                                caching a regexp onto an existing, Perl visible
6408                                scalar.  */
6409                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6410                         }
6411                     }
6412                     SAVEFREESV(re_sv);
6413                     re = ReANY(re_sv);
6414                 }
6415                 RXp_MATCH_COPIED_off(re);
6416                 re->subbeg = rex->subbeg;
6417                 re->sublen = rex->sublen;
6418                 re->suboffset = rex->suboffset;
6419                 re->subcoffset = rex->subcoffset;
6420                 re->lastparen = 0;
6421                 re->lastcloseparen = 0;
6422                 rei = RXi_GET(re);
6423                 DEBUG_EXECUTE_r(
6424                     debug_start_match(re_sv, utf8_target, locinput,
6425                                     reginfo->strend, "Matching embedded");
6426                 );
6427                 startpoint = rei->program + 1;
6428                 ST.close_paren = 0; /* only used for GOSUB */
6429                 /* Save all the seen positions so far. */
6430                 ST.cp = regcppush(rex, 0, maxopenparen);
6431                 REGCP_SET(ST.lastcp);
6432                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
6433                 maxopenparen = 0;
6434                 /* run the pattern returned from (??{...}) */
6435
6436               eval_recurse_doit: /* Share code with GOSUB below this line
6437                             * At this point we expect the stack context to be
6438                             * set up correctly */
6439
6440                 /* invalidate the S-L poscache. We're now executing a
6441                  * different set of WHILEM ops (and their associated
6442                  * indexes) against the same string, so the bits in the
6443                  * cache are meaningless. Setting maxiter to zero forces
6444                  * the cache to be invalidated and zeroed before reuse.
6445                  * XXX This is too dramatic a measure. Ideally we should
6446                  * save the old cache and restore when running the outer
6447                  * pattern again */
6448                 reginfo->poscache_maxiter = 0;
6449
6450                 /* the new regexp might have a different is_utf8_pat than we do */
6451                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
6452
6453                 ST.prev_rex = rex_sv;
6454                 ST.prev_curlyx = cur_curlyx;
6455                 rex_sv = re_sv;
6456                 SET_reg_curpm(rex_sv);
6457                 rex = re;
6458                 rexi = rei;
6459                 cur_curlyx = NULL;
6460                 ST.B = next;
6461                 ST.prev_eval = cur_eval;
6462                 cur_eval = st;
6463                 /* now continue from first node in postoned RE */
6464                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
6465                 /* NOTREACHED */
6466                 NOT_REACHED; /* NOTREACHED */
6467         }
6468
6469         case EVAL_AB: /* cleanup after a successful (??{A})B */
6470             /* note: this is called twice; first after popping B, then A */
6471             rex_sv = ST.prev_rex;
6472             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6473             SET_reg_curpm(rex_sv);
6474             rex = ReANY(rex_sv);
6475             rexi = RXi_GET(rex);
6476             {
6477                 /* preserve $^R across LEAVE's. See Bug 121070. */
6478                 SV *save_sv= GvSV(PL_replgv);
6479                 SvREFCNT_inc(save_sv);
6480                 regcpblow(ST.cp); /* LEAVE in disguise */
6481                 sv_setsv(GvSV(PL_replgv), save_sv);
6482                 SvREFCNT_dec(save_sv);
6483             }
6484             cur_eval = ST.prev_eval;
6485             cur_curlyx = ST.prev_curlyx;
6486
6487             /* Invalidate cache. See "invalidate" comment above. */
6488             reginfo->poscache_maxiter = 0;
6489             if ( nochange_depth )
6490                 nochange_depth--;
6491             sayYES;
6492
6493
6494         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
6495             /* note: this is called twice; first after popping B, then A */
6496             rex_sv = ST.prev_rex;
6497             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6498             SET_reg_curpm(rex_sv);
6499             rex = ReANY(rex_sv);
6500             rexi = RXi_GET(rex);
6501
6502             REGCP_UNWIND(ST.lastcp);
6503             regcppop(rex, &maxopenparen);
6504             cur_eval = ST.prev_eval;
6505             cur_curlyx = ST.prev_curlyx;
6506             /* Invalidate cache. See "invalidate" comment above. */
6507             reginfo->poscache_maxiter = 0;
6508             if ( nochange_depth )
6509                 nochange_depth--;
6510             sayNO_SILENT;
6511 #undef ST
6512
6513         case OPEN: /*  (  */
6514             n = ARG(scan);  /* which paren pair */
6515             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
6516             if (n > maxopenparen)
6517                 maxopenparen = n;
6518             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
6519                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
6520                 PTR2UV(rex),
6521                 PTR2UV(rex->offs),
6522                 (UV)n,
6523                 (IV)rex->offs[n].start_tmp,
6524                 (UV)maxopenparen
6525             ));
6526             lastopen = n;
6527             break;
6528
6529 /* XXX really need to log other places start/end are set too */
6530 #define CLOSE_CAPTURE \
6531     rex->offs[n].start = rex->offs[n].start_tmp; \
6532     rex->offs[n].end = locinput - reginfo->strbeg; \
6533     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
6534         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
6535         PTR2UV(rex), \
6536         PTR2UV(rex->offs), \
6537         (UV)n, \
6538         (IV)rex->offs[n].start, \
6539         (IV)rex->offs[n].end \
6540     ))
6541
6542         case CLOSE:  /*  )  */
6543             n = ARG(scan);  /* which paren pair */
6544             CLOSE_CAPTURE;
6545             if (n > rex->lastparen)
6546                 rex->lastparen = n;
6547             rex->lastcloseparen = n;
6548             if (cur_eval && cur_eval->u.eval.close_paren == n) {
6549                 goto fake_end;
6550             }
6551             break;
6552
6553         case ACCEPT:  /*  (*ACCEPT)  */
6554             if (scan->flags)
6555                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6556             if (ARG2L(scan)){
6557                 regnode *cursor;
6558                 for (cursor=scan;
6559                      cursor && OP(cursor)!=END;
6560                      cursor=regnext(cursor))
6561                 {
6562                     if ( OP(cursor)==CLOSE ){
6563                         n = ARG(cursor);
6564                         if ( n <= lastopen ) {
6565                             CLOSE_CAPTURE;
6566                             if (n > rex->lastparen)
6567                                 rex->lastparen = n;
6568                             rex->lastcloseparen = n;
6569                             if ( n == ARG(scan) || (cur_eval &&
6570                                 cur_eval->u.eval.close_paren == n))
6571                                 break;
6572                         }
6573                     }
6574                 }
6575             }
6576             goto fake_end;
6577             /* NOTREACHED */
6578
6579         case GROUPP:  /*  (?(1))  */
6580             n = ARG(scan);  /* which paren pair */
6581             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
6582             break;
6583
6584         case NGROUPP:  /*  (?(<name>))  */
6585             /* reg_check_named_buff_matched returns 0 for no match */
6586             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
6587             break;
6588
6589         case INSUBP:   /*  (?(R))  */
6590             n = ARG(scan);
6591             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
6592             break;
6593
6594         case DEFINEP:  /*  (?(DEFINE))  */
6595             sw = 0;
6596             break;
6597
6598         case IFTHEN:   /*  (?(cond)A|B)  */
6599             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6600             if (sw)
6601                 next = NEXTOPER(NEXTOPER(scan));
6602             else {
6603                 next = scan + ARG(scan);
6604                 if (OP(next) == IFTHEN) /* Fake one. */
6605                     next = NEXTOPER(NEXTOPER(next));
6606             }
6607             break;
6608
6609         case LOGICAL:  /* modifier for EVAL and IFMATCH */
6610             logical = scan->flags;
6611             break;
6612
6613 /*******************************************************************
6614
6615 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
6616 pattern, where A and B are subpatterns. (For simple A, CURLYM or
6617 STAR/PLUS/CURLY/CURLYN are used instead.)
6618
6619 A*B is compiled as <CURLYX><A><WHILEM><B>
6620
6621 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
6622 state, which contains the current count, initialised to -1. It also sets
6623 cur_curlyx to point to this state, with any previous value saved in the
6624 state block.
6625
6626 CURLYX then jumps straight to the WHILEM op, rather than executing A,
6627 since the pattern may possibly match zero times (i.e. it's a while {} loop
6628 rather than a do {} while loop).
6629
6630 Each entry to WHILEM represents a successful match of A. The count in the
6631 CURLYX block is incremented, another WHILEM state is pushed, and execution
6632 passes to A or B depending on greediness and the current count.
6633
6634 For example, if matching against the string a1a2a3b (where the aN are
6635 substrings that match /A/), then the match progresses as follows: (the
6636 pushed states are interspersed with the bits of strings matched so far):
6637
6638     <CURLYX cnt=-1>
6639     <CURLYX cnt=0><WHILEM>
6640     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
6641     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
6642     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
6643     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
6644
6645 (Contrast this with something like CURLYM, which maintains only a single
6646 backtrack state:
6647
6648     <CURLYM cnt=0> a1
6649     a1 <CURLYM cnt=1> a2
6650     a1 a2 <CURLYM cnt=2> a3
6651     a1 a2 a3 <CURLYM cnt=3> b
6652 )
6653
6654 Each WHILEM state block marks a point to backtrack to upon partial failure
6655 of A or B, and also contains some minor state data related to that
6656 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
6657 overall state, such as the count, and pointers to the A and B ops.
6658
6659 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
6660 must always point to the *current* CURLYX block, the rules are:
6661
6662 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
6663 and set cur_curlyx to point the new block.
6664
6665 When popping the CURLYX block after a successful or unsuccessful match,
6666 restore the previous cur_curlyx.
6667
6668 When WHILEM is about to execute B, save the current cur_curlyx, and set it
6669 to the outer one saved in the CURLYX block.
6670
6671 When popping the WHILEM block after a successful or unsuccessful B match,
6672 restore the previous cur_curlyx.
6673
6674 Here's an example for the pattern (AI* BI)*BO
6675 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
6676
6677 cur_
6678 curlyx backtrack stack
6679 ------ ---------------
6680 NULL
6681 CO     <CO prev=NULL> <WO>
6682 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6683 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6684 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
6685
6686 At this point the pattern succeeds, and we work back down the stack to
6687 clean up, restoring as we go:
6688
6689 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6690 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6691 CO     <CO prev=NULL> <WO>
6692 NULL
6693
6694 *******************************************************************/
6695
6696 #define ST st->u.curlyx
6697
6698         case CURLYX:    /* start of /A*B/  (for complex A) */
6699         {
6700             /* No need to save/restore up to this paren */
6701             I32 parenfloor = scan->flags;
6702
6703             assert(next); /* keep Coverity happy */
6704             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
6705                 next += ARG(next);
6706
6707             /* XXXX Probably it is better to teach regpush to support
6708                parenfloor > maxopenparen ... */
6709             if (parenfloor > (I32)rex->lastparen)
6710                 parenfloor = rex->lastparen; /* Pessimization... */
6711
6712             ST.prev_curlyx= cur_curlyx;
6713             cur_curlyx = st;
6714             ST.cp = PL_savestack_ix;
6715
6716             /* these fields contain the state of the current curly.
6717              * they are accessed by subsequent WHILEMs */
6718             ST.parenfloor = parenfloor;
6719             ST.me = scan;
6720             ST.B = next;
6721             ST.minmod = minmod;
6722             minmod = 0;
6723             ST.count = -1;      /* this will be updated by WHILEM */
6724             ST.lastloc = NULL;  /* this will be updated by WHILEM */
6725
6726             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
6727             /* NOTREACHED */
6728             NOT_REACHED; /* NOTREACHED */
6729         }
6730
6731         case CURLYX_end: /* just finished matching all of A*B */
6732             cur_curlyx = ST.prev_curlyx;
6733             sayYES;
6734             /* NOTREACHED */
6735             NOT_REACHED; /* NOTREACHED */
6736
6737         case CURLYX_end_fail: /* just failed to match all of A*B */
6738             regcpblow(ST.cp);
6739             cur_curlyx = ST.prev_curlyx;
6740             sayNO;
6741             /* NOTREACHED */
6742             NOT_REACHED; /* NOTREACHED */
6743
6744
6745 #undef ST
6746 #define ST st->u.whilem
6747
6748         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
6749         {
6750             /* see the discussion above about CURLYX/WHILEM */
6751             I32 n;
6752             int min, max;
6753             regnode *A;
6754
6755             assert(cur_curlyx); /* keep Coverity happy */
6756
6757             min = ARG1(cur_curlyx->u.curlyx.me);
6758             max = ARG2(cur_curlyx->u.curlyx.me);
6759             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
6760             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
6761             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
6762             ST.cache_offset = 0;
6763             ST.cache_mask = 0;
6764
6765
6766             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6767                   "%*s  whilem: matched %ld out of %d..%d\n",
6768                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
6769             );
6770
6771             /* First just match a string of min A's. */
6772
6773             if (n < min) {
6774                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6775                                     maxopenparen);
6776                 cur_curlyx->u.curlyx.lastloc = locinput;
6777                 REGCP_SET(ST.lastcp);
6778
6779                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
6780                 /* NOTREACHED */
6781                 NOT_REACHED; /* NOTREACHED */
6782             }
6783
6784             /* If degenerate A matches "", assume A done. */
6785
6786             if (locinput == cur_curlyx->u.curlyx.lastloc) {
6787                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6788                    "%*s  whilem: empty match detected, trying continuation...\n",
6789                    REPORT_CODE_OFF+depth*2, "")
6790                 );
6791                 goto do_whilem_B_max;
6792             }
6793
6794             /* super-linear cache processing.
6795              *
6796              * The idea here is that for certain types of CURLYX/WHILEM -
6797              * principally those whose upper bound is infinity (and
6798              * excluding regexes that have things like \1 and other very
6799              * non-regular expresssiony things), then if a pattern like
6800              * /....A*.../ fails and we backtrack to the WHILEM, then we
6801              * make a note that this particular WHILEM op was at string
6802              * position 47 (say) when the rest of pattern failed. Then, if
6803              * we ever find ourselves back at that WHILEM, and at string
6804              * position 47 again, we can just fail immediately rather than
6805              * running the rest of the pattern again.
6806              *
6807              * This is very handy when patterns start to go
6808              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
6809              * with a combinatorial explosion of backtracking.
6810              *
6811              * The cache is implemented as a bit array, with one bit per
6812              * string byte position per WHILEM op (up to 16) - so its
6813              * between 0.25 and 2x the string size.
6814              *
6815              * To avoid allocating a poscache buffer every time, we do an
6816              * initially countdown; only after we have  executed a WHILEM
6817              * op (string-length x #WHILEMs) times do we allocate the
6818              * cache.
6819              *
6820              * The top 4 bits of scan->flags byte say how many different
6821              * relevant CURLLYX/WHILEM op pairs there are, while the
6822              * bottom 4-bits is the identifying index number of this
6823              * WHILEM.
6824              */
6825
6826             if (scan->flags) {
6827
6828                 if (!reginfo->poscache_maxiter) {
6829                     /* start the countdown: Postpone detection until we
6830                      * know the match is not *that* much linear. */
6831                     reginfo->poscache_maxiter
6832                         =    (reginfo->strend - reginfo->strbeg + 1)
6833                            * (scan->flags>>4);
6834                     /* possible overflow for long strings and many CURLYX's */
6835                     if (reginfo->poscache_maxiter < 0)
6836                         reginfo->poscache_maxiter = I32_MAX;
6837                     reginfo->poscache_iter = reginfo->poscache_maxiter;
6838                 }
6839
6840                 if (reginfo->poscache_iter-- == 0) {
6841                     /* initialise cache */
6842                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
6843                     regmatch_info_aux *const aux = reginfo->info_aux;
6844                     if (aux->poscache) {
6845                         if ((SSize_t)reginfo->poscache_size < size) {
6846                             Renew(aux->poscache, size, char);
6847                             reginfo->poscache_size = size;
6848                         }
6849                         Zero(aux->poscache, size, char);
6850                     }
6851                     else {
6852                         reginfo->poscache_size = size;
6853                         Newxz(aux->poscache, size, char);
6854                     }
6855                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6856       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6857                               PL_colors[4], PL_colors[5])
6858                     );
6859                 }
6860
6861                 if (reginfo->poscache_iter < 0) {
6862                     /* have we already failed at this position? */
6863                     SSize_t offset, mask;
6864
6865                     reginfo->poscache_iter = -1; /* stop eventual underflow */
6866                     offset  = (scan->flags & 0xf) - 1
6867                                 +   (locinput - reginfo->strbeg)
6868                                   * (scan->flags>>4);
6869                     mask    = 1 << (offset % 8);
6870                     offset /= 8;
6871                     if (reginfo->info_aux->poscache[offset] & mask) {
6872                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6873                             "%*s  whilem: (cache) already tried at this position...\n",
6874                             REPORT_CODE_OFF+depth*2, "")
6875                         );
6876                         sayNO; /* cache records failure */
6877                     }
6878                     ST.cache_offset = offset;
6879                     ST.cache_mask   = mask;
6880                 }
6881             }
6882
6883             /* Prefer B over A for minimal matching. */
6884
6885             if (cur_curlyx->u.curlyx.minmod) {
6886                 ST.save_curlyx = cur_curlyx;
6887                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6888                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6889                             maxopenparen);
6890                 REGCP_SET(ST.lastcp);
6891                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6892                                     locinput);
6893                 /* NOTREACHED */
6894                 NOT_REACHED; /* NOTREACHED */
6895             }
6896
6897             /* Prefer A over B for maximal matching. */
6898
6899             if (n < max) { /* More greed allowed? */
6900                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6901                             maxopenparen);
6902                 cur_curlyx->u.curlyx.lastloc = locinput;
6903                 REGCP_SET(ST.lastcp);
6904                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
6905                 /* NOTREACHED */
6906                 NOT_REACHED; /* NOTREACHED */
6907             }
6908             goto do_whilem_B_max;
6909         }
6910         /* NOTREACHED */
6911         NOT_REACHED; /* NOTREACHED */
6912
6913         case WHILEM_B_min: /* just matched B in a minimal match */
6914         case WHILEM_B_max: /* just matched B in a maximal match */
6915             cur_curlyx = ST.save_curlyx;
6916             sayYES;
6917             /* NOTREACHED */
6918             NOT_REACHED; /* NOTREACHED */
6919
6920         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6921             cur_curlyx = ST.save_curlyx;
6922             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6923             cur_curlyx->u.curlyx.count--;
6924             CACHEsayNO;
6925             /* NOTREACHED */
6926             NOT_REACHED; /* NOTREACHED */
6927
6928         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
6929             /* FALLTHROUGH */
6930         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
6931             REGCP_UNWIND(ST.lastcp);
6932             regcppop(rex, &maxopenparen);
6933             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6934             cur_curlyx->u.curlyx.count--;
6935             CACHEsayNO;
6936             /* NOTREACHED */
6937             NOT_REACHED; /* NOTREACHED */
6938
6939         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6940             REGCP_UNWIND(ST.lastcp);
6941             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6942             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6943                 "%*s  whilem: failed, trying continuation...\n",
6944                 REPORT_CODE_OFF+depth*2, "")
6945             );
6946           do_whilem_B_max:
6947             if (cur_curlyx->u.curlyx.count >= REG_INFTY
6948                 && ckWARN(WARN_REGEXP)
6949                 && !reginfo->warned)
6950             {
6951                 reginfo->warned = TRUE;
6952                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6953                      "Complex regular subexpression recursion limit (%d) "
6954                      "exceeded",
6955                      REG_INFTY - 1);
6956             }
6957
6958             /* now try B */
6959             ST.save_curlyx = cur_curlyx;
6960             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6961             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6962                                 locinput);
6963             /* NOTREACHED */
6964             NOT_REACHED; /* NOTREACHED */
6965
6966         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6967             cur_curlyx = ST.save_curlyx;
6968             REGCP_UNWIND(ST.lastcp);
6969             regcppop(rex, &maxopenparen);
6970
6971             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6972                 /* Maximum greed exceeded */
6973                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6974                     && ckWARN(WARN_REGEXP)
6975                     && !reginfo->warned)
6976                 {
6977                     reginfo->warned     = TRUE;
6978                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6979                         "Complex regular subexpression recursion "
6980                         "limit (%d) exceeded",
6981                         REG_INFTY - 1);
6982                 }
6983                 cur_curlyx->u.curlyx.count--;
6984                 CACHEsayNO;
6985             }
6986
6987             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6988                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6989             );
6990             /* Try grabbing another A and see if it helps. */
6991             cur_curlyx->u.curlyx.lastloc = locinput;
6992             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6993                             maxopenparen);
6994             REGCP_SET(ST.lastcp);
6995             PUSH_STATE_GOTO(WHILEM_A_min,
6996                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6997                 locinput);
6998             /* NOTREACHED */
6999             NOT_REACHED; /* NOTREACHED */
7000
7001 #undef  ST
7002 #define ST st->u.branch
7003
7004         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
7005             next = scan + ARG(scan);
7006             if (next == scan)
7007                 next = NULL;
7008             scan = NEXTOPER(scan);
7009             /* FALLTHROUGH */
7010
7011         case BRANCH:        /*  /(...|A|...)/ */
7012             scan = NEXTOPER(scan); /* scan now points to inner node */
7013             ST.lastparen = rex->lastparen;
7014             ST.lastcloseparen = rex->lastcloseparen;
7015             ST.next_branch = next;
7016             REGCP_SET(ST.cp);
7017
7018             /* Now go into the branch */
7019             if (has_cutgroup) {
7020                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7021             } else {
7022                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7023             }
7024             /* NOTREACHED */
7025             NOT_REACHED; /* NOTREACHED */
7026
7027         case CUTGROUP:  /*  /(*THEN)/  */
7028             sv_yes_mark = st->u.mark.mark_name = scan->flags
7029                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7030                 : NULL;
7031             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7032             /* NOTREACHED */
7033             NOT_REACHED; /* NOTREACHED */
7034
7035         case CUTGROUP_next_fail:
7036             do_cutgroup = 1;
7037             no_final = 1;
7038             if (st->u.mark.mark_name)
7039                 sv_commit = st->u.mark.mark_name;
7040             sayNO;
7041             /* NOTREACHED */
7042             NOT_REACHED; /* NOTREACHED */
7043
7044         case BRANCH_next:
7045             sayYES;
7046             /* NOTREACHED */
7047             NOT_REACHED; /* NOTREACHED */
7048
7049         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7050             if (do_cutgroup) {
7051                 do_cutgroup = 0;
7052                 no_final = 0;
7053             }
7054             REGCP_UNWIND(ST.cp);
7055             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7056             scan = ST.next_branch;
7057             /* no more branches? */
7058             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7059                 DEBUG_EXECUTE_r({
7060                     PerlIO_printf( Perl_debug_log,
7061                         "%*s  %sBRANCH failed...%s\n",
7062                         REPORT_CODE_OFF+depth*2, "",
7063                         PL_colors[4],
7064                         PL_colors[5] );
7065                 });
7066                 sayNO_SILENT;
7067             }
7068             continue; /* execute next BRANCH[J] op */
7069             /* NOTREACHED */
7070
7071         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7072             minmod = 1;
7073             break;
7074
7075 #undef  ST
7076 #define ST st->u.curlym
7077
7078         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7079
7080             /* This is an optimisation of CURLYX that enables us to push
7081              * only a single backtracking state, no matter how many matches
7082              * there are in {m,n}. It relies on the pattern being constant
7083              * length, with no parens to influence future backrefs
7084              */
7085
7086             ST.me = scan;
7087             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7088
7089             ST.lastparen      = rex->lastparen;
7090             ST.lastcloseparen = rex->lastcloseparen;
7091
7092             /* if paren positive, emulate an OPEN/CLOSE around A */
7093             if (ST.me->flags) {
7094                 U32 paren = ST.me->flags;
7095                 if (paren > maxopenparen)
7096                     maxopenparen = paren;
7097                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7098             }
7099             ST.A = scan;
7100             ST.B = next;
7101             ST.alen = 0;
7102             ST.count = 0;
7103             ST.minmod = minmod;
7104             minmod = 0;
7105             ST.c1 = CHRTEST_UNINIT;
7106             REGCP_SET(ST.cp);
7107
7108             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7109                 goto curlym_do_B;
7110
7111           curlym_do_A: /* execute the A in /A{m,n}B/  */
7112             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7113             /* NOTREACHED */
7114             NOT_REACHED; /* NOTREACHED */
7115
7116         case CURLYM_A: /* we've just matched an A */
7117             ST.count++;
7118             /* after first match, determine A's length: u.curlym.alen */
7119             if (ST.count == 1) {
7120                 if (reginfo->is_utf8_target) {
7121                     char *s = st->locinput;
7122                     while (s < locinput) {
7123                         ST.alen++;
7124                         s += UTF8SKIP(s);
7125                     }
7126                 }
7127                 else {
7128                     ST.alen = locinput - st->locinput;
7129                 }
7130                 if (ST.alen == 0)
7131                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7132             }
7133             DEBUG_EXECUTE_r(
7134                 PerlIO_printf(Perl_debug_log,
7135                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7136                           (int)(REPORT_CODE_OFF+(depth*2)), "",
7137                           (IV) ST.count, (IV)ST.alen)
7138             );
7139
7140             if (cur_eval && cur_eval->u.eval.close_paren &&
7141                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7142                 goto fake_end;
7143
7144             {
7145                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7146                 if ( max == REG_INFTY || ST.count < max )
7147                     goto curlym_do_A; /* try to match another A */
7148             }
7149             goto curlym_do_B; /* try to match B */
7150
7151         case CURLYM_A_fail: /* just failed to match an A */
7152             REGCP_UNWIND(ST.cp);
7153
7154             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7155                 || (cur_eval && cur_eval->u.eval.close_paren &&
7156                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
7157                 sayNO;
7158
7159           curlym_do_B: /* execute the B in /A{m,n}B/  */
7160             if (ST.c1 == CHRTEST_UNINIT) {
7161                 /* calculate c1 and c2 for possible match of 1st char
7162                  * following curly */
7163                 ST.c1 = ST.c2 = CHRTEST_VOID;
7164                 assert(ST.B);
7165                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7166                     regnode *text_node = ST.B;
7167                     if (! HAS_TEXT(text_node))
7168                         FIND_NEXT_IMPT(text_node);
7169                     /* this used to be
7170
7171                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7172
7173                         But the former is redundant in light of the latter.
7174
7175                         if this changes back then the macro for
7176                         IS_TEXT and friends need to change.
7177                      */
7178                     if (PL_regkind[OP(text_node)] == EXACT) {
7179                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7180                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7181                            reginfo))
7182                         {
7183                             sayNO;
7184                         }
7185                     }
7186                 }
7187             }
7188
7189             DEBUG_EXECUTE_r(
7190                 PerlIO_printf(Perl_debug_log,
7191                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
7192                     (int)(REPORT_CODE_OFF+(depth*2)),
7193                     "", (IV)ST.count)
7194                 );
7195             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7196                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7197                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7198                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7199                     {
7200                         /* simulate B failing */
7201                         DEBUG_OPTIMISE_r(
7202                             PerlIO_printf(Perl_debug_log,
7203                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7204                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
7205                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7206                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7207                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7208                         );
7209                         state_num = CURLYM_B_fail;
7210                         goto reenter_switch;
7211                     }
7212                 }
7213                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7214                     /* simulate B failing */
7215                     DEBUG_OPTIMISE_r(
7216                         PerlIO_printf(Perl_debug_log,
7217                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7218                             (int)(REPORT_CODE_OFF+(depth*2)),"",
7219                             (int) nextchr, ST.c1, ST.c2)
7220                     );
7221                     state_num = CURLYM_B_fail;
7222                     goto reenter_switch;
7223                 }
7224             }
7225
7226             if (ST.me->flags) {
7227                 /* emulate CLOSE: mark current A as captured */
7228                 I32 paren = ST.me->flags;
7229                 if (ST.count) {
7230                     rex->offs[paren].start
7231                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7232                     rex->offs[paren].end = locinput - reginfo->strbeg;
7233                     if ((U32)paren > rex->lastparen)
7234                         rex->lastparen = paren;
7235                     rex->lastcloseparen = paren;
7236                 }
7237                 else
7238                     rex->offs[paren].end = -1;
7239                 if (cur_eval && cur_eval->u.eval.close_paren &&
7240                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7241                 {
7242                     if (ST.count)
7243                         goto fake_end;
7244                     else
7245                         sayNO;
7246                 }
7247             }
7248
7249             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7250             /* NOTREACHED */
7251             NOT_REACHED; /* NOTREACHED */
7252
7253         case CURLYM_B_fail: /* just failed to match a B */
7254             REGCP_UNWIND(ST.cp);
7255             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7256             if (ST.minmod) {
7257                 I32 max = ARG2(ST.me);
7258                 if (max != REG_INFTY && ST.count == max)
7259                     sayNO;
7260                 goto curlym_do_A; /* try to match a further A */
7261             }
7262             /* backtrack one A */
7263             if (ST.count == ARG1(ST.me) /* min */)
7264                 sayNO;
7265             ST.count--;
7266             SET_locinput(HOPc(locinput, -ST.alen));
7267             goto curlym_do_B; /* try to match B */
7268
7269 #undef ST
7270 #define ST st->u.curly
7271
7272 #define CURLY_SETPAREN(paren, success) \
7273     if (paren) { \
7274         if (success) { \
7275             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7276             rex->offs[paren].end = locinput - reginfo->strbeg; \
7277             if (paren > rex->lastparen) \
7278                 rex->lastparen = paren; \
7279             rex->lastcloseparen = paren; \
7280         } \
7281         else { \
7282             rex->offs[paren].end = -1; \
7283             rex->lastparen      = ST.lastparen; \
7284             rex->lastcloseparen = ST.lastcloseparen; \
7285         } \
7286     }
7287
7288         case STAR:              /*  /A*B/ where A is width 1 char */
7289             ST.paren = 0;
7290             ST.min = 0;
7291             ST.max = REG_INFTY;
7292             scan = NEXTOPER(scan);
7293             goto repeat;
7294
7295         case PLUS:              /*  /A+B/ where A is width 1 char */
7296             ST.paren = 0;
7297             ST.min = 1;
7298             ST.max = REG_INFTY;
7299             scan = NEXTOPER(scan);
7300             goto repeat;
7301
7302         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7303             ST.paren = scan->flags;     /* Which paren to set */
7304             ST.lastparen      = rex->lastparen;
7305             ST.lastcloseparen = rex->lastcloseparen;
7306             if (ST.paren > maxopenparen)
7307                 maxopenparen = ST.paren;
7308             ST.min = ARG1(scan);  /* min to match */
7309             ST.max = ARG2(scan);  /* max to match */
7310             if (cur_eval && cur_eval->u.eval.close_paren &&
7311                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7312                 ST.min=1;
7313                 ST.max=1;
7314             }
7315             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7316             goto repeat;
7317
7318         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7319             ST.paren = 0;
7320             ST.min = ARG1(scan);  /* min to match */
7321             ST.max = ARG2(scan);  /* max to match */
7322             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7323           repeat:
7324             /*
7325             * Lookahead to avoid useless match attempts
7326             * when we know what character comes next.
7327             *
7328             * Used to only do .*x and .*?x, but now it allows
7329             * for )'s, ('s and (?{ ... })'s to be in the way
7330             * of the quantifier and the EXACT-like node.  -- japhy
7331             */
7332
7333             assert(ST.min <= ST.max);
7334             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7335                 ST.c1 = ST.c2 = CHRTEST_VOID;
7336             }
7337             else {
7338                 regnode *text_node = next;
7339
7340                 if (! HAS_TEXT(text_node))
7341                     FIND_NEXT_IMPT(text_node);
7342
7343                 if (! HAS_TEXT(text_node))
7344                     ST.c1 = ST.c2 = CHRTEST_VOID;
7345                 else {
7346                     if ( PL_regkind[OP(text_node)] != EXACT ) {
7347                         ST.c1 = ST.c2 = CHRTEST_VOID;
7348                     }
7349                     else {
7350
7351                     /*  Currently we only get here when
7352
7353                         PL_rekind[OP(text_node)] == EXACT
7354
7355                         if this changes back then the macro for IS_TEXT and
7356                         friends need to change. */
7357                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7358                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7359                            reginfo))
7360                         {
7361                             sayNO;
7362                         }
7363                     }
7364                 }
7365             }
7366
7367             ST.A = scan;
7368             ST.B = next;
7369             if (minmod) {
7370                 char *li = locinput;
7371                 minmod = 0;
7372                 if (ST.min &&
7373                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7374                             < ST.min)
7375                     sayNO;
7376                 SET_locinput(li);
7377                 ST.count = ST.min;
7378                 REGCP_SET(ST.cp);
7379                 if (ST.c1 == CHRTEST_VOID)
7380                     goto curly_try_B_min;
7381
7382                 ST.oldloc = locinput;
7383
7384                 /* set ST.maxpos to the furthest point along the
7385                  * string that could possibly match */
7386                 if  (ST.max == REG_INFTY) {
7387                     ST.maxpos = reginfo->strend - 1;
7388                     if (utf8_target)
7389                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7390                             ST.maxpos--;
7391                 }
7392                 else if (utf8_target) {
7393                     int m = ST.max - ST.min;
7394                     for (ST.maxpos = locinput;
7395                          m >0 && ST.maxpos < reginfo->strend; m--)
7396                         ST.maxpos += UTF8SKIP(ST.maxpos);
7397                 }
7398                 else {
7399                     ST.maxpos = locinput + ST.max - ST.min;
7400                     if (ST.maxpos >= reginfo->strend)
7401                         ST.maxpos = reginfo->strend - 1;
7402                 }
7403                 goto curly_try_B_min_known;
7404
7405             }
7406             else {
7407                 /* avoid taking address of locinput, so it can remain
7408                  * a register var */
7409                 char *li = locinput;
7410                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7411                 if (ST.count < ST.min)
7412                     sayNO;
7413                 SET_locinput(li);
7414                 if ((ST.count > ST.min)
7415                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7416                 {
7417                     /* A{m,n} must come at the end of the string, there's
7418                      * no point in backing off ... */
7419                     ST.min = ST.count;
7420                     /* ...except that $ and \Z can match before *and* after
7421                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
7422                        We may back off by one in this case. */
7423                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
7424                         ST.min--;
7425                 }
7426                 REGCP_SET(ST.cp);
7427                 goto curly_try_B_max;
7428             }
7429             /* NOTREACHED */
7430             NOT_REACHED; /* NOTREACHED */
7431
7432         case CURLY_B_min_known_fail:
7433             /* failed to find B in a non-greedy match where c1,c2 valid */
7434
7435             REGCP_UNWIND(ST.cp);
7436             if (ST.paren) {
7437                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7438             }
7439             /* Couldn't or didn't -- move forward. */
7440             ST.oldloc = locinput;
7441             if (utf8_target)
7442                 locinput += UTF8SKIP(locinput);
7443             else
7444                 locinput++;
7445             ST.count++;
7446           curly_try_B_min_known:
7447              /* find the next place where 'B' could work, then call B */
7448             {
7449                 int n;
7450                 if (utf8_target) {
7451                     n = (ST.oldloc == locinput) ? 0 : 1;
7452                     if (ST.c1 == ST.c2) {
7453                         /* set n to utf8_distance(oldloc, locinput) */
7454                         while (locinput <= ST.maxpos
7455                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
7456                         {
7457                             locinput += UTF8SKIP(locinput);
7458                             n++;
7459                         }
7460                     }
7461                     else {
7462                         /* set n to utf8_distance(oldloc, locinput) */
7463                         while (locinput <= ST.maxpos
7464                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7465                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7466                         {
7467                             locinput += UTF8SKIP(locinput);
7468                             n++;
7469                         }
7470                     }
7471                 }
7472                 else {  /* Not utf8_target */
7473                     if (ST.c1 == ST.c2) {
7474                         while (locinput <= ST.maxpos &&
7475                                UCHARAT(locinput) != ST.c1)
7476                             locinput++;
7477                     }
7478                     else {
7479                         while (locinput <= ST.maxpos
7480                                && UCHARAT(locinput) != ST.c1
7481                                && UCHARAT(locinput) != ST.c2)
7482                             locinput++;
7483                     }
7484                     n = locinput - ST.oldloc;
7485                 }
7486                 if (locinput > ST.maxpos)
7487                     sayNO;
7488                 if (n) {
7489                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
7490                      * at b; check that everything between oldloc and
7491                      * locinput matches */
7492                     char *li = ST.oldloc;
7493                     ST.count += n;
7494                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
7495                         sayNO;
7496                     assert(n == REG_INFTY || locinput == li);
7497                 }
7498                 CURLY_SETPAREN(ST.paren, ST.count);
7499                 if (cur_eval && cur_eval->u.eval.close_paren &&
7500                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
7501                     goto fake_end;
7502                 }
7503                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
7504             }
7505             /* NOTREACHED */
7506             NOT_REACHED; /* NOTREACHED */
7507
7508         case CURLY_B_min_fail:
7509             /* failed to find B in a non-greedy match where c1,c2 invalid */
7510
7511             REGCP_UNWIND(ST.cp);
7512             if (ST.paren) {
7513                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7514             }
7515             /* failed -- move forward one */
7516             {
7517                 char *li = locinput;
7518                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
7519                     sayNO;
7520                 }
7521                 locinput = li;
7522             }
7523             {
7524                 ST.count++;
7525                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
7526                         ST.count > 0)) /* count overflow ? */
7527                 {
7528                   curly_try_B_min:
7529                     CURLY_SETPAREN(ST.paren, ST.count);
7530                     if (cur_eval && cur_eval->u.eval.close_paren &&
7531                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
7532                         goto fake_end;
7533                     }
7534                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
7535                 }
7536             }
7537             sayNO;
7538             /* NOTREACHED */
7539             NOT_REACHED; /* NOTREACHED */
7540
7541           curly_try_B_max:
7542             /* a successful greedy match: now try to match B */
7543             if (cur_eval && cur_eval->u.eval.close_paren &&
7544                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7545                 goto fake_end;
7546             }
7547             {
7548                 bool could_match = locinput < reginfo->strend;
7549
7550                 /* If it could work, try it. */
7551                 if (ST.c1 != CHRTEST_VOID && could_match) {
7552                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
7553                     {
7554                         could_match = memEQ(locinput,
7555                                             ST.c1_utf8,
7556                                             UTF8SKIP(locinput))
7557                                     || memEQ(locinput,
7558                                              ST.c2_utf8,
7559                                              UTF8SKIP(locinput));
7560                     }
7561                     else {
7562                         could_match = UCHARAT(locinput) == ST.c1
7563                                       || UCHARAT(locinput) == ST.c2;
7564                     }
7565                 }
7566                 if (ST.c1 == CHRTEST_VOID || could_match) {
7567                     CURLY_SETPAREN(ST.paren, ST.count);
7568                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
7569                     /* NOTREACHED */
7570                     NOT_REACHED; /* NOTREACHED */
7571                 }
7572             }
7573             /* FALLTHROUGH */
7574
7575         case CURLY_B_max_fail:
7576             /* failed to find B in a greedy match */
7577
7578             REGCP_UNWIND(ST.cp);
7579             if (ST.paren) {
7580                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7581             }
7582             /*  back up. */
7583             if (--ST.count < ST.min)
7584                 sayNO;
7585             locinput = HOPc(locinput, -1);
7586             goto curly_try_B_max;
7587
7588 #undef ST
7589
7590         case END: /*  last op of main pattern  */
7591           fake_end:
7592             if (cur_eval) {
7593                 /* we've just finished A in /(??{A})B/; now continue with B */
7594
7595                 st->u.eval.prev_rex = rex_sv;           /* inner */
7596
7597                 /* Save *all* the positions. */
7598                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
7599                 rex_sv = cur_eval->u.eval.prev_rex;
7600                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7601                 SET_reg_curpm(rex_sv);
7602                 rex = ReANY(rex_sv);
7603                 rexi = RXi_GET(rex);
7604                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
7605
7606                 REGCP_SET(st->u.eval.lastcp);
7607
7608                 /* Restore parens of the outer rex without popping the
7609                  * savestack */
7610                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
7611                                         &maxopenparen);
7612
7613                 st->u.eval.prev_eval = cur_eval;
7614                 cur_eval = cur_eval->u.eval.prev_eval;
7615                 DEBUG_EXECUTE_r(
7616                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
7617                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
7618                 if ( nochange_depth )
7619                     nochange_depth--;
7620
7621                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
7622                                     locinput); /* match B */
7623             }
7624
7625             if (locinput < reginfo->till) {
7626                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7627                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7628                                       PL_colors[4],
7629                                       (long)(locinput - startpos),
7630                                       (long)(reginfo->till - startpos),
7631                                       PL_colors[5]));
7632
7633                 sayNO_SILENT;           /* Cannot match: too short. */
7634             }
7635             sayYES;                     /* Success! */
7636
7637         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
7638             DEBUG_EXECUTE_r(
7639             PerlIO_printf(Perl_debug_log,
7640                 "%*s  %ssubpattern success...%s\n",
7641                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
7642             sayYES;                     /* Success! */
7643
7644 #undef  ST
7645 #define ST st->u.ifmatch
7646
7647         {
7648             char *newstart;
7649
7650         case SUSPEND:   /* (?>A) */
7651             ST.wanted = 1;
7652             newstart = locinput;
7653             goto do_ifmatch;
7654
7655         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
7656             ST.wanted = 0;
7657             goto ifmatch_trivial_fail_test;
7658
7659         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
7660             ST.wanted = 1;
7661           ifmatch_trivial_fail_test:
7662             if (scan->flags) {
7663                 char * const s = HOPBACKc(locinput, scan->flags);
7664                 if (!s) {
7665                     /* trivial fail */
7666                     if (logical) {
7667                         logical = 0;
7668                         sw = 1 - cBOOL(ST.wanted);
7669                     }
7670                     else if (ST.wanted)
7671                         sayNO;
7672                     next = scan + ARG(scan);
7673                     if (next == scan)
7674                         next = NULL;
7675                     break;
7676                 }
7677                 newstart = s;
7678             }
7679             else
7680                 newstart = locinput;
7681
7682           do_ifmatch:
7683             ST.me = scan;
7684             ST.logical = logical;
7685             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
7686
7687             /* execute body of (?...A) */
7688             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
7689             /* NOTREACHED */
7690             NOT_REACHED; /* NOTREACHED */
7691         }
7692
7693         case IFMATCH_A_fail: /* body of (?...A) failed */
7694             ST.wanted = !ST.wanted;
7695             /* FALLTHROUGH */
7696
7697         case IFMATCH_A: /* body of (?...A) succeeded */
7698             if (ST.logical) {
7699                 sw = cBOOL(ST.wanted);
7700             }
7701             else if (!ST.wanted)
7702                 sayNO;
7703
7704             if (OP(ST.me) != SUSPEND) {
7705                 /* restore old position except for (?>...) */
7706                 locinput = st->locinput;
7707             }
7708             scan = ST.me + ARG(ST.me);
7709             if (scan == ST.me)
7710                 scan = NULL;
7711             continue; /* execute B */
7712
7713 #undef ST
7714
7715         case LONGJMP: /*  alternative with many branches compiles to
7716                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
7717             next = scan + ARG(scan);
7718             if (next == scan)
7719                 next = NULL;
7720             break;
7721
7722         case COMMIT:  /*  (*COMMIT)  */
7723             reginfo->cutpoint = reginfo->strend;
7724             /* FALLTHROUGH */
7725
7726         case PRUNE:   /*  (*PRUNE)   */
7727             if (scan->flags)
7728                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7729             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
7730             /* NOTREACHED */
7731             NOT_REACHED; /* NOTREACHED */
7732
7733         case COMMIT_next_fail:
7734             no_final = 1;
7735             /* FALLTHROUGH */
7736             sayNO;
7737             NOT_REACHED; /* NOTREACHED */
7738
7739         case OPFAIL:   /* (*FAIL)  */
7740             if (scan->flags)
7741                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7742             if (logical) {
7743                 /* deal with (?(?!)X|Y) properly,
7744                  * make sure we trigger the no branch
7745                  * of the trailing IFTHEN structure*/
7746                 sw= 0;
7747                 break;
7748             } else {
7749                 sayNO;
7750             }
7751             /* NOTREACHED */
7752             NOT_REACHED; /* NOTREACHED */
7753
7754 #define ST st->u.mark
7755         case MARKPOINT: /*  (*MARK:foo)  */
7756             ST.prev_mark = mark_state;
7757             ST.mark_name = sv_commit = sv_yes_mark
7758                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7759             mark_state = st;
7760             ST.mark_loc = locinput;
7761             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
7762             /* NOTREACHED */
7763             NOT_REACHED; /* NOTREACHED */
7764
7765         case MARKPOINT_next:
7766             mark_state = ST.prev_mark;
7767             sayYES;
7768             /* NOTREACHED */
7769             NOT_REACHED; /* NOTREACHED */
7770
7771         case MARKPOINT_next_fail:
7772             if (popmark && sv_eq(ST.mark_name,popmark))
7773             {
7774                 if (ST.mark_loc > startpoint)
7775                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7776                 popmark = NULL; /* we found our mark */
7777                 sv_commit = ST.mark_name;
7778
7779                 DEBUG_EXECUTE_r({
7780                         PerlIO_printf(Perl_debug_log,
7781                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
7782                             REPORT_CODE_OFF+depth*2, "",
7783                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
7784                 });
7785             }
7786             mark_state = ST.prev_mark;
7787             sv_yes_mark = mark_state ?
7788                 mark_state->u.mark.mark_name : NULL;
7789             sayNO;
7790             /* NOTREACHED */
7791             NOT_REACHED; /* NOTREACHED */
7792
7793         case SKIP:  /*  (*SKIP)  */
7794             if (!scan->flags) {
7795                 /* (*SKIP) : if we fail we cut here*/
7796                 ST.mark_name = NULL;
7797                 ST.mark_loc = locinput;
7798                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
7799             } else {
7800                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
7801                    otherwise do nothing.  Meaning we need to scan
7802                  */
7803                 regmatch_state *cur = mark_state;
7804                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7805
7806                 while (cur) {
7807                     if ( sv_eq( cur->u.mark.mark_name,
7808                                 find ) )
7809                     {
7810                         ST.mark_name = find;
7811                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
7812                     }
7813                     cur = cur->u.mark.prev_mark;
7814                 }
7815             }
7816             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
7817             break;
7818
7819         case SKIP_next_fail:
7820             if (ST.mark_name) {
7821                 /* (*CUT:NAME) - Set up to search for the name as we
7822                    collapse the stack*/
7823                 popmark = ST.mark_name;
7824             } else {
7825                 /* (*CUT) - No name, we cut here.*/
7826                 if (ST.mark_loc > startpoint)
7827                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7828                 /* but we set sv_commit to latest mark_name if there
7829                    is one so they can test to see how things lead to this
7830                    cut */
7831                 if (mark_state)
7832                     sv_commit=mark_state->u.mark.mark_name;
7833             }
7834             no_final = 1;
7835             sayNO;
7836             /* NOTREACHED */
7837             NOT_REACHED; /* NOTREACHED */
7838 #undef ST
7839
7840         case LNBREAK: /* \R */
7841             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
7842                 locinput += n;
7843             } else
7844                 sayNO;
7845             break;
7846
7847         default:
7848             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
7849                           PTR2UV(scan), OP(scan));
7850             Perl_croak(aTHX_ "regexp memory corruption");
7851
7852         /* this is a point to jump to in order to increment
7853          * locinput by one character */
7854           increment_locinput:
7855             assert(!NEXTCHR_IS_EOS);
7856             if (utf8_target) {
7857                 locinput += PL_utf8skip[nextchr];
7858                 /* locinput is allowed to go 1 char off the end, but not 2+ */
7859                 if (locinput > reginfo->strend)
7860                     sayNO;
7861             }
7862             else
7863                 locinput++;
7864             break;
7865
7866         } /* end switch */
7867
7868         /* switch break jumps here */
7869         scan = next; /* prepare to execute the next op and ... */
7870         continue;    /* ... jump back to the top, reusing st */
7871         /* NOTREACHED */
7872
7873       push_yes_state:
7874         /* push a state that backtracks on success */
7875         st->u.yes.prev_yes_state = yes_state;
7876         yes_state = st;
7877         /* FALLTHROUGH */
7878       push_state:
7879         /* push a new regex state, then continue at scan  */
7880         {
7881             regmatch_state *newst;
7882
7883             DEBUG_STACK_r({
7884                 regmatch_state *cur = st;
7885                 regmatch_state *curyes = yes_state;
7886                 int curd = depth;
7887                 regmatch_slab *slab = PL_regmatch_slab;
7888                 for (;curd > -1;cur--,curd--) {
7889                     if (cur < SLAB_FIRST(slab)) {
7890                         slab = slab->prev;
7891                         cur = SLAB_LAST(slab);
7892                     }
7893                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7894                         REPORT_CODE_OFF + 2 + depth * 2,"",
7895                         curd, PL_reg_name[cur->resume_state],
7896                         (curyes == cur) ? "yes" : ""
7897                     );
7898                     if (curyes == cur)
7899                         curyes = cur->u.yes.prev_yes_state;
7900                 }
7901             } else
7902                 DEBUG_STATE_pp("push")
7903             );
7904             depth++;
7905             st->locinput = locinput;
7906             newst = st+1;
7907             if (newst >  SLAB_LAST(PL_regmatch_slab))
7908                 newst = S_push_slab(aTHX);
7909             PL_regmatch_state = newst;
7910
7911             locinput = pushinput;
7912             st = newst;
7913             continue;
7914             /* NOTREACHED */
7915         }
7916     }
7917
7918     /*
7919     * We get here only if there's trouble -- normally "case END" is
7920     * the terminating point.
7921     */
7922     Perl_croak(aTHX_ "corrupted regexp pointers");
7923     /* NOTREACHED */
7924     sayNO;
7925     NOT_REACHED; /* NOTREACHED */
7926
7927   yes:
7928     if (yes_state) {
7929         /* we have successfully completed a subexpression, but we must now
7930          * pop to the state marked by yes_state and continue from there */
7931         assert(st != yes_state);
7932 #ifdef DEBUGGING
7933         while (st != yes_state) {
7934             st--;
7935             if (st < SLAB_FIRST(PL_regmatch_slab)) {
7936                 PL_regmatch_slab = PL_regmatch_slab->prev;
7937                 st = SLAB_LAST(PL_regmatch_slab);
7938             }
7939             DEBUG_STATE_r({
7940                 if (no_final) {
7941                     DEBUG_STATE_pp("pop (no final)");
7942                 } else {
7943                     DEBUG_STATE_pp("pop (yes)");
7944                 }
7945             });
7946             depth--;
7947         }
7948 #else
7949         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7950             || yes_state > SLAB_LAST(PL_regmatch_slab))
7951         {
7952             /* not in this slab, pop slab */
7953             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7954             PL_regmatch_slab = PL_regmatch_slab->prev;
7955             st = SLAB_LAST(PL_regmatch_slab);
7956         }
7957         depth -= (st - yes_state);
7958 #endif
7959         st = yes_state;
7960         yes_state = st->u.yes.prev_yes_state;
7961         PL_regmatch_state = st;
7962
7963         if (no_final)
7964             locinput= st->locinput;
7965         state_num = st->resume_state + no_final;
7966         goto reenter_switch;
7967     }
7968
7969     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7970                           PL_colors[4], PL_colors[5]));
7971
7972     if (reginfo->info_aux_eval) {
7973         /* each successfully executed (?{...}) block does the equivalent of
7974          *   local $^R = do {...}
7975          * When popping the save stack, all these locals would be undone;
7976          * bypass this by setting the outermost saved $^R to the latest
7977          * value */
7978         /* I dont know if this is needed or works properly now.
7979          * see code related to PL_replgv elsewhere in this file.
7980          * Yves
7981          */
7982         if (oreplsv != GvSV(PL_replgv))
7983             sv_setsv(oreplsv, GvSV(PL_replgv));
7984     }
7985     result = 1;
7986     goto final_exit;
7987
7988   no:
7989     DEBUG_EXECUTE_r(
7990         PerlIO_printf(Perl_debug_log,
7991             "%*s  %sfailed...%s\n",
7992             REPORT_CODE_OFF+depth*2, "",
7993             PL_colors[4], PL_colors[5])
7994         );
7995
7996   no_silent:
7997     if (no_final) {
7998         if (yes_state) {
7999             goto yes;
8000         } else {
8001             goto final_exit;
8002         }
8003     }
8004     if (depth) {
8005         /* there's a previous state to backtrack to */
8006         st--;
8007         if (st < SLAB_FIRST(PL_regmatch_slab)) {
8008             PL_regmatch_slab = PL_regmatch_slab->prev;
8009             st = SLAB_LAST(PL_regmatch_slab);
8010         }
8011         PL_regmatch_state = st;
8012         locinput= st->locinput;
8013
8014         DEBUG_STATE_pp("pop");
8015         depth--;
8016         if (yes_state == st)
8017             yes_state = st->u.yes.prev_yes_state;
8018
8019         state_num = st->resume_state + 1; /* failure = success + 1 */
8020         goto reenter_switch;
8021     }
8022     result = 0;
8023
8024   final_exit:
8025     if (rex->intflags & PREGf_VERBARG_SEEN) {
8026         SV *sv_err = get_sv("REGERROR", 1);
8027         SV *sv_mrk = get_sv("REGMARK", 1);
8028         if (result) {
8029             sv_commit = &PL_sv_no;
8030             if (!sv_yes_mark)
8031                 sv_yes_mark = &PL_sv_yes;
8032         } else {
8033             if (!sv_commit)
8034                 sv_commit = &PL_sv_yes;
8035             sv_yes_mark = &PL_sv_no;
8036         }
8037         assert(sv_err);
8038         assert(sv_mrk);
8039         sv_setsv(sv_err, sv_commit);
8040         sv_setsv(sv_mrk, sv_yes_mark);
8041     }
8042
8043
8044     if (last_pushed_cv) {
8045         dSP;
8046         POP_MULTICALL;
8047         PERL_UNUSED_VAR(SP);
8048     }
8049
8050     assert(!result ||  locinput - reginfo->strbeg >= 0);
8051     return result ?  locinput - reginfo->strbeg : -1;
8052 }
8053
8054 /*
8055  - regrepeat - repeatedly match something simple, report how many
8056  *
8057  * What 'simple' means is a node which can be the operand of a quantifier like
8058  * '+', or {1,3}
8059  *
8060  * startposp - pointer a pointer to the start position.  This is updated
8061  *             to point to the byte following the highest successful
8062  *             match.
8063  * p         - the regnode to be repeatedly matched against.
8064  * reginfo   - struct holding match state, such as strend
8065  * max       - maximum number of things to match.
8066  * depth     - (for debugging) backtracking depth.
8067  */
8068 STATIC I32
8069 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8070             regmatch_info *const reginfo, I32 max, int depth)
8071 {
8072     char *scan;     /* Pointer to current position in target string */
8073     I32 c;
8074     char *loceol = reginfo->strend;   /* local version */
8075     I32 hardcount = 0;  /* How many matches so far */
8076     bool utf8_target = reginfo->is_utf8_target;
8077     unsigned int to_complement = 0;  /* Invert the result? */
8078     UV utf8_flags;
8079     _char_class_number classnum;
8080 #ifndef DEBUGGING
8081     PERL_UNUSED_ARG(depth);
8082 #endif
8083
8084     PERL_ARGS_ASSERT_REGREPEAT;
8085
8086     scan = *startposp;
8087     if (max == REG_INFTY)
8088         max = I32_MAX;
8089     else if (! utf8_target && loceol - scan > max)
8090         loceol = scan + max;
8091
8092     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8093      * to the maximum of how far we should go in it (leaving it set to the real
8094      * end, if the maximum permissible would take us beyond that).  This allows
8095      * us to make the loop exit condition that we haven't gone past <loceol> to
8096      * also mean that we haven't exceeded the max permissible count, saving a
8097      * test each time through the loop.  But it assumes that the OP matches a
8098      * single byte, which is true for most of the OPs below when applied to a
8099      * non-UTF-8 target.  Those relatively few OPs that don't have this
8100      * characteristic will have to compensate.
8101      *
8102      * There is no adjustment for UTF-8 targets, as the number of bytes per
8103      * character varies.  OPs will have to test both that the count is less
8104      * than the max permissible (using <hardcount> to keep track), and that we
8105      * are still within the bounds of the string (using <loceol>.  A few OPs
8106      * match a single byte no matter what the encoding.  They can omit the max
8107      * test if, for the UTF-8 case, they do the adjustment that was skipped
8108      * above.
8109      *
8110      * Thus, the code above sets things up for the common case; and exceptional
8111      * cases need extra work; the common case is to make sure <scan> doesn't
8112      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8113      * count doesn't exceed the maximum permissible */
8114
8115     switch (OP(p)) {
8116     case REG_ANY:
8117         if (utf8_target) {
8118             while (scan < loceol && hardcount < max && *scan != '\n') {
8119                 scan += UTF8SKIP(scan);
8120                 hardcount++;
8121             }
8122         } else {
8123             while (scan < loceol && *scan != '\n')
8124                 scan++;
8125         }
8126         break;
8127     case SANY:
8128         if (utf8_target) {
8129             while (scan < loceol && hardcount < max) {
8130                 scan += UTF8SKIP(scan);
8131                 hardcount++;
8132             }
8133         }
8134         else
8135             scan = loceol;
8136         break;
8137     case EXACTL:
8138         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8139         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8140             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8141         }
8142         /* FALLTHROUGH */
8143     case EXACT:
8144         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8145
8146         c = (U8)*STRING(p);
8147
8148         /* Can use a simple loop if the pattern char to match on is invariant
8149          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8150          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8151          * true iff it doesn't matter if the argument is in UTF-8 or not */
8152         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8153             if (utf8_target && loceol - scan > max) {
8154                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8155                  * since here, to match at all, 1 char == 1 byte */
8156                 loceol = scan + max;
8157             }
8158             while (scan < loceol && UCHARAT(scan) == c) {
8159                 scan++;
8160             }
8161         }
8162         else if (reginfo->is_utf8_pat) {
8163             if (utf8_target) {
8164                 STRLEN scan_char_len;
8165
8166                 /* When both target and pattern are UTF-8, we have to do
8167                  * string EQ */
8168                 while (hardcount < max
8169                        && scan < loceol
8170                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8171                        && memEQ(scan, STRING(p), scan_char_len))
8172                 {
8173                     scan += scan_char_len;
8174                     hardcount++;
8175                 }
8176             }
8177             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8178
8179                 /* Target isn't utf8; convert the character in the UTF-8
8180                  * pattern to non-UTF8, and do a simple loop */
8181                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8182                 while (scan < loceol && UCHARAT(scan) == c) {
8183                     scan++;
8184                 }
8185             } /* else pattern char is above Latin1, can't possibly match the
8186                  non-UTF-8 target */
8187         }
8188         else {
8189
8190             /* Here, the string must be utf8; pattern isn't, and <c> is
8191              * different in utf8 than not, so can't compare them directly.
8192              * Outside the loop, find the two utf8 bytes that represent c, and
8193              * then look for those in sequence in the utf8 string */
8194             U8 high = UTF8_TWO_BYTE_HI(c);
8195             U8 low = UTF8_TWO_BYTE_LO(c);
8196
8197             while (hardcount < max
8198                     && scan + 1 < loceol
8199                     && UCHARAT(scan) == high
8200                     && UCHARAT(scan + 1) == low)
8201             {
8202                 scan += 2;
8203                 hardcount++;
8204             }
8205         }
8206         break;
8207
8208     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8209         assert(! reginfo->is_utf8_pat);
8210         /* FALLTHROUGH */
8211     case EXACTFA:
8212         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8213         goto do_exactf;
8214
8215     case EXACTFL:
8216         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8217         utf8_flags = FOLDEQ_LOCALE;
8218         goto do_exactf;
8219
8220     case EXACTF:   /* This node only generated for non-utf8 patterns */
8221         assert(! reginfo->is_utf8_pat);
8222         utf8_flags = 0;
8223         goto do_exactf;
8224
8225     case EXACTFLU8:
8226         if (! utf8_target) {
8227             break;
8228         }
8229         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8230                                     | FOLDEQ_S2_FOLDS_SANE;
8231         goto do_exactf;
8232
8233     case EXACTFU_SS:
8234     case EXACTFU:
8235         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8236
8237       do_exactf: {
8238         int c1, c2;
8239         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8240
8241         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8242
8243         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8244                                         reginfo))
8245         {
8246             if (c1 == CHRTEST_VOID) {
8247                 /* Use full Unicode fold matching */
8248                 char *tmpeol = reginfo->strend;
8249                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8250                 while (hardcount < max
8251                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8252                                              STRING(p), NULL, pat_len,
8253                                              reginfo->is_utf8_pat, utf8_flags))
8254                 {
8255                     scan = tmpeol;
8256                     tmpeol = reginfo->strend;
8257                     hardcount++;
8258                 }
8259             }
8260             else if (utf8_target) {
8261                 if (c1 == c2) {
8262                     while (scan < loceol
8263                            && hardcount < max
8264                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8265                     {
8266                         scan += UTF8SKIP(scan);
8267                         hardcount++;
8268                     }
8269                 }
8270                 else {
8271                     while (scan < loceol
8272                            && hardcount < max
8273                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8274                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8275                     {
8276                         scan += UTF8SKIP(scan);
8277                         hardcount++;
8278                     }
8279                 }
8280             }
8281             else if (c1 == c2) {
8282                 while (scan < loceol && UCHARAT(scan) == c1) {
8283                     scan++;
8284                 }
8285             }
8286             else {
8287                 while (scan < loceol &&
8288                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8289                 {
8290                     scan++;
8291                 }
8292             }
8293         }
8294         break;
8295     }
8296     case ANYOFL:
8297         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8298
8299         if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
8300             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8301         }
8302         /* FALLTHROUGH */
8303     case ANYOFD:
8304     case ANYOF:
8305         if (utf8_target) {
8306             while (hardcount < max
8307                    && scan < loceol
8308                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8309             {
8310                 scan += UTF8SKIP(scan);
8311                 hardcount++;
8312             }
8313         } else {
8314             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
8315                 scan++;
8316         }
8317         break;
8318
8319     /* The argument (FLAGS) to all the POSIX node types is the class number */
8320
8321     case NPOSIXL:
8322         to_complement = 1;
8323         /* FALLTHROUGH */
8324
8325     case POSIXL:
8326         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8327         if (! utf8_target) {
8328             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8329                                                                    *scan)))
8330             {
8331                 scan++;
8332             }
8333         } else {
8334             while (hardcount < max && scan < loceol
8335                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8336                                                                   (U8 *) scan)))
8337             {
8338                 scan += UTF8SKIP(scan);
8339                 hardcount++;
8340             }
8341         }
8342         break;
8343
8344     case POSIXD:
8345         if (utf8_target) {
8346             goto utf8_posix;
8347         }
8348         /* FALLTHROUGH */
8349
8350     case POSIXA:
8351         if (utf8_target && loceol - scan > max) {
8352
8353             /* We didn't adjust <loceol> at the beginning of this routine
8354              * because is UTF-8, but it is actually ok to do so, since here, to
8355              * match, 1 char == 1 byte. */
8356             loceol = scan + max;
8357         }
8358         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8359             scan++;
8360         }
8361         break;
8362
8363     case NPOSIXD:
8364         if (utf8_target) {
8365             to_complement = 1;
8366             goto utf8_posix;
8367         }
8368         /* FALLTHROUGH */
8369
8370     case NPOSIXA:
8371         if (! utf8_target) {
8372             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8373                 scan++;
8374             }
8375         }
8376         else {
8377
8378             /* The complement of something that matches only ASCII matches all
8379              * non-ASCII, plus everything in ASCII that isn't in the class. */
8380             while (hardcount < max && scan < loceol
8381                    && (! isASCII_utf8(scan)
8382                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8383             {
8384                 scan += UTF8SKIP(scan);
8385                 hardcount++;
8386             }
8387         }
8388         break;
8389
8390     case NPOSIXU:
8391         to_complement = 1;
8392         /* FALLTHROUGH */
8393
8394     case POSIXU:
8395         if (! utf8_target) {
8396             while (scan < loceol && to_complement
8397                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8398             {
8399                 scan++;
8400             }
8401         }
8402         else {
8403           utf8_posix:
8404             classnum = (_char_class_number) FLAGS(p);
8405             if (classnum < _FIRST_NON_SWASH_CC) {
8406
8407                 /* Here, a swash is needed for above-Latin1 code points.
8408                  * Process as many Latin1 code points using the built-in rules.
8409                  * Go to another loop to finish processing upon encountering
8410                  * the first Latin1 code point.  We could do that in this loop
8411                  * as well, but the other way saves having to test if the swash
8412                  * has been loaded every time through the loop: extra space to
8413                  * save a test. */
8414                 while (hardcount < max && scan < loceol) {
8415                     if (UTF8_IS_INVARIANT(*scan)) {
8416                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8417                                                                    classnum))))
8418                         {
8419                             break;
8420                         }
8421                         scan++;
8422                     }
8423                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8424                         if (! (to_complement
8425                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
8426                                                                      *(scan + 1)),
8427                                                     classnum))))
8428                         {
8429                             break;
8430                         }
8431                         scan += 2;
8432                     }
8433                     else {
8434                         goto found_above_latin1;
8435                     }
8436
8437                     hardcount++;
8438                 }
8439             }
8440             else {
8441                 /* For these character classes, the knowledge of how to handle
8442                  * every code point is compiled in to Perl via a macro.  This
8443                  * code is written for making the loops as tight as possible.
8444                  * It could be refactored to save space instead */
8445                 switch (classnum) {
8446                     case _CC_ENUM_SPACE:
8447                         while (hardcount < max
8448                                && scan < loceol
8449                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
8450                         {
8451                             scan += UTF8SKIP(scan);
8452                             hardcount++;
8453                         }
8454                         break;
8455                     case _CC_ENUM_BLANK:
8456                         while (hardcount < max
8457                                && scan < loceol
8458                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
8459                         {
8460                             scan += UTF8SKIP(scan);
8461                             hardcount++;
8462                         }
8463                         break;
8464                     case _CC_ENUM_XDIGIT:
8465                         while (hardcount < max
8466                                && scan < loceol
8467                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
8468                         {
8469                             scan += UTF8SKIP(scan);
8470                             hardcount++;
8471                         }
8472                         break;
8473                     case _CC_ENUM_VERTSPACE:
8474                         while (hardcount < max
8475                                && scan < loceol
8476                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
8477                         {
8478                             scan += UTF8SKIP(scan);
8479                             hardcount++;
8480                         }
8481                         break;
8482                     case _CC_ENUM_CNTRL:
8483                         while (hardcount < max
8484                                && scan < loceol
8485                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
8486                         {
8487                             scan += UTF8SKIP(scan);
8488                             hardcount++;
8489                         }
8490                         break;
8491                     default:
8492                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
8493                 }
8494             }
8495         }
8496         break;
8497
8498       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
8499
8500         /* Load the swash if not already present */
8501         if (! PL_utf8_swash_ptrs[classnum]) {
8502             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
8503             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
8504                                         "utf8",
8505                                         "",
8506                                         &PL_sv_undef, 1, 0,
8507                                         PL_XPosix_ptrs[classnum], &flags);
8508         }
8509
8510         while (hardcount < max && scan < loceol
8511                && to_complement ^ cBOOL(_generic_utf8(
8512                                        classnum,
8513                                        scan,
8514                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
8515                                                    (U8 *) scan,
8516                                                    TRUE))))
8517         {
8518             scan += UTF8SKIP(scan);
8519             hardcount++;
8520         }
8521         break;
8522
8523     case LNBREAK:
8524         if (utf8_target) {
8525             while (hardcount < max && scan < loceol &&
8526                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
8527                 scan += c;
8528                 hardcount++;
8529             }
8530         } else {
8531             /* LNBREAK can match one or two latin chars, which is ok, but we
8532              * have to use hardcount in this situation, and throw away the
8533              * adjustment to <loceol> done before the switch statement */
8534             loceol = reginfo->strend;
8535             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
8536                 scan+=c;
8537                 hardcount++;
8538             }
8539         }
8540         break;
8541
8542     case BOUNDL:
8543     case NBOUNDL:
8544         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8545         /* FALLTHROUGH */
8546     case BOUND:
8547     case BOUNDA:
8548     case BOUNDU:
8549     case EOS:
8550     case GPOS:
8551     case KEEPS:
8552     case NBOUND:
8553     case NBOUNDA:
8554     case NBOUNDU:
8555     case OPFAIL:
8556     case SBOL:
8557     case SEOL:
8558         /* These are all 0 width, so match right here or not at all. */
8559         break;
8560
8561     default:
8562         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
8563         /* NOTREACHED */
8564         NOT_REACHED; /* NOTREACHED */
8565
8566     }
8567
8568     if (hardcount)
8569         c = hardcount;
8570     else
8571         c = scan - *startposp;
8572     *startposp = scan;
8573
8574     DEBUG_r({
8575         GET_RE_DEBUG_FLAGS_DECL;
8576         DEBUG_EXECUTE_r({
8577             SV * const prop = sv_newmortal();
8578             regprop(prog, prop, p, reginfo, NULL);
8579             PerlIO_printf(Perl_debug_log,
8580                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
8581                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
8582         });
8583     });
8584
8585     return(c);
8586 }
8587
8588
8589 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
8590 /*
8591 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
8592 create a copy so that changes the caller makes won't change the shared one.
8593 If <altsvp> is non-null, will return NULL in it, for back-compat.
8594  */
8595 SV *
8596 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
8597 {
8598     PERL_ARGS_ASSERT_REGCLASS_SWASH;
8599
8600     if (altsvp) {
8601         *altsvp = NULL;
8602     }
8603
8604     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
8605 }
8606
8607 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
8608
8609 /*
8610  - reginclass - determine if a character falls into a character class
8611
8612   n is the ANYOF-type regnode
8613   p is the target string
8614   p_end points to one byte beyond the end of the target string
8615   utf8_target tells whether p is in UTF-8.
8616
8617   Returns true if matched; false otherwise.
8618
8619   Note that this can be a synthetic start class, a combination of various
8620   nodes, so things you think might be mutually exclusive, such as locale,
8621   aren't.  It can match both locale and non-locale
8622
8623  */
8624
8625 STATIC bool
8626 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
8627 {
8628     dVAR;
8629     const char flags = ANYOF_FLAGS(n);
8630     bool match = FALSE;
8631     UV c = *p;
8632
8633     PERL_ARGS_ASSERT_REGINCLASS;
8634
8635     /* If c is not already the code point, get it.  Note that
8636      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
8637     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
8638         STRLEN c_len = 0;
8639         c = utf8n_to_uvchr(p, p_end - p, &c_len,
8640                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
8641                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
8642                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
8643                  * UTF8_ALLOW_FFFF */
8644         if (c_len == (STRLEN)-1)
8645             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
8646         if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) {
8647             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
8648         }
8649     }
8650
8651     /* If this character is potentially in the bitmap, check it */
8652     if (c < NUM_ANYOF_CODE_POINTS) {
8653         if (ANYOF_BITMAP_TEST(n, c))
8654             match = TRUE;
8655         else if ((flags
8656                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
8657                   && OP(n) == ANYOFD
8658                   && ! utf8_target
8659                   && ! isASCII(c))
8660         {
8661             match = TRUE;
8662         }
8663         else if (flags & ANYOF_LOCALE_FLAGS) {
8664             if ((flags & ANYOF_LOC_FOLD)
8665                 && c < 256
8666                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
8667             {
8668                 match = TRUE;
8669             }
8670             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
8671                      && c < 256
8672             ) {
8673
8674                 /* The data structure is arranged so bits 0, 2, 4, ... are set
8675                  * if the class includes the Posix character class given by
8676                  * bit/2; and 1, 3, 5, ... are set if the class includes the
8677                  * complemented Posix class given by int(bit/2).  So we loop
8678                  * through the bits, each time changing whether we complement
8679                  * the result or not.  Suppose for the sake of illustration
8680                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
8681                  * is set, it means there is a match for this ANYOF node if the
8682                  * character is in the class given by the expression (0 / 2 = 0
8683                  * = \w).  If it is in that class, isFOO_lc() will return 1,
8684                  * and since 'to_complement' is 0, the result will stay TRUE,
8685                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
8686                  * bit 1 is 1.  That means there is a match if the character
8687                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
8688                  * but will on bit 1.  On the second iteration 'to_complement'
8689                  * will be 1, so the exclusive or will reverse things, so we
8690                  * are testing for \W.  On the third iteration, 'to_complement'
8691                  * will be 0, and we would be testing for \s; the fourth
8692                  * iteration would test for \S, etc.
8693                  *
8694                  * Note that this code assumes that all the classes are closed
8695                  * under folding.  For example, if a character matches \w, then
8696                  * its fold does too; and vice versa.  This should be true for
8697                  * any well-behaved locale for all the currently defined Posix
8698                  * classes, except for :lower: and :upper:, which are handled
8699                  * by the pseudo-class :cased: which matches if either of the
8700                  * other two does.  To get rid of this assumption, an outer
8701                  * loop could be used below to iterate over both the source
8702                  * character, and its fold (if different) */
8703
8704                 int count = 0;
8705                 int to_complement = 0;
8706
8707                 while (count < ANYOF_MAX) {
8708                     if (ANYOF_POSIXL_TEST(n, count)
8709                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
8710                     {
8711                         match = TRUE;
8712                         break;
8713                     }
8714                     count++;
8715                     to_complement ^= 1;
8716                 }
8717             }
8718         }
8719     }
8720
8721
8722     /* If the bitmap didn't (or couldn't) match, and something outside the
8723      * bitmap could match, try that. */
8724     if (!match) {
8725         if (c >= NUM_ANYOF_CODE_POINTS
8726             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
8727         {
8728             match = TRUE;       /* Everything above the bitmap matches */
8729         }
8730         else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
8731                   || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
8732                   || ((flags & ANYOF_LOC_FOLD)
8733                        && IN_UTF8_CTYPE_LOCALE
8734                        && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
8735         {
8736             SV* only_utf8_locale = NULL;
8737             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
8738                                                        &only_utf8_locale, NULL);
8739             if (sw) {
8740                 U8 utf8_buffer[2];
8741                 U8 * utf8_p;
8742                 if (utf8_target) {
8743                     utf8_p = (U8 *) p;
8744                 } else { /* Convert to utf8 */
8745                     utf8_p = utf8_buffer;
8746                     append_utf8_from_native_byte(*p, &utf8_p);
8747                     utf8_p = utf8_buffer;
8748                 }
8749
8750                 if (swash_fetch(sw, utf8_p, TRUE)) {
8751                     match = TRUE;
8752                 }
8753             }
8754             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
8755                 match = _invlist_contains_cp(only_utf8_locale, c);
8756             }
8757         }
8758
8759         if (UNICODE_IS_SUPER(c)
8760             && (flags
8761                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
8762             && OP(n) != ANYOFD
8763             && ckWARN_d(WARN_NON_UNICODE))
8764         {
8765             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
8766                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
8767         }
8768     }
8769
8770 #if ANYOF_INVERT != 1
8771     /* Depending on compiler optimization cBOOL takes time, so if don't have to
8772      * use it, don't */
8773 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
8774 #endif
8775
8776     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
8777     return (flags & ANYOF_INVERT) ^ match;
8778 }
8779
8780 STATIC U8 *
8781 S_reghop3(U8 *s, SSize_t off, const U8* lim)
8782 {
8783     /* return the position 'off' UTF-8 characters away from 's', forward if
8784      * 'off' >= 0, backwards if negative.  But don't go outside of position
8785      * 'lim', which better be < s  if off < 0 */
8786
8787     PERL_ARGS_ASSERT_REGHOP3;
8788
8789     if (off >= 0) {
8790         while (off-- && s < lim) {
8791             /* XXX could check well-formedness here */
8792             s += UTF8SKIP(s);
8793         }
8794     }
8795     else {
8796         while (off++ && s > lim) {
8797             s--;
8798             if (UTF8_IS_CONTINUED(*s)) {
8799                 while (s > lim && UTF8_IS_CONTINUATION(*s))
8800                     s--;
8801                 if (! UTF8_IS_START(*s)) {
8802                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
8803                 }
8804             }
8805             /* XXX could check well-formedness here */
8806         }
8807     }
8808     return s;
8809 }
8810
8811 STATIC U8 *
8812 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
8813 {
8814     PERL_ARGS_ASSERT_REGHOP4;
8815
8816     if (off >= 0) {
8817         while (off-- && s < rlim) {
8818             /* XXX could check well-formedness here */
8819             s += UTF8SKIP(s);
8820         }
8821     }
8822     else {
8823         while (off++ && s > llim) {
8824             s--;
8825             if (UTF8_IS_CONTINUED(*s)) {
8826                 while (s > llim && UTF8_IS_CONTINUATION(*s))
8827                     s--;
8828                 if (! UTF8_IS_START(*s)) {
8829                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
8830                 }
8831             }
8832             /* XXX could check well-formedness here */
8833         }
8834     }
8835     return s;
8836 }
8837
8838 /* like reghop3, but returns NULL on overrun, rather than returning last
8839  * char pos */
8840
8841 STATIC U8 *
8842 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
8843 {
8844     PERL_ARGS_ASSERT_REGHOPMAYBE3;
8845
8846     if (off >= 0) {
8847         while (off-- && s < lim) {
8848             /* XXX could check well-formedness here */
8849             s += UTF8SKIP(s);
8850         }
8851         if (off >= 0)
8852             return NULL;
8853     }
8854     else {
8855         while (off++ && s > lim) {
8856             s--;
8857             if (UTF8_IS_CONTINUED(*s)) {
8858                 while (s > lim && UTF8_IS_CONTINUATION(*s))
8859                     s--;
8860                 if (! UTF8_IS_START(*s)) {
8861                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
8862                 }
8863             }
8864             /* XXX could check well-formedness here */
8865         }
8866         if (off <= 0)
8867             return NULL;
8868     }
8869     return s;
8870 }
8871
8872
8873 /* when executing a regex that may have (?{}), extra stuff needs setting
8874    up that will be visible to the called code, even before the current
8875    match has finished. In particular:
8876
8877    * $_ is localised to the SV currently being matched;
8878    * pos($_) is created if necessary, ready to be updated on each call-out
8879      to code;
8880    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8881      isn't set until the current pattern is successfully finished), so that
8882      $1 etc of the match-so-far can be seen;
8883    * save the old values of subbeg etc of the current regex, and  set then
8884      to the current string (again, this is normally only done at the end
8885      of execution)
8886 */
8887
8888 static void
8889 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8890 {
8891     MAGIC *mg;
8892     regexp *const rex = ReANY(reginfo->prog);
8893     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8894
8895     eval_state->rex = rex;
8896
8897     if (reginfo->sv) {
8898         /* Make $_ available to executed code. */
8899         if (reginfo->sv != DEFSV) {
8900             SAVE_DEFSV;
8901             DEFSV_set(reginfo->sv);
8902         }
8903
8904         if (!(mg = mg_find_mglob(reginfo->sv))) {
8905             /* prepare for quick setting of pos */
8906             mg = sv_magicext_mglob(reginfo->sv);
8907             mg->mg_len = -1;
8908         }
8909         eval_state->pos_magic = mg;
8910         eval_state->pos       = mg->mg_len;
8911         eval_state->pos_flags = mg->mg_flags;
8912     }
8913     else
8914         eval_state->pos_magic = NULL;
8915
8916     if (!PL_reg_curpm) {
8917         /* PL_reg_curpm is a fake PMOP that we can attach the current
8918          * regex to and point PL_curpm at, so that $1 et al are visible
8919          * within a /(?{})/. It's just allocated once per interpreter the
8920          * first time its needed */
8921         Newxz(PL_reg_curpm, 1, PMOP);
8922 #ifdef USE_ITHREADS
8923         {
8924             SV* const repointer = &PL_sv_undef;
8925             /* this regexp is also owned by the new PL_reg_curpm, which
8926                will try to free it.  */
8927             av_push(PL_regex_padav, repointer);
8928             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8929             PL_regex_pad = AvARRAY(PL_regex_padav);
8930         }
8931 #endif
8932     }
8933     SET_reg_curpm(reginfo->prog);
8934     eval_state->curpm = PL_curpm;
8935     PL_curpm = PL_reg_curpm;
8936     if (RXp_MATCH_COPIED(rex)) {
8937         /*  Here is a serious problem: we cannot rewrite subbeg,
8938             since it may be needed if this match fails.  Thus
8939             $` inside (?{}) could fail... */
8940         eval_state->subbeg     = rex->subbeg;
8941         eval_state->sublen     = rex->sublen;
8942         eval_state->suboffset  = rex->suboffset;
8943         eval_state->subcoffset = rex->subcoffset;
8944 #ifdef PERL_ANY_COW
8945         eval_state->saved_copy = rex->saved_copy;
8946 #endif
8947         RXp_MATCH_COPIED_off(rex);
8948     }
8949     else
8950         eval_state->subbeg = NULL;
8951     rex->subbeg = (char *)reginfo->strbeg;
8952     rex->suboffset = 0;
8953     rex->subcoffset = 0;
8954     rex->sublen = reginfo->strend - reginfo->strbeg;
8955 }
8956
8957
8958 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8959
8960 static void
8961 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8962 {
8963     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8964     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8965     regmatch_slab *s;
8966
8967     Safefree(aux->poscache);
8968
8969     if (eval_state) {
8970
8971         /* undo the effects of S_setup_eval_state() */
8972
8973         if (eval_state->subbeg) {
8974             regexp * const rex = eval_state->rex;
8975             rex->subbeg     = eval_state->subbeg;
8976             rex->sublen     = eval_state->sublen;
8977             rex->suboffset  = eval_state->suboffset;
8978             rex->subcoffset = eval_state->subcoffset;
8979 #ifdef PERL_ANY_COW
8980             rex->saved_copy = eval_state->saved_copy;
8981 #endif
8982             RXp_MATCH_COPIED_on(rex);
8983         }
8984         if (eval_state->pos_magic)
8985         {
8986             eval_state->pos_magic->mg_len = eval_state->pos;
8987             eval_state->pos_magic->mg_flags =
8988                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8989                | (eval_state->pos_flags & MGf_BYTES);
8990         }
8991
8992         PL_curpm = eval_state->curpm;
8993     }
8994
8995     PL_regmatch_state = aux->old_regmatch_state;
8996     PL_regmatch_slab  = aux->old_regmatch_slab;
8997
8998     /* free all slabs above current one - this must be the last action
8999      * of this function, as aux and eval_state are allocated within
9000      * slabs and may be freed here */
9001
9002     s = PL_regmatch_slab->next;
9003     if (s) {
9004         PL_regmatch_slab->next = NULL;
9005         while (s) {
9006             regmatch_slab * const osl = s;
9007             s = s->next;
9008             Safefree(osl);
9009         }
9010     }
9011 }
9012
9013
9014 STATIC void
9015 S_to_utf8_substr(pTHX_ regexp *prog)
9016 {
9017     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9018      * on the converted value */
9019
9020     int i = 1;
9021
9022     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9023
9024     do {
9025         if (prog->substrs->data[i].substr
9026             && !prog->substrs->data[i].utf8_substr) {
9027             SV* const sv = newSVsv(prog->substrs->data[i].substr);
9028             prog->substrs->data[i].utf8_substr = sv;
9029             sv_utf8_upgrade(sv);
9030             if (SvVALID(prog->substrs->data[i].substr)) {
9031                 if (SvTAIL(prog->substrs->data[i].substr)) {
9032                     /* Trim the trailing \n that fbm_compile added last
9033                        time.  */
9034                     SvCUR_set(sv, SvCUR(sv) - 1);
9035                     /* Whilst this makes the SV technically "invalid" (as its
9036                        buffer is no longer followed by "\0") when fbm_compile()
9037                        adds the "\n" back, a "\0" is restored.  */
9038                     fbm_compile(sv, FBMcf_TAIL);
9039                 } else
9040                     fbm_compile(sv, 0);
9041             }
9042             if (prog->substrs->data[i].substr == prog->check_substr)
9043                 prog->check_utf8 = sv;
9044         }
9045     } while (i--);
9046 }
9047
9048 STATIC bool
9049 S_to_byte_substr(pTHX_ regexp *prog)
9050 {
9051     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9052      * on the converted value; returns FALSE if can't be converted. */
9053
9054     int i = 1;
9055
9056     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9057
9058     do {
9059         if (prog->substrs->data[i].utf8_substr
9060             && !prog->substrs->data[i].substr) {
9061             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9062             if (! sv_utf8_downgrade(sv, TRUE)) {
9063                 return FALSE;
9064             }
9065             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9066                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9067                     /* Trim the trailing \n that fbm_compile added last
9068                         time.  */
9069                     SvCUR_set(sv, SvCUR(sv) - 1);
9070                     fbm_compile(sv, FBMcf_TAIL);
9071                 } else
9072                     fbm_compile(sv, 0);
9073             }
9074             prog->substrs->data[i].substr = sv;
9075             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9076                 prog->check_substr = sv;
9077         }
9078     } while (i--);
9079
9080     return TRUE;
9081 }
9082
9083 /*
9084  * ex: set ts=8 sts=4 sw=4 et:
9085  */