regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 /* Returns a boolean as to whether the input unsigned number is a power of 2
 100  * (2**0, 2**1, etc).  In other words if it has just a single bit set.
 101  * If not, subtracting 1 would leave the uppermost bit set, so the & would
 102  * yield non-zero */
 103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
 104
 105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 106     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 107     goto target;                                                         \
 108 } STMT_END
 109
 110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 111
 112 #ifndef STATIC
 113 #define STATIC  static
 114 #endif
 115
 116 /* Valid only if 'c', the character being looke-up, is an invariant under
 117  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 118  * everything matchable is straight forward in the bitmap */
 119 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 120                                 ? reginclass(prog,p,c,c+1,u)                \
 121                                 : ANYOF_BITMAP_TEST(p,*(c)))
 122
 123 /*
 124  * Forwards.
 125  */
 126
 127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 128
 129 #define HOPc(pos,off) \
 130         (char *)(reginfo->is_utf8_target \
 131             ? reghop3((U8*)pos, off, \
 132                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 133             : (U8*)(pos + off))
 134
 135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 136 #define HOPBACK3(pos, off, lim) \
 137         (reginfo->is_utf8_target                          \
 138             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 139             : (pos - off >= lim)                                 \
 140                 ? (U8*)pos - off                                 \
 141                 : NULL)
 142
 143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 144
 145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 147
 148 /* lim must be +ve. Returns NULL on overshoot */
 149 #define HOPMAYBE3(pos,off,lim) \
 150         (reginfo->is_utf8_target                        \
 151             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 152             : ((U8*)pos + off <= lim)                   \
 153                 ? (U8*)pos + off                        \
 154                 : NULL)
 155
 156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 157  * off must be >=0; args should be vars rather than expressions */
 158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 159     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 160     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 162
 163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 164     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 165     : (U8*)(pos + off))
 166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 167
 168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 169 #define NEXTCHR_IS_EOS (nextchr < 0)
 170
 171 #define SET_nextchr \
 172     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 173
 174 #define SET_locinput(p) \
 175     locinput = (p);  \
 176     SET_nextchr
 177
 178
 179 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 180         if (!swash_ptr) {                                                     \
 181             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 182             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 183                                          1, 0, invlist, &flags);              \
 184             assert(swash_ptr);                                                \
 185         }                                                                     \
 186     } STMT_END
 187
 188 /* If in debug mode, we test that a known character properly matches */
 189 #ifdef DEBUGGING
 190 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 191                                           property_name,                      \
 192                                           invlist,                            \
 193                                           utf8_char_in_property)              \
 194         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 195         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 196 #else
 197 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 198                                           property_name,                      \
 199                                           invlist,                            \
 200                                           utf8_char_in_property)              \
 201         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 202 #endif
 203
 204 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 205                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 206                                         "",                                   \
 207                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 208                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 209
 210 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 211 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 212
 213 /* for use after a quantifier and before an EXACT-like node -- japhy */
 214 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 215  *
 216  * NOTE that *nothing* that affects backtracking should be in here, specifically
 217  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 218  * node that is in between two EXACT like nodes when ascertaining what the required
 219  * "follow" character is. This should probably be moved to regex compile time
 220  * although it may be done at run time beause of the REF possibility - more
 221  * investigation required. -- demerphq
 222 */
 223 #define JUMPABLE(rn) (                                                             \
 224     OP(rn) == OPEN ||                                                              \
 225     (OP(rn) == CLOSE &&                                                            \
 226      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 227     OP(rn) == EVAL ||                                                              \
 228     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 229     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 230     OP(rn) == KEEPS ||                                                             \
 231     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 232 )
 233 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 234
 235 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 236
 237 #if 0
 238 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 239    we don't need this definition.  XXX These are now out-of-sync*/
 240 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 241 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 242 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 243
 244 #else
 245 /* ... so we use this as its faster. */
 246 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 247 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 248 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 249 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 250
 251 #endif
 252
 253 /*
 254   Search for mandatory following text node; for lookahead, the text must
 255   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 256 */
 257 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 258     while (JUMPABLE(rn)) { \
 259         const OPCODE type = OP(rn); \
 260         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 261             rn = NEXTOPER(NEXTOPER(rn)); \
 262         else if (type == PLUS) \
 263             rn = NEXTOPER(rn); \
 264         else if (type == IFMATCH) \
 265             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 266         else rn += NEXT_OFF(rn); \
 267     } \
 268 } STMT_END
 269
 270 #define SLAB_FIRST(s) (&(s)->states[0])
 271 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 272
 273 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 274 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 275 static regmatch_state * S_push_slab(pTHX);
 276
 277 #define REGCP_PAREN_ELEMS 3
 278 #define REGCP_OTHER_ELEMS 3
 279 #define REGCP_FRAME_ELEMS 1
 280 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 281  * are needed for the regexp context stack bookkeeping. */
 282
 283 STATIC CHECKPOINT
 284 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 285 {
 286     const int retval = PL_savestack_ix;
 287     const int paren_elems_to_push =
 288                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 289     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 290     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 291     I32 p;
 292     GET_RE_DEBUG_FLAGS_DECL;
 293
 294     PERL_ARGS_ASSERT_REGCPPUSH;
 295
 296     if (paren_elems_to_push < 0)
 297         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 298                    (int)paren_elems_to_push, (int)maxopenparen,
 299                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 300
 301     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 302         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 303                    " out of range (%lu-%ld)",
 304                    total_elems,
 305                    (unsigned long)maxopenparen,
 306                    (long)parenfloor);
 307
 308     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 309
 310     DEBUG_BUFFERS_r(
 311         if ((int)maxopenparen > (int)parenfloor)
 312             Perl_re_exec_indentf( aTHX_
 313                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 314                 depth,
 315                 PTR2UV(rex),
 316                 PTR2UV(rex->offs)
 317             );
 318     );
 319     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 320 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 321         SSPUSHIV(rex->offs[p].end);
 322         SSPUSHIV(rex->offs[p].start);
 323         SSPUSHINT(rex->offs[p].start_tmp);
 324         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 325             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 326             depth,
 327             (UV)p,
 328             (IV)rex->offs[p].start,
 329             (IV)rex->offs[p].start_tmp,
 330             (IV)rex->offs[p].end
 331         ));
 332     }
 333 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 334     SSPUSHINT(maxopenparen);
 335     SSPUSHINT(rex->lastparen);
 336     SSPUSHINT(rex->lastcloseparen);
 337     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 338
 339     return retval;
 340 }
 341
 342 /* These are needed since we do not localize EVAL nodes: */
 343 #define REGCP_SET(cp)                                           \
 344     DEBUG_STATE_r(                                              \
 345         Perl_re_exec_indentf( aTHX_                             \
 346             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 347             depth, (IV)PL_savestack_ix                          \
 348         )                                                       \
 349     );                                                          \
 350     cp = PL_savestack_ix
 351
 352 #define REGCP_UNWIND(cp)                                        \
 353     DEBUG_STATE_r(                                              \
 354         if (cp != PL_savestack_ix)                              \
 355             Perl_re_exec_indentf( aTHX_                         \
 356                 "Clearing an EVAL scope, savestack=%"           \
 357                 IVdf "..%" IVdf "\n",                           \
 358                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 359             )                                                   \
 360     );                                                          \
 361     regcpblow(cp)
 362
 363 #define UNWIND_PAREN(lp, lcp)               \
 364     for (n = rex->lastparen; n > lp; n--)   \
 365         rex->offs[n].end = -1;              \
 366     rex->lastparen = n;                     \
 367     rex->lastcloseparen = lcp;
 368
 369
 370 STATIC void
 371 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 372 {
 373     UV i;
 374     U32 paren;
 375     GET_RE_DEBUG_FLAGS_DECL;
 376
 377     PERL_ARGS_ASSERT_REGCPPOP;
 378
 379     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 380     i = SSPOPUV;
 381     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 382     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 383     rex->lastcloseparen = SSPOPINT;
 384     rex->lastparen = SSPOPINT;
 385     *maxopenparen_p = SSPOPINT;
 386
 387     i -= REGCP_OTHER_ELEMS;
 388     /* Now restore the parentheses context. */
 389     DEBUG_BUFFERS_r(
 390         if (i || rex->lastparen + 1 <= rex->nparens)
 391             Perl_re_exec_indentf( aTHX_
 392                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 393                 depth,
 394                 PTR2UV(rex),
 395                 PTR2UV(rex->offs)
 396             );
 397     );
 398     paren = *maxopenparen_p;
 399     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 400         SSize_t tmps;
 401         rex->offs[paren].start_tmp = SSPOPINT;
 402         rex->offs[paren].start = SSPOPIV;
 403         tmps = SSPOPIV;
 404         if (paren <= rex->lastparen)
 405             rex->offs[paren].end = tmps;
 406         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 407             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 408             depth,
 409             (UV)paren,
 410             (IV)rex->offs[paren].start,
 411             (IV)rex->offs[paren].start_tmp,
 412             (IV)rex->offs[paren].end,
 413             (paren > rex->lastparen ? "(skipped)" : ""));
 414         );
 415         paren--;
 416     }
 417 #if 1
 418     /* It would seem that the similar code in regtry()
 419      * already takes care of this, and in fact it is in
 420      * a better location to since this code can #if 0-ed out
 421      * but the code in regtry() is needed or otherwise tests
 422      * requiring null fields (pat.t#187 and split.t#{13,14}
 423      * (as of patchlevel 7877)  will fail.  Then again,
 424      * this code seems to be necessary or otherwise
 425      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 426      * --jhi updated by dapm */
 427     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 428         if (i > *maxopenparen_p)
 429             rex->offs[i].start = -1;
 430         rex->offs[i].end = -1;
 431         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 432             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 433             depth,
 434             (UV)i,
 435             (i > *maxopenparen_p) ? "-1" : "  "
 436         ));
 437     }
 438 #endif
 439 }
 440
 441 /* restore the parens and associated vars at savestack position ix,
 442  * but without popping the stack */
 443
 444 STATIC void
 445 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 446 {
 447     I32 tmpix = PL_savestack_ix;
 448     PERL_ARGS_ASSERT_REGCP_RESTORE;
 449
 450     PL_savestack_ix = ix;
 451     regcppop(rex, maxopenparen_p);
 452     PL_savestack_ix = tmpix;
 453 }
 454
 455 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 456
 457 #ifndef PERL_IN_XSUB_RE
 458
 459 bool
 460 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 461 {
 462     /* Returns a boolean as to whether or not 'character' is a member of the
 463      * Posix character class given by 'classnum' that should be equivalent to a
 464      * value in the typedef '_char_class_number'.
 465      *
 466      * Ideally this could be replaced by a just an array of function pointers
 467      * to the C library functions that implement the macros this calls.
 468      * However, to compile, the precise function signatures are required, and
 469      * these may vary from platform to to platform.  To avoid having to figure
 470      * out what those all are on each platform, I (khw) am using this method,
 471      * which adds an extra layer of function call overhead (unless the C
 472      * optimizer strips it away).  But we don't particularly care about
 473      * performance with locales anyway. */
 474
 475     switch ((_char_class_number) classnum) {
 476         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 477         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 478         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 479         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 480         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 481                                         || isUPPER_LC(character);
 482         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 483         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 484         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 485         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 486         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 487         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 488         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 489         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 490         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 491         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 492         default:    /* VERTSPACE should never occur in locales */
 493             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 494     }
 495
 496     NOT_REACHED; /* NOTREACHED */
 497     return FALSE;
 498 }
 499
 500 #endif
 501
 502 STATIC bool
 503 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 504 {
 505     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 506      * 'character' is a member of the Posix character class given by 'classnum'
 507      * that should be equivalent to a value in the typedef
 508      * '_char_class_number'.
 509      *
 510      * This just calls isFOO_lc on the code point for the character if it is in
 511      * the range 0-255.  Outside that range, all characters use Unicode
 512      * rules, ignoring any locale.  So use the Unicode function if this class
 513      * requires a swash, and use the Unicode macro otherwise. */
 514
 515     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 516
 517     if (UTF8_IS_INVARIANT(*character)) {
 518         return isFOO_lc(classnum, *character);
 519     }
 520     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 521         return isFOO_lc(classnum,
 522                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 523     }
 524
 525     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 526
 527     if (classnum < _FIRST_NON_SWASH_CC) {
 528
 529         /* Initialize the swash unless done already */
 530         if (! PL_utf8_swash_ptrs[classnum]) {
 531             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 532             PL_utf8_swash_ptrs[classnum] =
 533                     _core_swash_init("utf8",
 534                                      "",
 535                                      &PL_sv_undef, 1, 0,
 536                                      PL_XPosix_ptrs[classnum], &flags);
 537         }
 538
 539         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 540                                  character,
 541                                  TRUE /* is UTF */ ));
 542     }
 543
 544     switch ((_char_class_number) classnum) {
 545         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 546         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 547         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 548         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 549         default:                 break;
 550     }
 551
 552     return FALSE; /* Things like CNTRL are always below 256 */
 553 }
 554
 555 STATIC char *
 556 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 557 {
 558     /* Returns the position of the first ASCII byte in the sequence between 's'
 559      * and 'send-1' inclusive; returns 'send' if none found */
 560
 561     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 562
 563 #ifndef EBCDIC
 564
 565     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 566
 567                             /* This term is wordsize if subword; 0 if not */
 568                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 569
 570                             /* 'offset' */
 571                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 572     {
 573
 574         /* Process per-byte until reach word boundary.  XXX This loop could be
 575          * eliminated if we knew that this platform had fast unaligned reads */
 576         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 577             if (isASCII(*s)) {
 578                 return s;
 579             }
 580             s++;    /* khw didn't bother creating a separate loop for
 581                        utf8_target */
 582         }
 583
 584         /* Here, we know we have at least one full word to process.  Process
 585          * per-word as long as we have at least a full word left */
 586         do {
 587             if ((* (PERL_UINTMAX_T *) s) & ~ PERL_VARIANTS_WORD_MASK)  {
 588                 break;
 589             }
 590             s += PERL_WORDSIZE;
 591         } while (s + PERL_WORDSIZE <= send);
 592     }
 593
 594 #endif
 595
 596     /* Process per-character */
 597     if (utf8_target) {
 598         while (s < send) {
 599             if (isASCII(*s)) {
 600                 return s;
 601             }
 602             s += UTF8SKIP(s);
 603         }
 604     }
 605     else {
 606         while (s < send) {
 607             if (isASCII(*s)) {
 608                 return s;
 609             }
 610             s++;
 611         }
 612     }
 613
 614     return s;
 615 }
 616
 617 STATIC char *
 618 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 619 {
 620     /* Returns the position of the first non-ASCII byte in the sequence between
 621      * 's' and 'send-1' inclusive; returns 'send' if none found */
 622
 623 #ifdef EBCDIC
 624
 625     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 626
 627     if (utf8_target) {
 628         while (s < send) {
 629             if ( ! isASCII(*s)) {
 630                 return s;
 631             }
 632             s += UTF8SKIP(s);
 633         }
 634     }
 635     else {
 636         while (s < send) {
 637             if ( ! isASCII(*s)) {
 638                 return s;
 639             }
 640             s++;
 641         }
 642     }
 643
 644     return s;
 645
 646 #else
 647
 648     const U8 * next_non_ascii = NULL;
 649
 650     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 651     PERL_UNUSED_ARG(utf8_target);
 652
 653     /* On ASCII platforms invariants and ASCII are identical, so if the string
 654      * is entirely invariants, there is no non-ASCII character */
 655     return (is_utf8_invariant_string_loc((U8 *) s,
 656                                          (STRLEN) (send - s),
 657                                          &next_non_ascii))
 658             ? (char *) send
 659             : (char *) next_non_ascii;
 660
 661 #endif
 662
 663 }
 664
 665 /*
 666  * pregexec and friends
 667  */
 668
 669 #ifndef PERL_IN_XSUB_RE
 670 /*
 671  - pregexec - match a regexp against a string
 672  */
 673 I32
 674 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 675          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 676 /* stringarg: the point in the string at which to begin matching */
 677 /* strend:    pointer to null at end of string */
 678 /* strbeg:    real beginning of string */
 679 /* minend:    end of match must be >= minend bytes after stringarg. */
 680 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 681  *            itself is accessed via the pointers above */
 682 /* nosave:    For optimizations. */
 683 {
 684     PERL_ARGS_ASSERT_PREGEXEC;
 685
 686     return
 687         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 688                       nosave ? 0 : REXEC_COPY_STR);
 689 }
 690 #endif
 691
 692
 693
 694 /* re_intuit_start():
 695  *
 696  * Based on some optimiser hints, try to find the earliest position in the
 697  * string where the regex could match.
 698  *
 699  *   rx:     the regex to match against
 700  *   sv:     the SV being matched: only used for utf8 flag; the string
 701  *           itself is accessed via the pointers below. Note that on
 702  *           something like an overloaded SV, SvPOK(sv) may be false
 703  *           and the string pointers may point to something unrelated to
 704  *           the SV itself.
 705  *   strbeg: real beginning of string
 706  *   strpos: the point in the string at which to begin matching
 707  *   strend: pointer to the byte following the last char of the string
 708  *   flags   currently unused; set to 0
 709  *   data:   currently unused; set to NULL
 710  *
 711  * The basic idea of re_intuit_start() is to use some known information
 712  * about the pattern, namely:
 713  *
 714  *   a) the longest known anchored substring (i.e. one that's at a
 715  *      constant offset from the beginning of the pattern; but not
 716  *      necessarily at a fixed offset from the beginning of the
 717  *      string);
 718  *   b) the longest floating substring (i.e. one that's not at a constant
 719  *      offset from the beginning of the pattern);
 720  *   c) Whether the pattern is anchored to the string; either
 721  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 722  *      or anchored to pos(): /\G/;
 723  *   d) A start class: a real or synthetic character class which
 724  *      represents which characters are legal at the start of the pattern;
 725  *
 726  * to either quickly reject the match, or to find the earliest position
 727  * within the string at which the pattern might match, thus avoiding
 728  * running the full NFA engine at those earlier locations, only to
 729  * eventually fail and retry further along.
 730  *
 731  * Returns NULL if the pattern can't match, or returns the address within
 732  * the string which is the earliest place the match could occur.
 733  *
 734  * The longest of the anchored and floating substrings is called 'check'
 735  * and is checked first. The other is called 'other' and is checked
 736  * second. The 'other' substring may not be present.  For example,
 737  *
 738  *    /(abc|xyz)ABC\d{0,3}DEFG/
 739  *
 740  * will have
 741  *
 742  *   check substr (float)    = "DEFG", offset 6..9 chars
 743  *   other substr (anchored) = "ABC",  offset 3..3 chars
 744  *   stclass = [ax]
 745  *
 746  * Be aware that during the course of this function, sometimes 'anchored'
 747  * refers to a substring being anchored relative to the start of the
 748  * pattern, and sometimes to the pattern itself being anchored relative to
 749  * the string. For example:
 750  *
 751  *   /\dabc/:   "abc" is anchored to the pattern;
 752  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 753  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 754  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 755  *                    but the pattern is anchored to the string.
 756  */
 757
 758 char *
 759 Perl_re_intuit_start(pTHX_
 760                     REGEXP * const rx,
 761                     SV *sv,
 762                     const char * const strbeg,
 763                     char *strpos,
 764                     char *strend,
 765                     const U32 flags,
 766                     re_scream_pos_data *data)
 767 {
 768     struct regexp *const prog = ReANY(rx);
 769     SSize_t start_shift = prog->check_offset_min;
 770     /* Should be nonnegative! */
 771     SSize_t end_shift   = 0;
 772     /* current lowest pos in string where the regex can start matching */
 773     char *rx_origin = strpos;
 774     SV *check;
 775     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 776     U8   other_ix = 1 - prog->substrs->check_ix;
 777     bool ml_anch = 0;
 778     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 779     char *check_at = NULL;              /* check substr found at this pos */
 780     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 781     RXi_GET_DECL(prog,progi);
 782     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 783     regmatch_info *const reginfo = &reginfo_buf;
 784     GET_RE_DEBUG_FLAGS_DECL;
 785
 786     PERL_ARGS_ASSERT_RE_INTUIT_START;
 787     PERL_UNUSED_ARG(flags);
 788     PERL_UNUSED_ARG(data);
 789
 790     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 791                 "Intuit: trying to determine minimum start position...\n"));
 792
 793     /* for now, assume that all substr offsets are positive. If at some point
 794      * in the future someone wants to do clever things with lookbehind and
 795      * -ve offsets, they'll need to fix up any code in this function
 796      * which uses these offsets. See the thread beginning
 797      * <20140113145929.GF27210@iabyn.com>
 798      */
 799     assert(prog->substrs->data[0].min_offset >= 0);
 800     assert(prog->substrs->data[0].max_offset >= 0);
 801     assert(prog->substrs->data[1].min_offset >= 0);
 802     assert(prog->substrs->data[1].max_offset >= 0);
 803     assert(prog->substrs->data[2].min_offset >= 0);
 804     assert(prog->substrs->data[2].max_offset >= 0);
 805
 806     /* for now, assume that if both present, that the floating substring
 807      * doesn't start before the anchored substring.
 808      * If you break this assumption (e.g. doing better optimisations
 809      * with lookahead/behind), then you'll need to audit the code in this
 810      * function carefully first
 811      */
 812     assert(
 813             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 814               && (prog->float_utf8    || prog->float_substr))
 815            || (prog->float_min_offset >= prog->anchored_offset));
 816
 817     /* byte rather than char calculation for efficiency. It fails
 818      * to quickly reject some cases that can't match, but will reject
 819      * them later after doing full char arithmetic */
 820     if (prog->minlen > strend - strpos) {
 821         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 822                               "  String too short...\n"));
 823         goto fail;
 824     }
 825
 826     RXp_MATCH_UTF8_set(prog, utf8_target);
 827     reginfo->is_utf8_target = cBOOL(utf8_target);
 828     reginfo->info_aux = NULL;
 829     reginfo->strbeg = strbeg;
 830     reginfo->strend = strend;
 831     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 832     reginfo->intuit = 1;
 833     /* not actually used within intuit, but zero for safety anyway */
 834     reginfo->poscache_maxiter = 0;
 835
 836     if (utf8_target) {
 837         if ((!prog->anchored_utf8 && prog->anchored_substr)
 838                 || (!prog->float_utf8 && prog->float_substr))
 839             to_utf8_substr(prog);
 840         check = prog->check_utf8;
 841     } else {
 842         if (!prog->check_substr && prog->check_utf8) {
 843             if (! to_byte_substr(prog)) {
 844                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 845             }
 846         }
 847         check = prog->check_substr;
 848     }
 849
 850     /* dump the various substring data */
 851     DEBUG_OPTIMISE_MORE_r({
 852         int i;
 853         for (i=0; i<=2; i++) {
 854             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 855                                   : prog->substrs->data[i].substr);
 856             if (!sv)
 857                 continue;
 858
 859             Perl_re_printf( aTHX_
 860                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
 861                 " useful=%" IVdf " utf8=%d [%s]\n",
 862                 i,
 863                 (IV)prog->substrs->data[i].min_offset,
 864                 (IV)prog->substrs->data[i].max_offset,
 865                 (IV)prog->substrs->data[i].end_shift,
 866                 BmUSEFUL(sv),
 867                 utf8_target ? 1 : 0,
 868                 SvPEEK(sv));
 869         }
 870     });
 871
 872     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 873
 874         /* ml_anch: check after \n?
 875          *
 876          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 877          * with /.*.../, these flags will have been added by the
 878          * compiler:
 879          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 880          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 881          */
 882         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 883                    && !(prog->intflags & PREGf_IMPLICIT);
 884
 885         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 886             /* we are only allowed to match at BOS or \G */
 887
 888             /* trivially reject if there's a BOS anchor and we're not at BOS.
 889              *
 890              * Note that we don't try to do a similar quick reject for
 891              * \G, since generally the caller will have calculated strpos
 892              * based on pos() and gofs, so the string is already correctly
 893              * anchored by definition; and handling the exceptions would
 894              * be too fiddly (e.g. REXEC_IGNOREPOS).
 895              */
 896             if (   strpos != strbeg
 897                 && (prog->intflags & PREGf_ANCH_SBOL))
 898             {
 899                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 900                                 "  Not at start...\n"));
 901                 goto fail;
 902             }
 903
 904             /* in the presence of an anchor, the anchored (relative to the
 905              * start of the regex) substr must also be anchored relative
 906              * to strpos. So quickly reject if substr isn't found there.
 907              * This works for \G too, because the caller will already have
 908              * subtracted gofs from pos, and gofs is the offset from the
 909              * \G to the start of the regex. For example, in /.abc\Gdef/,
 910              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 911              * caller will have set strpos=pos()-4; we look for the substr
 912              * at position pos()-4+1, which lines up with the "a" */
 913
 914             if (prog->check_offset_min == prog->check_offset_max) {
 915                 /* Substring at constant offset from beg-of-str... */
 916                 SSize_t slen = SvCUR(check);
 917                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 918
 919                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 920                     "  Looking for check substr at fixed offset %" IVdf "...\n",
 921                     (IV)prog->check_offset_min));
 922
 923                 if (SvTAIL(check)) {
 924                     /* In this case, the regex is anchored at the end too.
 925                      * Unless it's a multiline match, the lengths must match
 926                      * exactly, give or take a \n.  NB: slen >= 1 since
 927                      * the last char of check is \n */
 928                     if (!multiline
 929                         && (   strend - s > slen
 930                             || strend - s < slen - 1
 931                             || (strend - s == slen && strend[-1] != '\n')))
 932                     {
 933                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 934                                             "  String too long...\n"));
 935                         goto fail_finish;
 936                     }
 937                     /* Now should match s[0..slen-2] */
 938                     slen--;
 939                 }
 940                 if (slen && (strend - s < slen
 941                     || *SvPVX_const(check) != *s
 942                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
 943                 {
 944                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 945                                     "  String not equal...\n"));
 946                     goto fail_finish;
 947                 }
 948
 949                 check_at = s;
 950                 goto success_at_start;
 951             }
 952         }
 953     }
 954
 955     end_shift = prog->check_end_shift;
 956
 957 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 958     if (end_shift < 0)
 959         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
 960                    (IV)end_shift, RX_PRECOMP(rx));
 961 #endif
 962
 963   restart:
 964
 965     /* This is the (re)entry point of the main loop in this function.
 966      * The goal of this loop is to:
 967      * 1) find the "check" substring in the region rx_origin..strend
 968      *    (adjusted by start_shift / end_shift). If not found, reject
 969      *    immediately.
 970      * 2) If it exists, look for the "other" substr too if defined; for
 971      *    example, if the check substr maps to the anchored substr, then
 972      *    check the floating substr, and vice-versa. If not found, go
 973      *    back to (1) with rx_origin suitably incremented.
 974      * 3) If we find an rx_origin position that doesn't contradict
 975      *    either of the substrings, then check the possible additional
 976      *    constraints on rx_origin of /^.../m or a known start class.
 977      *    If these fail, then depending on which constraints fail, jump
 978      *    back to here, or to various other re-entry points further along
 979      *    that skip some of the first steps.
 980      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 981      *    substring. If the start position was determined to be at the
 982      *    beginning of the string  - so, not rejected, but not optimised,
 983      *    since we have to run regmatch from position 0 - decrement the
 984      *    BmUSEFUL() count. Otherwise increment it.
 985      */
 986
 987
 988     /* first, look for the 'check' substring */
 989
 990     {
 991         U8* start_point;
 992         U8* end_point;
 993
 994         DEBUG_OPTIMISE_MORE_r({
 995             Perl_re_printf( aTHX_
 996                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
 997                 " Start shift: %" IVdf " End shift %" IVdf
 998                 " Real end Shift: %" IVdf "\n",
 999                 (IV)(rx_origin - strbeg),
1000                 (IV)prog->check_offset_min,
1001                 (IV)start_shift,
1002                 (IV)end_shift,
1003                 (IV)prog->check_end_shift);
1004         });
1005
1006         end_point = HOPBACK3(strend, end_shift, rx_origin);
1007         if (!end_point)
1008             goto fail_finish;
1009         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1010         if (!start_point)
1011             goto fail_finish;
1012
1013
1014         /* If the regex is absolutely anchored to either the start of the
1015          * string (SBOL) or to pos() (ANCH_GPOS), then
1016          * check_offset_max represents an upper bound on the string where
1017          * the substr could start. For the ANCH_GPOS case, we assume that
1018          * the caller of intuit will have already set strpos to
1019          * pos()-gofs, so in this case strpos + offset_max will still be
1020          * an upper bound on the substr.
1021          */
1022         if (!ml_anch
1023             && prog->intflags & PREGf_ANCH
1024             && prog->check_offset_max != SSize_t_MAX)
1025         {
1026             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1027             const char * const anchor =
1028                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1029             SSize_t targ_len = (char*)end_point - anchor;
1030
1031             if (check_len > targ_len) {
1032                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033                               "Anchored string too short...\n"));
1034                 goto fail_finish;
1035             }
1036
1037             /* do a bytes rather than chars comparison. It's conservative;
1038              * so it skips doing the HOP if the result can't possibly end
1039              * up earlier than the old value of end_point.
1040              */
1041             assert(anchor + check_len <= (char *)end_point);
1042             if (prog->check_offset_max + check_len < targ_len) {
1043                 end_point = HOP3lim((U8*)anchor,
1044                                 prog->check_offset_max,
1045                                 end_point - check_len
1046                             )
1047                             + check_len;
1048             }
1049         }
1050
1051         check_at = fbm_instr( start_point, end_point,
1052                       check, multiline ? FBMrf_MULTILINE : 0);
1053
1054         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1055             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1056             (IV)((char*)start_point - strbeg),
1057             (IV)((char*)end_point   - strbeg),
1058             (IV)(check_at ? check_at - strbeg : -1)
1059         ));
1060
1061         /* Update the count-of-usability, remove useless subpatterns,
1062             unshift s.  */
1063
1064         DEBUG_EXECUTE_r({
1065             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1066                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1067             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1068                               (check_at ? "Found" : "Did not find"),
1069                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1070                     ? "anchored" : "floating"),
1071                 quoted,
1072                 RE_SV_TAIL(check),
1073                 (check_at ? " at offset " : "...\n") );
1074         });
1075
1076         if (!check_at)
1077             goto fail_finish;
1078         /* set rx_origin to the minimum position where the regex could start
1079          * matching, given the constraint of the just-matched check substring.
1080          * But don't set it lower than previously.
1081          */
1082
1083         if (check_at - rx_origin > prog->check_offset_max)
1084             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1085         /* Finish the diagnostic message */
1086         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1087             "%ld (rx_origin now %" IVdf ")...\n",
1088             (long)(check_at - strbeg),
1089             (IV)(rx_origin - strbeg)
1090         ));
1091     }
1092
1093
1094     /* now look for the 'other' substring if defined */
1095
1096     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1097                     : prog->substrs->data[other_ix].substr)
1098     {
1099         /* Take into account the "other" substring. */
1100         char *last, *last1;
1101         char *s;
1102         SV* must;
1103         struct reg_substr_datum *other;
1104
1105       do_other_substr:
1106         other = &prog->substrs->data[other_ix];
1107
1108         /* if "other" is anchored:
1109          * we've previously found a floating substr starting at check_at.
1110          * This means that the regex origin must lie somewhere
1111          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1112          * and max:                 HOP3(check_at, -check_offset_min)
1113          * (except that min will be >= strpos)
1114          * So the fixed  substr must lie somewhere between
1115          *  HOP3(min, anchored_offset)
1116          *  HOP3(max, anchored_offset) + SvCUR(substr)
1117          */
1118
1119         /* if "other" is floating
1120          * Calculate last1, the absolute latest point where the
1121          * floating substr could start in the string, ignoring any
1122          * constraints from the earlier fixed match. It is calculated
1123          * as follows:
1124          *
1125          * strend - prog->minlen (in chars) is the absolute latest
1126          * position within the string where the origin of the regex
1127          * could appear. The latest start point for the floating
1128          * substr is float_min_offset(*) on from the start of the
1129          * regex.  last1 simply combines thee two offsets.
1130          *
1131          * (*) You might think the latest start point should be
1132          * float_max_offset from the regex origin, and technically
1133          * you'd be correct. However, consider
1134          *    /a\d{2,4}bcd\w/
1135          * Here, float min, max are 3,5 and minlen is 7.
1136          * This can match either
1137          *    /a\d\dbcd\w/
1138          *    /a\d\d\dbcd\w/
1139          *    /a\d\d\d\dbcd\w/
1140          * In the first case, the regex matches minlen chars; in the
1141          * second, minlen+1, in the third, minlen+2.
1142          * In the first case, the floating offset is 3 (which equals
1143          * float_min), in the second, 4, and in the third, 5 (which
1144          * equals float_max). In all cases, the floating string bcd
1145          * can never start more than 4 chars from the end of the
1146          * string, which equals minlen - float_min. As the substring
1147          * starts to match more than float_min from the start of the
1148          * regex, it makes the regex match more than minlen chars,
1149          * and the two cancel each other out. So we can always use
1150          * float_min - minlen, rather than float_max - minlen for the
1151          * latest position in the string.
1152          *
1153          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1154          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1155          */
1156
1157         assert(prog->minlen >= other->min_offset);
1158         last1 = HOP3c(strend,
1159                         other->min_offset - prog->minlen, strbeg);
1160
1161         if (other_ix) {/* i.e. if (other-is-float) */
1162             /* last is the latest point where the floating substr could
1163              * start, *given* any constraints from the earlier fixed
1164              * match. This constraint is that the floating string starts
1165              * <= float_max_offset chars from the regex origin (rx_origin).
1166              * If this value is less than last1, use it instead.
1167              */
1168             assert(rx_origin <= last1);
1169             last =
1170                 /* this condition handles the offset==infinity case, and
1171                  * is a short-cut otherwise. Although it's comparing a
1172                  * byte offset to a char length, it does so in a safe way,
1173                  * since 1 char always occupies 1 or more bytes,
1174                  * so if a string range is  (last1 - rx_origin) bytes,
1175                  * it will be less than or equal to  (last1 - rx_origin)
1176                  * chars; meaning it errs towards doing the accurate HOP3
1177                  * rather than just using last1 as a short-cut */
1178                 (last1 - rx_origin) < other->max_offset
1179                     ? last1
1180                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1181         }
1182         else {
1183             assert(strpos + start_shift <= check_at);
1184             last = HOP4c(check_at, other->min_offset - start_shift,
1185                         strbeg, strend);
1186         }
1187
1188         s = HOP3c(rx_origin, other->min_offset, strend);
1189         if (s < other_last)     /* These positions already checked */
1190             s = other_last;
1191
1192         must = utf8_target ? other->utf8_substr : other->substr;
1193         assert(SvPOK(must));
1194         {
1195             char *from = s;
1196             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1197
1198             if (to > strend)
1199                 to = strend;
1200             if (from > to) {
1201                 s = NULL;
1202                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1203                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1204                     (IV)(from - strbeg),
1205                     (IV)(to   - strbeg)
1206                 ));
1207             }
1208             else {
1209                 s = fbm_instr(
1210                     (unsigned char*)from,
1211                     (unsigned char*)to,
1212                     must,
1213                     multiline ? FBMrf_MULTILINE : 0
1214                 );
1215                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1216                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1217                     (IV)(from - strbeg),
1218                     (IV)(to   - strbeg),
1219                     (IV)(s ? s - strbeg : -1)
1220                 ));
1221             }
1222         }
1223
1224         DEBUG_EXECUTE_r({
1225             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1226                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1227             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1228                 s ? "Found" : "Contradicts",
1229                 other_ix ? "floating" : "anchored",
1230                 quoted, RE_SV_TAIL(must));
1231         });
1232
1233
1234         if (!s) {
1235             /* last1 is latest possible substr location. If we didn't
1236              * find it before there, we never will */
1237             if (last >= last1) {
1238                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1239                                         "; giving up...\n"));
1240                 goto fail_finish;
1241             }
1242
1243             /* try to find the check substr again at a later
1244              * position. Maybe next time we'll find the "other" substr
1245              * in range too */
1246             other_last = HOP3c(last, 1, strend) /* highest failure */;
1247             rx_origin =
1248                 other_ix /* i.e. if other-is-float */
1249                     ? HOP3c(rx_origin, 1, strend)
1250                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1251             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1252                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1253                 (other_ix ? "floating" : "anchored"),
1254                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1255                 (IV)(rx_origin - strbeg)
1256             ));
1257             goto restart;
1258         }
1259         else {
1260             if (other_ix) { /* if (other-is-float) */
1261                 /* other_last is set to s, not s+1, since its possible for
1262                  * a floating substr to fail first time, then succeed
1263                  * second time at the same floating position; e.g.:
1264                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1265                  * The first time round, anchored and float match at
1266                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1267                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1268                  */
1269                 other_last = s;
1270             }
1271             else {
1272                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1273                 other_last = HOP3c(s, 1, strend);
1274             }
1275             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1276                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1277                   (long)(s - strbeg),
1278                 (IV)(rx_origin - strbeg)
1279               ));
1280
1281         }
1282     }
1283     else {
1284         DEBUG_OPTIMISE_MORE_r(
1285             Perl_re_printf( aTHX_
1286                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1287                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1288                 " strend:%" IVdf "\n",
1289                 (IV)prog->check_offset_min,
1290                 (IV)prog->check_offset_max,
1291                 (IV)(check_at-strbeg),
1292                 (IV)(rx_origin-strbeg),
1293                 (IV)(rx_origin-check_at),
1294                 (IV)(strend-strbeg)
1295             )
1296         );
1297     }
1298
1299   postprocess_substr_matches:
1300
1301     /* handle the extra constraint of /^.../m if present */
1302
1303     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1304         char *s;
1305
1306         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1307                         "  looking for /^/m anchor"));
1308
1309         /* we have failed the constraint of a \n before rx_origin.
1310          * Find the next \n, if any, even if it's beyond the current
1311          * anchored and/or floating substrings. Whether we should be
1312          * scanning ahead for the next \n or the next substr is debatable.
1313          * On the one hand you'd expect rare substrings to appear less
1314          * often than \n's. On the other hand, searching for \n means
1315          * we're effectively flipping between check_substr and "\n" on each
1316          * iteration as the current "rarest" string candidate, which
1317          * means for example that we'll quickly reject the whole string if
1318          * hasn't got a \n, rather than trying every substr position
1319          * first
1320          */
1321
1322         s = HOP3c(strend, - prog->minlen, strpos);
1323         if (s <= rx_origin ||
1324             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1325         {
1326             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1327                             "  Did not find /%s^%s/m...\n",
1328                             PL_colors[0], PL_colors[1]));
1329             goto fail_finish;
1330         }
1331
1332         /* earliest possible origin is 1 char after the \n.
1333          * (since *rx_origin == '\n', it's safe to ++ here rather than
1334          * HOP(rx_origin, 1)) */
1335         rx_origin++;
1336
1337         if (prog->substrs->check_ix == 0  /* check is anchored */
1338             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1339         {
1340             /* Position contradicts check-string; either because
1341              * check was anchored (and thus has no wiggle room),
1342              * or check was float and rx_origin is above the float range */
1343             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1344                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1345                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1346             goto restart;
1347         }
1348
1349         /* if we get here, the check substr must have been float,
1350          * is in range, and we may or may not have had an anchored
1351          * "other" substr which still contradicts */
1352         assert(prog->substrs->check_ix); /* check is float */
1353
1354         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1355             /* whoops, the anchored "other" substr exists, so we still
1356              * contradict. On the other hand, the float "check" substr
1357              * didn't contradict, so just retry the anchored "other"
1358              * substr */
1359             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1360                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1361                 PL_colors[0], PL_colors[1],
1362                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1363                 (IV)(rx_origin - strbeg)
1364             ));
1365             goto do_other_substr;
1366         }
1367
1368         /* success: we don't contradict the found floating substring
1369          * (and there's no anchored substr). */
1370         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371             "  Found /%s^%s/m with rx_origin %ld...\n",
1372             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1373     }
1374     else {
1375         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1376             "  (multiline anchor test skipped)\n"));
1377     }
1378
1379   success_at_start:
1380
1381
1382     /* if we have a starting character class, then test that extra constraint.
1383      * (trie stclasses are too expensive to use here, we are better off to
1384      * leave it to regmatch itself) */
1385
1386     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1387         const U8* const str = (U8*)STRING(progi->regstclass);
1388
1389         /* XXX this value could be pre-computed */
1390         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1391                     ?  (reginfo->is_utf8_pat
1392                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1393                         : STR_LEN(progi->regstclass))
1394                     : 1);
1395         char * endpos;
1396         char *s;
1397         /* latest pos that a matching float substr constrains rx start to */
1398         char *rx_max_float = NULL;
1399
1400         /* if the current rx_origin is anchored, either by satisfying an
1401          * anchored substring constraint, or a /^.../m constraint, then we
1402          * can reject the current origin if the start class isn't found
1403          * at the current position. If we have a float-only match, then
1404          * rx_origin is constrained to a range; so look for the start class
1405          * in that range. if neither, then look for the start class in the
1406          * whole rest of the string */
1407
1408         /* XXX DAPM it's not clear what the minlen test is for, and why
1409          * it's not used in the floating case. Nothing in the test suite
1410          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1411          * Here are some old comments, which may or may not be correct:
1412          *
1413          *   minlen == 0 is possible if regstclass is \b or \B,
1414          *   and the fixed substr is ''$.
1415          *   Since minlen is already taken into account, rx_origin+1 is
1416          *   before strend; accidentally, minlen >= 1 guaranties no false
1417          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1418          *   0) below assumes that regstclass does not come from lookahead...
1419          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1420          *   This leaves EXACTF-ish only, which are dealt with in
1421          *   find_byclass().
1422          */
1423
1424         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1425             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1426         else if (prog->float_substr || prog->float_utf8) {
1427             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1428             endpos = HOP3clim(rx_max_float, cl_l, strend);
1429         }
1430         else
1431             endpos= strend;
1432
1433         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1434             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1435             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1436               (IV)start_shift, (IV)(check_at - strbeg),
1437               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1438
1439         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1440                             reginfo);
1441         if (!s) {
1442             if (endpos == strend) {
1443                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1444                                 "  Could not match STCLASS...\n") );
1445                 goto fail;
1446             }
1447             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1448                                "  This position contradicts STCLASS...\n") );
1449             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1450                         && !(prog->intflags & PREGf_IMPLICIT))
1451                 goto fail;
1452
1453             /* Contradict one of substrings */
1454             if (prog->anchored_substr || prog->anchored_utf8) {
1455                 if (prog->substrs->check_ix == 1) { /* check is float */
1456                     /* Have both, check_string is floating */
1457                     assert(rx_origin + start_shift <= check_at);
1458                     if (rx_origin + start_shift != check_at) {
1459                         /* not at latest position float substr could match:
1460                          * Recheck anchored substring, but not floating.
1461                          * The condition above is in bytes rather than
1462                          * chars for efficiency. It's conservative, in
1463                          * that it errs on the side of doing 'goto
1464                          * do_other_substr'. In this case, at worst,
1465                          * an extra anchored search may get done, but in
1466                          * practice the extra fbm_instr() is likely to
1467                          * get skipped anyway. */
1468                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1469                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1470                             (long)(other_last - strbeg),
1471                             (IV)(rx_origin - strbeg)
1472                         ));
1473                         goto do_other_substr;
1474                     }
1475                 }
1476             }
1477             else {
1478                 /* float-only */
1479
1480                 if (ml_anch) {
1481                     /* In the presence of ml_anch, we might be able to
1482                      * find another \n without breaking the current float
1483                      * constraint. */
1484
1485                     /* strictly speaking this should be HOP3c(..., 1, ...),
1486                      * but since we goto a block of code that's going to
1487                      * search for the next \n if any, its safe here */
1488                     rx_origin++;
1489                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1490                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1491                               PL_colors[0], PL_colors[1],
1492                               (long)(rx_origin - strbeg)) );
1493                     goto postprocess_substr_matches;
1494                 }
1495
1496                 /* strictly speaking this can never be true; but might
1497                  * be if we ever allow intuit without substrings */
1498                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1499                     goto fail;
1500
1501                 rx_origin = rx_max_float;
1502             }
1503
1504             /* at this point, any matching substrings have been
1505              * contradicted. Start again... */
1506
1507             rx_origin = HOP3c(rx_origin, 1, strend);
1508
1509             /* uses bytes rather than char calculations for efficiency.
1510              * It's conservative: it errs on the side of doing 'goto restart',
1511              * where there is code that does a proper char-based test */
1512             if (rx_origin + start_shift + end_shift > strend) {
1513                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1514                                        "  Could not match STCLASS...\n") );
1515                 goto fail;
1516             }
1517             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1518                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1519                 (prog->substrs->check_ix ? "floating" : "anchored"),
1520                 (long)(rx_origin + start_shift - strbeg),
1521                 (IV)(rx_origin - strbeg)
1522             ));
1523             goto restart;
1524         }
1525
1526         /* Success !!! */
1527
1528         if (rx_origin != s) {
1529             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1530                         "  By STCLASS: moving %ld --> %ld\n",
1531                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1532                    );
1533         }
1534         else {
1535             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1536                                   "  Does not contradict STCLASS...\n");
1537                    );
1538         }
1539     }
1540
1541     /* Decide whether using the substrings helped */
1542
1543     if (rx_origin != strpos) {
1544         /* Fixed substring is found far enough so that the match
1545            cannot start at strpos. */
1546
1547         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1548         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1549     }
1550     else {
1551         /* The found rx_origin position does not prohibit matching at
1552          * strpos, so calling intuit didn't gain us anything. Decrement
1553          * the BmUSEFUL() count on the check substring, and if we reach
1554          * zero, free it.  */
1555         if (!(prog->intflags & PREGf_NAUGHTY)
1556             && (utf8_target ? (
1557                 prog->check_utf8                /* Could be deleted already */
1558                 && --BmUSEFUL(prog->check_utf8) < 0
1559                 && (prog->check_utf8 == prog->float_utf8)
1560             ) : (
1561                 prog->check_substr              /* Could be deleted already */
1562                 && --BmUSEFUL(prog->check_substr) < 0
1563                 && (prog->check_substr == prog->float_substr)
1564             )))
1565         {
1566             /* If flags & SOMETHING - do not do it many times on the same match */
1567             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1568             /* XXX Does the destruction order has to change with utf8_target? */
1569             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1570             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1571             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1572             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1573             check = NULL;                       /* abort */
1574             /* XXXX This is a remnant of the old implementation.  It
1575                     looks wasteful, since now INTUIT can use many
1576                     other heuristics. */
1577             prog->extflags &= ~RXf_USE_INTUIT;
1578         }
1579     }
1580
1581     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1582             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1583              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1584
1585     return rx_origin;
1586
1587   fail_finish:                          /* Substring not found */
1588     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1589         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1590   fail:
1591     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1592                           PL_colors[4], PL_colors[5]));
1593     return NULL;
1594 }
1595
1596
1597 #define DECL_TRIE_TYPE(scan) \
1598     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1599                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1600                  trie_utf8l, trie_flu8 }                                            \
1601                     trie_type = ((scan->flags == EXACT)                             \
1602                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1603                                  : (scan->flags == EXACTL)                          \
1604                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1605                                     : (scan->flags == EXACTFA)                      \
1606                                       ? (utf8_target                                \
1607                                          ? trie_utf8_exactfa_fold                   \
1608                                          : trie_latin_utf8_exactfa_fold)            \
1609                                       : (scan->flags == EXACTFLU8                   \
1610                                          ? trie_flu8                                \
1611                                          : (utf8_target                             \
1612                                            ? trie_utf8_fold                         \
1613                                            :   trie_latin_utf8_fold)))
1614
1615 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1616 STMT_START {                                                                        \
1617     STRLEN skiplen;                                                                 \
1618     U8 flags = FOLD_FLAGS_FULL;                                                     \
1619     switch (trie_type) {                                                            \
1620     case trie_flu8:                                                                 \
1621         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1622         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1623             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1624         }                                                                           \
1625         goto do_trie_utf8_fold;                                                     \
1626     case trie_utf8_exactfa_fold:                                                    \
1627         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1628         /* FALLTHROUGH */                                                           \
1629     case trie_utf8_fold:                                                            \
1630       do_trie_utf8_fold:                                                            \
1631         if ( foldlen>0 ) {                                                          \
1632             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1633             foldlen -= len;                                                         \
1634             uscan += len;                                                           \
1635             len=0;                                                                  \
1636         } else {                                                                    \
1637             len = UTF8SKIP(uc);                                                     \
1638             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
1639                                                                             flags); \
1640             skiplen = UVCHR_SKIP( uvc );                                            \
1641             foldlen -= skiplen;                                                     \
1642             uscan = foldbuf + skiplen;                                              \
1643         }                                                                           \
1644         break;                                                                      \
1645     case trie_latin_utf8_exactfa_fold:                                              \
1646         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1647         /* FALLTHROUGH */                                                           \
1648     case trie_latin_utf8_fold:                                                      \
1649         if ( foldlen>0 ) {                                                          \
1650             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1651             foldlen -= len;                                                         \
1652             uscan += len;                                                           \
1653             len=0;                                                                  \
1654         } else {                                                                    \
1655             len = 1;                                                                \
1656             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1657             skiplen = UVCHR_SKIP( uvc );                                            \
1658             foldlen -= skiplen;                                                     \
1659             uscan = foldbuf + skiplen;                                              \
1660         }                                                                           \
1661         break;                                                                      \
1662     case trie_utf8l:                                                                \
1663         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1664         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1665             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1666         }                                                                           \
1667         /* FALLTHROUGH */                                                           \
1668     case trie_utf8:                                                                 \
1669         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1670         break;                                                                      \
1671     case trie_plain:                                                                \
1672         uvc = (UV)*uc;                                                              \
1673         len = 1;                                                                    \
1674     }                                                                               \
1675     if (uvc < 256) {                                                                \
1676         charid = trie->charmap[ uvc ];                                              \
1677     }                                                                               \
1678     else {                                                                          \
1679         charid = 0;                                                                 \
1680         if (widecharmap) {                                                          \
1681             SV** const svpp = hv_fetch(widecharmap,                                 \
1682                         (char*)&uvc, sizeof(UV), 0);                                \
1683             if (svpp)                                                               \
1684                 charid = (U16)SvIV(*svpp);                                          \
1685         }                                                                           \
1686     }                                                                               \
1687 } STMT_END
1688
1689 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1690     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1691                 startpos, doutf8, depth)
1692
1693 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1694 STMT_START {                                              \
1695     while (s <= e) {                                      \
1696         if ( (COND)                                       \
1697              && (ln == 1 || folder(s, pat_string, ln))    \
1698              && (reginfo->intuit || regtry(reginfo, &s)) )\
1699             goto got_it;                                  \
1700         s++;                                              \
1701     }                                                     \
1702 } STMT_END
1703
1704 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1705 STMT_START {                                          \
1706     while (s < strend) {                              \
1707         CODE                                          \
1708         s += UTF8SKIP(s);                             \
1709     }                                                 \
1710 } STMT_END
1711
1712 #define REXEC_FBC_SCAN(CODE)                          \
1713 STMT_START {                                          \
1714     while (s < strend) {                              \
1715         CODE                                          \
1716         s++;                                          \
1717     }                                                 \
1718 } STMT_END
1719
1720 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1721 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1722     if (COND) {                                                \
1723         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1724             goto got_it;                                       \
1725         else                                                   \
1726             tmp = doevery;                                     \
1727     }                                                          \
1728     else                                                       \
1729         tmp = 1;                                               \
1730 )
1731
1732 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1733 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1734     if (COND) {                                                \
1735         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1736             goto got_it;                                       \
1737         else                                                   \
1738             tmp = doevery;                                     \
1739     }                                                          \
1740     else                                                       \
1741         tmp = 1;                                               \
1742 )
1743
1744 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1745     if (utf8_target) {                                         \
1746         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1747     }                                                          \
1748     else {                                                     \
1749         REXEC_FBC_CLASS_SCAN(COND);                            \
1750     }
1751
1752 /* The three macros below are slightly different versions of the same logic.
1753  *
1754  * The first is for /a and /aa when the target string is UTF-8.  This can only
1755  * match ascii, but it must advance based on UTF-8.   The other two handle the
1756  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1757  * for the boundary (or non-boundary) between a word and non-word character.
1758  * The utf8 and non-utf8 cases have the same logic, but the details must be
1759  * different.  Find the "wordness" of the character just prior to this one, and
1760  * compare it with the wordness of this one.  If they differ, we have a
1761  * boundary.  At the beginning of the string, pretend that the previous
1762  * character was a new-line.
1763  *
1764  * All these macros uncleanly have side-effects with each other and outside
1765  * variables.  So far it's been too much trouble to clean-up
1766  *
1767  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1768  *               a word character or not.
1769  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1770  *               word/non-word
1771  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1772  *
1773  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1774  * are looking for a boundary or for a non-boundary.  If we are looking for a
1775  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1776  * see if this tentative match actually works, and if so, to quit the loop
1777  * here.  And vice-versa if we are looking for a non-boundary.
1778  *
1779  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1780  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1781  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1782  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1783  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1784  * complement.  But in that branch we complement tmp, meaning that at the
1785  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1786  * which means at the top of the loop in the next iteration, it is
1787  * TEST_NON_UTF8(s-1) */
1788 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1789     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1790     tmp = TEST_NON_UTF8(tmp);                                                  \
1791     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1792         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1793             tmp = !tmp;                                                        \
1794             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1795         }                                                                      \
1796         else {                                                                 \
1797             IF_FAIL;                                                           \
1798         }                                                                      \
1799     );                                                                         \
1800
1801 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1802  * TEST_UTF8 is a macro that for the same input code points returns identically
1803  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1804 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1805     if (s == reginfo->strbeg) {                                                \
1806         tmp = '\n';                                                            \
1807     }                                                                          \
1808     else { /* Back-up to the start of the previous character */                \
1809         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1810         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1811                                                        0, UTF8_ALLOW_DEFAULT); \
1812     }                                                                          \
1813     tmp = TEST_UV(tmp);                                                        \
1814     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1815     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1816         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1817             tmp = !tmp;                                                        \
1818             IF_SUCCESS;                                                        \
1819         }                                                                      \
1820         else {                                                                 \
1821             IF_FAIL;                                                           \
1822         }                                                                      \
1823     );
1824
1825 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1826  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1827  * macros below */
1828 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1829     if (utf8_target) {                                                         \
1830         UTF8_CODE                                                              \
1831     }                                                                          \
1832     else {  /* Not utf8 */                                                     \
1833         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1834         tmp = TEST_NON_UTF8(tmp);                                              \
1835         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1836             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1837                 IF_SUCCESS;                                                    \
1838                 tmp = !tmp;                                                    \
1839             }                                                                  \
1840             else {                                                             \
1841                 IF_FAIL;                                                       \
1842             }                                                                  \
1843         );                                                                     \
1844     }                                                                          \
1845     /* Here, things have been set up by the previous code so that tmp is the   \
1846      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1847      * utf8ness of the target).  We also have to check if this matches against \
1848      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1849      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1850      * string */                                                               \
1851     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1852         IF_SUCCESS;                                                            \
1853     }                                                                          \
1854     else {                                                                     \
1855         IF_FAIL;                                                               \
1856     }
1857
1858 /* This is the macro to use when we want to see if something that looks like it
1859  * could match, actually does, and if so exits the loop */
1860 #define REXEC_FBC_TRYIT                            \
1861     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1862         goto got_it
1863
1864 /* The only difference between the BOUND and NBOUND cases is that
1865  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1866  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1867  * with the other one being empty (PLACEHOLDER is defined as empty).
1868  *
1869  * The TEST_FOO parameters are for operating on different forms of input, but
1870  * all should be ones that return identically for the same underlying code
1871  * points */
1872 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1873     FBC_BOUND_COMMON(                                                          \
1874           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1875           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1876
1877 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1878     FBC_BOUND_COMMON(                                                          \
1879             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1880             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1881
1882 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1883     FBC_BOUND_COMMON(                                                          \
1884           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1885           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1886
1887 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1888     FBC_BOUND_COMMON(                                                          \
1889             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1890             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1891
1892 #ifdef DEBUGGING
1893 static IV
1894 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1895   IV cp_out = Perl__invlist_search(invlist, cp_in);
1896   assert(cp_out >= 0);
1897   return cp_out;
1898 }
1899 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1900         invmap[S_get_break_val_cp_checked(invlist, cp)]
1901 #else
1902 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1903         invmap[_invlist_search(invlist, cp)]
1904 #endif
1905
1906 /* Takes a pointer to an inversion list, a pointer to its corresponding
1907  * inversion map, and a code point, and returns the code point's value
1908  * according to the two arrays.  It assumes that all code points have a value.
1909  * This is used as the base macro for macros for particular properties */
1910 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1911         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1912
1913 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1914  * of a code point, returning the value for the first code point in the string.
1915  * And it takes the particular macro name that finds the desired value given a
1916  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1917 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1918              (__ASSERT_(pos < strend)                                          \
1919                  /* Note assumes is valid UTF-8 */                             \
1920              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1921
1922 /* Returns the GCB value for the input code point */
1923 #define getGCB_VAL_CP(cp)                                                      \
1924           _generic_GET_BREAK_VAL_CP(                                           \
1925                                     PL_GCB_invlist,                            \
1926                                     _Perl_GCB_invmap,                          \
1927                                     (cp))
1928
1929 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1930  * bounded by pos and strend */
1931 #define getGCB_VAL_UTF8(pos, strend)                                           \
1932     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1933
1934 /* Returns the LB value for the input code point */
1935 #define getLB_VAL_CP(cp)                                                       \
1936           _generic_GET_BREAK_VAL_CP(                                           \
1937                                     PL_LB_invlist,                             \
1938                                     _Perl_LB_invmap,                           \
1939                                     (cp))
1940
1941 /* Returns the LB value for the first code point in the UTF-8 encoded string
1942  * bounded by pos and strend */
1943 #define getLB_VAL_UTF8(pos, strend)                                            \
1944     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1945
1946
1947 /* Returns the SB value for the input code point */
1948 #define getSB_VAL_CP(cp)                                                       \
1949           _generic_GET_BREAK_VAL_CP(                                           \
1950                                     PL_SB_invlist,                             \
1951                                     _Perl_SB_invmap,                     \
1952                                     (cp))
1953
1954 /* Returns the SB value for the first code point in the UTF-8 encoded string
1955  * bounded by pos and strend */
1956 #define getSB_VAL_UTF8(pos, strend)                                            \
1957     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1958
1959 /* Returns the WB value for the input code point */
1960 #define getWB_VAL_CP(cp)                                                       \
1961           _generic_GET_BREAK_VAL_CP(                                           \
1962                                     PL_WB_invlist,                             \
1963                                     _Perl_WB_invmap,                         \
1964                                     (cp))
1965
1966 /* Returns the WB value for the first code point in the UTF-8 encoded string
1967  * bounded by pos and strend */
1968 #define getWB_VAL_UTF8(pos, strend)                                            \
1969     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1970
1971 /* We know what class REx starts with.  Try to find this position... */
1972 /* if reginfo->intuit, its a dryrun */
1973 /* annoyingly all the vars in this routine have different names from their counterparts
1974    in regmatch. /grrr */
1975 STATIC char *
1976 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1977     const char *strend, regmatch_info *reginfo)
1978 {
1979     dVAR;
1980     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1981     char *pat_string;   /* The pattern's exactish string */
1982     char *pat_end;          /* ptr to end char of pat_string */
1983     re_fold_t folder;   /* Function for computing non-utf8 folds */
1984     const U8 *fold_array;   /* array for folding ords < 256 */
1985     STRLEN ln;
1986     STRLEN lnc;
1987     U8 c1;
1988     U8 c2;
1989     char *e;
1990     I32 tmp = 1;        /* Scratch variable? */
1991     const bool utf8_target = reginfo->is_utf8_target;
1992     UV utf8_fold_flags = 0;
1993     const bool is_utf8_pat = reginfo->is_utf8_pat;
1994     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1995                                    with a result inverts that result, as 0^1 =
1996                                    1 and 1^1 = 0 */
1997     _char_class_number classnum;
1998
1999     RXi_GET_DECL(prog,progi);
2000
2001     PERL_ARGS_ASSERT_FIND_BYCLASS;
2002
2003     /* We know what class it must start with. */
2004     switch (OP(c)) {
2005     case ANYOFL:
2006         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2007
2008         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2009             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2010         }
2011
2012         /* FALLTHROUGH */
2013     case ANYOFD:
2014     case ANYOF:
2015         if (utf8_target) {
2016             REXEC_FBC_UTF8_CLASS_SCAN(
2017                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2018         }
2019         else if (ANYOF_FLAGS(c)) {
2020             REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2021         }
2022         else {
2023             REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2024         }
2025         break;
2026
2027     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
2028         assert(! is_utf8_pat);
2029         /* FALLTHROUGH */
2030     case EXACTFA:
2031         if (is_utf8_pat || utf8_target) {
2032             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2033             goto do_exactf_utf8;
2034         }
2035         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2036         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2037         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2038
2039     case EXACTF:   /* This node only generated for non-utf8 patterns */
2040         assert(! is_utf8_pat);
2041         if (utf8_target) {
2042             utf8_fold_flags = 0;
2043             goto do_exactf_utf8;
2044         }
2045         fold_array = PL_fold;
2046         folder = foldEQ;
2047         goto do_exactf_non_utf8;
2048
2049     case EXACTFL:
2050         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2051         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2052             utf8_fold_flags = FOLDEQ_LOCALE;
2053             goto do_exactf_utf8;
2054         }
2055         fold_array = PL_fold_locale;
2056         folder = foldEQ_locale;
2057         goto do_exactf_non_utf8;
2058
2059     case EXACTFU_SS:
2060         if (is_utf8_pat) {
2061             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2062         }
2063         goto do_exactf_utf8;
2064
2065     case EXACTFLU8:
2066             if (! utf8_target) {    /* All code points in this node require
2067                                        UTF-8 to express.  */
2068                 break;
2069             }
2070             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2071                                              | FOLDEQ_S2_FOLDS_SANE;
2072             goto do_exactf_utf8;
2073
2074     case EXACTFU:
2075         if (is_utf8_pat || utf8_target) {
2076             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2077             goto do_exactf_utf8;
2078         }
2079
2080         /* Any 'ss' in the pattern should have been replaced by regcomp,
2081          * so we don't have to worry here about this single special case
2082          * in the Latin1 range */
2083         fold_array = PL_fold_latin1;
2084         folder = foldEQ_latin1;
2085
2086         /* FALLTHROUGH */
2087
2088       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2089                            are no glitches with fold-length differences
2090                            between the target string and pattern */
2091
2092         /* The idea in the non-utf8 EXACTF* cases is to first find the
2093          * first character of the EXACTF* node and then, if necessary,
2094          * case-insensitively compare the full text of the node.  c1 is the
2095          * first character.  c2 is its fold.  This logic will not work for
2096          * Unicode semantics and the german sharp ss, which hence should
2097          * not be compiled into a node that gets here. */
2098         pat_string = STRING(c);
2099         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2100
2101         /* We know that we have to match at least 'ln' bytes (which is the
2102          * same as characters, since not utf8).  If we have to match 3
2103          * characters, and there are only 2 availabe, we know without
2104          * trying that it will fail; so don't start a match past the
2105          * required minimum number from the far end */
2106         e = HOP3c(strend, -((SSize_t)ln), s);
2107         if (e < s)
2108             break;
2109
2110         c1 = *pat_string;
2111         c2 = fold_array[c1];
2112         if (c1 == c2) { /* If char and fold are the same */
2113             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
2114         }
2115         else {
2116             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
2117         }
2118         break;
2119
2120       do_exactf_utf8:
2121       {
2122         unsigned expansion;
2123
2124         /* If one of the operands is in utf8, we can't use the simpler folding
2125          * above, due to the fact that many different characters can have the
2126          * same fold, or portion of a fold, or different- length fold */
2127         pat_string = STRING(c);
2128         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2129         pat_end = pat_string + ln;
2130         lnc = is_utf8_pat       /* length to match in characters */
2131                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2132                 : ln;
2133
2134         /* We have 'lnc' characters to match in the pattern, but because of
2135          * multi-character folding, each character in the target can match
2136          * up to 3 characters (Unicode guarantees it will never exceed
2137          * this) if it is utf8-encoded; and up to 2 if not (based on the
2138          * fact that the Latin 1 folds are already determined, and the
2139          * only multi-char fold in that range is the sharp-s folding to
2140          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2141          * string character.  Adjust lnc accordingly, rounding up, so that
2142          * if we need to match at least 4+1/3 chars, that really is 5. */
2143         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2144         lnc = (lnc + expansion - 1) / expansion;
2145
2146         /* As in the non-UTF8 case, if we have to match 3 characters, and
2147          * only 2 are left, it's guaranteed to fail, so don't start a
2148          * match that would require us to go beyond the end of the string
2149          */
2150         e = HOP3c(strend, -((SSize_t)lnc), s);
2151
2152         /* XXX Note that we could recalculate e to stop the loop earlier,
2153          * as the worst case expansion above will rarely be met, and as we
2154          * go along we would usually find that e moves further to the left.
2155          * This would happen only after we reached the point in the loop
2156          * where if there were no expansion we should fail.  Unclear if
2157          * worth the expense */
2158
2159         while (s <= e) {
2160             char *my_strend= (char *)strend;
2161             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2162                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2163                 && (reginfo->intuit || regtry(reginfo, &s)) )
2164             {
2165                 goto got_it;
2166             }
2167             s += (utf8_target) ? UTF8SKIP(s) : 1;
2168         }
2169         break;
2170     }
2171
2172     case BOUNDL:
2173         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2174         if (FLAGS(c) != TRADITIONAL_BOUND) {
2175             if (! IN_UTF8_CTYPE_LOCALE) {
2176                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2177                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2178             }
2179             goto do_boundu;
2180         }
2181
2182         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2183         break;
2184
2185     case NBOUNDL:
2186         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2187         if (FLAGS(c) != TRADITIONAL_BOUND) {
2188             if (! IN_UTF8_CTYPE_LOCALE) {
2189                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2190                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2191             }
2192             goto do_nboundu;
2193         }
2194
2195         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2196         break;
2197
2198     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2199                    meaning */
2200         assert(FLAGS(c) == TRADITIONAL_BOUND);
2201
2202         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2203         break;
2204
2205     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2206                    meaning */
2207         assert(FLAGS(c) == TRADITIONAL_BOUND);
2208
2209         FBC_BOUND_A(isWORDCHAR_A);
2210         break;
2211
2212     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2213                    meaning */
2214         assert(FLAGS(c) == TRADITIONAL_BOUND);
2215
2216         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2217         break;
2218
2219     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2220                    meaning */
2221         assert(FLAGS(c) == TRADITIONAL_BOUND);
2222
2223         FBC_NBOUND_A(isWORDCHAR_A);
2224         break;
2225
2226     case NBOUNDU:
2227         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2228             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2229             break;
2230         }
2231
2232       do_nboundu:
2233
2234         to_complement = 1;
2235         /* FALLTHROUGH */
2236
2237     case BOUNDU:
2238       do_boundu:
2239         switch((bound_type) FLAGS(c)) {
2240             case TRADITIONAL_BOUND:
2241                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2242                 break;
2243             case GCB_BOUND:
2244                 if (s == reginfo->strbeg) {
2245                     if (reginfo->intuit || regtry(reginfo, &s))
2246                     {
2247                         goto got_it;
2248                     }
2249
2250                     /* Didn't match.  Try at the next position (if there is one) */
2251                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2252                     if (UNLIKELY(s >= reginfo->strend)) {
2253                         break;
2254                     }
2255                 }
2256
2257                 if (utf8_target) {
2258                     GCB_enum before = getGCB_VAL_UTF8(
2259                                                reghop3((U8*)s, -1,
2260                                                        (U8*)(reginfo->strbeg)),
2261                                                (U8*) reginfo->strend);
2262                     while (s < strend) {
2263                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2264                                                         (U8*) reginfo->strend);
2265                         if (   (to_complement ^ isGCB(before,
2266                                                       after,
2267                                                       (U8*) reginfo->strbeg,
2268                                                       (U8*) s,
2269                                                       utf8_target))
2270                             && (reginfo->intuit || regtry(reginfo, &s)))
2271                         {
2272                             goto got_it;
2273                         }
2274                         before = after;
2275                         s += UTF8SKIP(s);
2276                     }
2277                 }
2278                 else {  /* Not utf8.  Everything is a GCB except between CR and
2279                            LF */
2280                     while (s < strend) {
2281                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2282                                               || UCHARAT(s) != '\n'))
2283                             && (reginfo->intuit || regtry(reginfo, &s)))
2284                         {
2285                             goto got_it;
2286                         }
2287                         s++;
2288                     }
2289                 }
2290
2291                 /* And, since this is a bound, it can match after the final
2292                  * character in the string */
2293                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2294                     goto got_it;
2295                 }
2296                 break;
2297
2298             case LB_BOUND:
2299                 if (s == reginfo->strbeg) {
2300                     if (reginfo->intuit || regtry(reginfo, &s)) {
2301                         goto got_it;
2302                     }
2303                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2304                     if (UNLIKELY(s >= reginfo->strend)) {
2305                         break;
2306                     }
2307                 }
2308
2309                 if (utf8_target) {
2310                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2311                                                                -1,
2312                                                                (U8*)(reginfo->strbeg)),
2313                                                        (U8*) reginfo->strend);
2314                     while (s < strend) {
2315                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2316                         if (to_complement ^ isLB(before,
2317                                                  after,
2318                                                  (U8*) reginfo->strbeg,
2319                                                  (U8*) s,
2320                                                  (U8*) reginfo->strend,
2321                                                  utf8_target)
2322                             && (reginfo->intuit || regtry(reginfo, &s)))
2323                         {
2324                             goto got_it;
2325                         }
2326                         before = after;
2327                         s += UTF8SKIP(s);
2328                     }
2329                 }
2330                 else {  /* Not utf8. */
2331                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2332                     while (s < strend) {
2333                         LB_enum after = getLB_VAL_CP((U8) *s);
2334                         if (to_complement ^ isLB(before,
2335                                                  after,
2336                                                  (U8*) reginfo->strbeg,
2337                                                  (U8*) s,
2338                                                  (U8*) reginfo->strend,
2339                                                  utf8_target)
2340                             && (reginfo->intuit || regtry(reginfo, &s)))
2341                         {
2342                             goto got_it;
2343                         }
2344                         before = after;
2345                         s++;
2346                     }
2347                 }
2348
2349                 if (reginfo->intuit || regtry(reginfo, &s)) {
2350                     goto got_it;
2351                 }
2352
2353                 break;
2354
2355             case SB_BOUND:
2356                 if (s == reginfo->strbeg) {
2357                     if (reginfo->intuit || regtry(reginfo, &s)) {
2358                         goto got_it;
2359                     }
2360                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2361                     if (UNLIKELY(s >= reginfo->strend)) {
2362                         break;
2363                     }
2364                 }
2365
2366                 if (utf8_target) {
2367                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2368                                                         -1,
2369                                                         (U8*)(reginfo->strbeg)),
2370                                                       (U8*) reginfo->strend);
2371                     while (s < strend) {
2372                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2373                                                          (U8*) reginfo->strend);
2374                         if ((to_complement ^ isSB(before,
2375                                                   after,
2376                                                   (U8*) reginfo->strbeg,
2377                                                   (U8*) s,
2378                                                   (U8*) reginfo->strend,
2379                                                   utf8_target))
2380                             && (reginfo->intuit || regtry(reginfo, &s)))
2381                         {
2382                             goto got_it;
2383                         }
2384                         before = after;
2385                         s += UTF8SKIP(s);
2386                     }
2387                 }
2388                 else {  /* Not utf8. */
2389                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2390                     while (s < strend) {
2391                         SB_enum after = getSB_VAL_CP((U8) *s);
2392                         if ((to_complement ^ isSB(before,
2393                                                   after,
2394                                                   (U8*) reginfo->strbeg,
2395                                                   (U8*) s,
2396                                                   (U8*) reginfo->strend,
2397                                                   utf8_target))
2398                             && (reginfo->intuit || regtry(reginfo, &s)))
2399                         {
2400                             goto got_it;
2401                         }
2402                         before = after;
2403                         s++;
2404                     }
2405                 }
2406
2407                 /* Here are at the final position in the target string.  The SB
2408                  * value is always true here, so matches, depending on other
2409                  * constraints */
2410                 if (reginfo->intuit || regtry(reginfo, &s)) {
2411                     goto got_it;
2412                 }
2413
2414                 break;
2415
2416             case WB_BOUND:
2417                 if (s == reginfo->strbeg) {
2418                     if (reginfo->intuit || regtry(reginfo, &s)) {
2419                         goto got_it;
2420                     }
2421                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2422                     if (UNLIKELY(s >= reginfo->strend)) {
2423                         break;
2424                     }
2425                 }
2426
2427                 if (utf8_target) {
2428                     /* We are at a boundary between char_sub_0 and char_sub_1.
2429                      * We also keep track of the value for char_sub_-1 as we
2430                      * loop through the line.   Context may be needed to make a
2431                      * determination, and if so, this can save having to
2432                      * recalculate it */
2433                     WB_enum previous = WB_UNKNOWN;
2434                     WB_enum before = getWB_VAL_UTF8(
2435                                               reghop3((U8*)s,
2436                                                       -1,
2437                                                       (U8*)(reginfo->strbeg)),
2438                                               (U8*) reginfo->strend);
2439                     while (s < strend) {
2440                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2441                                                         (U8*) reginfo->strend);
2442                         if ((to_complement ^ isWB(previous,
2443                                                   before,
2444                                                   after,
2445                                                   (U8*) reginfo->strbeg,
2446                                                   (U8*) s,
2447                                                   (U8*) reginfo->strend,
2448                                                   utf8_target))
2449                             && (reginfo->intuit || regtry(reginfo, &s)))
2450                         {
2451                             goto got_it;
2452                         }
2453                         previous = before;
2454                         before = after;
2455                         s += UTF8SKIP(s);
2456                     }
2457                 }
2458                 else {  /* Not utf8. */
2459                     WB_enum previous = WB_UNKNOWN;
2460                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2461                     while (s < strend) {
2462                         WB_enum after = getWB_VAL_CP((U8) *s);
2463                         if ((to_complement ^ isWB(previous,
2464                                                   before,
2465                                                   after,
2466                                                   (U8*) reginfo->strbeg,
2467                                                   (U8*) s,
2468                                                   (U8*) reginfo->strend,
2469                                                   utf8_target))
2470                             && (reginfo->intuit || regtry(reginfo, &s)))
2471                         {
2472                             goto got_it;
2473                         }
2474                         previous = before;
2475                         before = after;
2476                         s++;
2477                     }
2478                 }
2479
2480                 if (reginfo->intuit || regtry(reginfo, &s)) {
2481                     goto got_it;
2482                 }
2483         }
2484         break;
2485
2486     case LNBREAK:
2487         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2488                         is_LNBREAK_latin1_safe(s, strend)
2489         );
2490         break;
2491
2492     case ASCII:
2493         s = find_next_ascii(s, strend, utf8_target);
2494         if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2495             goto got_it;
2496         }
2497
2498         break;
2499
2500     case NASCII:
2501         s = find_next_non_ascii(s, strend, utf8_target);
2502         if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2503             goto got_it;
2504         }
2505
2506         break;
2507
2508     /* The argument to all the POSIX node types is the class number to pass to
2509      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2510
2511     case NPOSIXL:
2512         to_complement = 1;
2513         /* FALLTHROUGH */
2514
2515     case POSIXL:
2516         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2517         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2518                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2519         break;
2520
2521     case NPOSIXD:
2522         to_complement = 1;
2523         /* FALLTHROUGH */
2524
2525     case POSIXD:
2526         if (utf8_target) {
2527             goto posix_utf8;
2528         }
2529         goto posixa;
2530
2531     case NPOSIXA:
2532         if (utf8_target) {
2533             /* The complement of something that matches only ASCII matches all
2534              * non-ASCII, plus everything in ASCII that isn't in the class. */
2535             REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
2536                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2537             break;
2538         }
2539
2540         to_complement = 1;
2541         goto posixa;
2542
2543     case POSIXA:
2544         /* Don't need to worry about utf8, as it can match only a single
2545          * byte invariant character.  But we do anyway for performance reasons,
2546          * as otherwise we would have to examine all the continuation
2547          * characters */
2548         if (utf8_target) {
2549             REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
2550             break;
2551         }
2552
2553       posixa:
2554         REXEC_FBC_CLASS_SCAN(
2555                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2556         break;
2557
2558     case NPOSIXU:
2559         to_complement = 1;
2560         /* FALLTHROUGH */
2561
2562     case POSIXU:
2563         if (! utf8_target) {
2564             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2565                                                                     FLAGS(c))));
2566         }
2567         else {
2568
2569           posix_utf8:
2570             classnum = (_char_class_number) FLAGS(c);
2571             if (classnum < _FIRST_NON_SWASH_CC) {
2572                 while (s < strend) {
2573
2574                     /* We avoid loading in the swash as long as possible, but
2575                      * should we have to, we jump to a separate loop.  This
2576                      * extra 'if' statement is what keeps this code from being
2577                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2578                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2579                         goto found_above_latin1;
2580                     }
2581                     if ((UTF8_IS_INVARIANT(*s)
2582                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2583                                                                 classnum)))
2584                         || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
2585                             && to_complement ^ cBOOL(
2586                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2587                                                                       *(s + 1)),
2588                                               classnum))))
2589                     {
2590                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2591                             goto got_it;
2592                         else {
2593                             tmp = doevery;
2594                         }
2595                     }
2596                     else {
2597                         tmp = 1;
2598                     }
2599                     s += UTF8SKIP(s);
2600                 }
2601             }
2602             else switch (classnum) {    /* These classes are implemented as
2603                                            macros */
2604                 case _CC_ENUM_SPACE:
2605                     REXEC_FBC_UTF8_CLASS_SCAN(
2606                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2607                     break;
2608
2609                 case _CC_ENUM_BLANK:
2610                     REXEC_FBC_UTF8_CLASS_SCAN(
2611                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2612                     break;
2613
2614                 case _CC_ENUM_XDIGIT:
2615                     REXEC_FBC_UTF8_CLASS_SCAN(
2616                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2617                     break;
2618
2619                 case _CC_ENUM_VERTSPACE:
2620                     REXEC_FBC_UTF8_CLASS_SCAN(
2621                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2622                     break;
2623
2624                 case _CC_ENUM_CNTRL:
2625                     REXEC_FBC_UTF8_CLASS_SCAN(
2626                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2627                     break;
2628
2629                 default:
2630                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2631                     NOT_REACHED; /* NOTREACHED */
2632             }
2633         }
2634         break;
2635
2636       found_above_latin1:   /* Here we have to load a swash to get the result
2637                                for the current code point */
2638         if (! PL_utf8_swash_ptrs[classnum]) {
2639             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2640             PL_utf8_swash_ptrs[classnum] =
2641                     _core_swash_init("utf8",
2642                                      "",
2643                                      &PL_sv_undef, 1, 0,
2644                                      PL_XPosix_ptrs[classnum], &flags);
2645         }
2646
2647         /* This is a copy of the loop above for swash classes, though using the
2648          * FBC macro instead of being expanded out.  Since we've loaded the
2649          * swash, we don't have to check for that each time through the loop */
2650         REXEC_FBC_UTF8_CLASS_SCAN(
2651                 to_complement ^ cBOOL(_generic_utf8_safe(
2652                                       classnum,
2653                                       s,
2654                                       strend,
2655                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2656                                                   (U8 *) s, TRUE))));
2657         break;
2658
2659     case AHOCORASICKC:
2660     case AHOCORASICK:
2661         {
2662             DECL_TRIE_TYPE(c);
2663             /* what trie are we using right now */
2664             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2665             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2666             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2667
2668             const char *last_start = strend - trie->minlen;
2669 #ifdef DEBUGGING
2670             const char *real_start = s;
2671 #endif
2672             STRLEN maxlen = trie->maxlen;
2673             SV *sv_points;
2674             U8 **points; /* map of where we were in the input string
2675                             when reading a given char. For ASCII this
2676                             is unnecessary overhead as the relationship
2677                             is always 1:1, but for Unicode, especially
2678                             case folded Unicode this is not true. */
2679             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2680             U8 *bitmap=NULL;
2681
2682
2683             GET_RE_DEBUG_FLAGS_DECL;
2684
2685             /* We can't just allocate points here. We need to wrap it in
2686              * an SV so it gets freed properly if there is a croak while
2687              * running the match */
2688             ENTER;
2689             SAVETMPS;
2690             sv_points=newSV(maxlen * sizeof(U8 *));
2691             SvCUR_set(sv_points,
2692                 maxlen * sizeof(U8 *));
2693             SvPOK_on(sv_points);
2694             sv_2mortal(sv_points);
2695             points=(U8**)SvPV_nolen(sv_points );
2696             if ( trie_type != trie_utf8_fold
2697                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2698             {
2699                 if (trie->bitmap)
2700                     bitmap=(U8*)trie->bitmap;
2701                 else
2702                     bitmap=(U8*)ANYOF_BITMAP(c);
2703             }
2704             /* this is the Aho-Corasick algorithm modified a touch
2705                to include special handling for long "unknown char" sequences.
2706                The basic idea being that we use AC as long as we are dealing
2707                with a possible matching char, when we encounter an unknown char
2708                (and we have not encountered an accepting state) we scan forward
2709                until we find a legal starting char.
2710                AC matching is basically that of trie matching, except that when
2711                we encounter a failing transition, we fall back to the current
2712                states "fail state", and try the current char again, a process
2713                we repeat until we reach the root state, state 1, or a legal
2714                transition. If we fail on the root state then we can either
2715                terminate if we have reached an accepting state previously, or
2716                restart the entire process from the beginning if we have not.
2717
2718              */
2719             while (s <= last_start) {
2720                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2721                 U8 *uc = (U8*)s;
2722                 U16 charid = 0;
2723                 U32 base = 1;
2724                 U32 state = 1;
2725                 UV uvc = 0;
2726                 STRLEN len = 0;
2727                 STRLEN foldlen = 0;
2728                 U8 *uscan = (U8*)NULL;
2729                 U8 *leftmost = NULL;
2730 #ifdef DEBUGGING
2731                 U32 accepted_word= 0;
2732 #endif
2733                 U32 pointpos = 0;
2734
2735                 while ( state && uc <= (U8*)strend ) {
2736                     int failed=0;
2737                     U32 word = aho->states[ state ].wordnum;
2738
2739                     if( state==1 ) {
2740                         if ( bitmap ) {
2741                             DEBUG_TRIE_EXECUTE_r(
2742                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2743                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2744                                         (char *)uc, utf8_target, 0 );
2745                                     Perl_re_printf( aTHX_
2746                                         " Scanning for legal start char...\n");
2747                                 }
2748                             );
2749                             if (utf8_target) {
2750                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2751                                     uc += UTF8SKIP(uc);
2752                                 }
2753                             } else {
2754                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2755                                     uc++;
2756                                 }
2757                             }
2758                             s= (char *)uc;
2759                         }
2760                         if (uc >(U8*)last_start) break;
2761                     }
2762
2763                     if ( word ) {
2764                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2765                         if (!leftmost || lpos < leftmost) {
2766                             DEBUG_r(accepted_word=word);
2767                             leftmost= lpos;
2768                         }
2769                         if (base==0) break;
2770
2771                     }
2772                     points[pointpos++ % maxlen]= uc;
2773                     if (foldlen || uc < (U8*)strend) {
2774                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2775                                          widecharmap, uc,
2776                                          uscan, len, uvc, charid, foldlen,
2777                                          foldbuf, uniflags);
2778                         DEBUG_TRIE_EXECUTE_r({
2779                             dump_exec_pos( (char *)uc, c, strend,
2780                                         real_start, s, utf8_target, 0);
2781                             Perl_re_printf( aTHX_
2782                                 " Charid:%3u CP:%4" UVxf " ",
2783                                  charid, uvc);
2784                         });
2785                     }
2786                     else {
2787                         len = 0;
2788                         charid = 0;
2789                     }
2790
2791
2792                     do {
2793 #ifdef DEBUGGING
2794                         word = aho->states[ state ].wordnum;
2795 #endif
2796                         base = aho->states[ state ].trans.base;
2797
2798                         DEBUG_TRIE_EXECUTE_r({
2799                             if (failed)
2800                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2801                                     s,   utf8_target, 0 );
2802                             Perl_re_printf( aTHX_
2803                                 "%sState: %4" UVxf ", word=%" UVxf,
2804                                 failed ? " Fail transition to " : "",
2805                                 (UV)state, (UV)word);
2806                         });
2807                         if ( base ) {
2808                             U32 tmp;
2809                             I32 offset;
2810                             if (charid &&
2811                                  ( ((offset = base + charid
2812                                     - 1 - trie->uniquecharcount)) >= 0)
2813                                  && ((U32)offset < trie->lasttrans)
2814                                  && trie->trans[offset].check == state
2815                                  && (tmp=trie->trans[offset].next))
2816                             {
2817                                 DEBUG_TRIE_EXECUTE_r(
2818                                     Perl_re_printf( aTHX_ " - legal\n"));
2819                                 state = tmp;
2820                                 break;
2821                             }
2822                             else {
2823                                 DEBUG_TRIE_EXECUTE_r(
2824                                     Perl_re_printf( aTHX_ " - fail\n"));
2825                                 failed = 1;
2826                                 state = aho->fail[state];
2827                             }
2828                         }
2829                         else {
2830                             /* we must be accepting here */
2831                             DEBUG_TRIE_EXECUTE_r(
2832                                     Perl_re_printf( aTHX_ " - accepting\n"));
2833                             failed = 1;
2834                             break;
2835                         }
2836                     } while(state);
2837                     uc += len;
2838                     if (failed) {
2839                         if (leftmost)
2840                             break;
2841                         if (!state) state = 1;
2842                     }
2843                 }
2844                 if ( aho->states[ state ].wordnum ) {
2845                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2846                     if (!leftmost || lpos < leftmost) {
2847                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2848                         leftmost = lpos;
2849                     }
2850                 }
2851                 if (leftmost) {
2852                     s = (char*)leftmost;
2853                     DEBUG_TRIE_EXECUTE_r({
2854                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
2855                             (UV)accepted_word, (IV)(s - real_start)
2856                         );
2857                     });
2858                     if (reginfo->intuit || regtry(reginfo, &s)) {
2859                         FREETMPS;
2860                         LEAVE;
2861                         goto got_it;
2862                     }
2863                     s = HOPc(s,1);
2864                     DEBUG_TRIE_EXECUTE_r({
2865                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2866                     });
2867                 } else {
2868                     DEBUG_TRIE_EXECUTE_r(
2869                         Perl_re_printf( aTHX_ "No match.\n"));
2870                     break;
2871                 }
2872             }
2873             FREETMPS;
2874             LEAVE;
2875         }
2876         break;
2877     default:
2878         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2879     }
2880     return 0;
2881   got_it:
2882     return s;
2883 }
2884
2885 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2886  * flags have same meanings as with regexec_flags() */
2887
2888 static void
2889 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2890                             char *strbeg,
2891                             char *strend,
2892                             SV *sv,
2893                             U32 flags,
2894                             bool utf8_target)
2895 {
2896     struct regexp *const prog = ReANY(rx);
2897
2898     if (flags & REXEC_COPY_STR) {
2899 #ifdef PERL_ANY_COW
2900         if (SvCANCOW(sv)) {
2901             DEBUG_C(Perl_re_printf( aTHX_
2902                               "Copy on write: regexp capture, type %d\n",
2903                                     (int) SvTYPE(sv)));
2904             /* Create a new COW SV to share the match string and store
2905              * in saved_copy, unless the current COW SV in saved_copy
2906              * is valid and suitable for our purpose */
2907             if ((   prog->saved_copy
2908                  && SvIsCOW(prog->saved_copy)
2909                  && SvPOKp(prog->saved_copy)
2910                  && SvIsCOW(sv)
2911                  && SvPOKp(sv)
2912                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2913             {
2914                 /* just reuse saved_copy SV */
2915                 if (RXp_MATCH_COPIED(prog)) {
2916                     Safefree(prog->subbeg);
2917                     RXp_MATCH_COPIED_off(prog);
2918                 }
2919             }
2920             else {
2921                 /* create new COW SV to share string */
2922                 RXp_MATCH_COPY_FREE(prog);
2923                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2924             }
2925             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2926             assert (SvPOKp(prog->saved_copy));
2927             prog->sublen  = strend - strbeg;
2928             prog->suboffset = 0;
2929             prog->subcoffset = 0;
2930         } else
2931 #endif
2932         {
2933             SSize_t min = 0;
2934             SSize_t max = strend - strbeg;
2935             SSize_t sublen;
2936
2937             if (    (flags & REXEC_COPY_SKIP_POST)
2938                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2939                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2940             ) { /* don't copy $' part of string */
2941                 U32 n = 0;
2942                 max = -1;
2943                 /* calculate the right-most part of the string covered
2944                  * by a capture. Due to lookahead, this may be to
2945                  * the right of $&, so we have to scan all captures */
2946                 while (n <= prog->lastparen) {
2947                     if (prog->offs[n].end > max)
2948                         max = prog->offs[n].end;
2949                     n++;
2950                 }
2951                 if (max == -1)
2952                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2953                             ? prog->offs[0].start
2954                             : 0;
2955                 assert(max >= 0 && max <= strend - strbeg);
2956             }
2957
2958             if (    (flags & REXEC_COPY_SKIP_PRE)
2959                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2960                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2961             ) { /* don't copy $` part of string */
2962                 U32 n = 0;
2963                 min = max;
2964                 /* calculate the left-most part of the string covered
2965                  * by a capture. Due to lookbehind, this may be to
2966                  * the left of $&, so we have to scan all captures */
2967                 while (min && n <= prog->lastparen) {
2968                     if (   prog->offs[n].start != -1
2969                         && prog->offs[n].start < min)
2970                     {
2971                         min = prog->offs[n].start;
2972                     }
2973                     n++;
2974                 }
2975                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2976                     && min >  prog->offs[0].end
2977                 )
2978                     min = prog->offs[0].end;
2979
2980             }
2981
2982             assert(min >= 0 && min <= max && min <= strend - strbeg);
2983             sublen = max - min;
2984
2985             if (RXp_MATCH_COPIED(prog)) {
2986                 if (sublen > prog->sublen)
2987                     prog->subbeg =
2988                             (char*)saferealloc(prog->subbeg, sublen+1);
2989             }
2990             else
2991                 prog->subbeg = (char*)safemalloc(sublen+1);
2992             Copy(strbeg + min, prog->subbeg, sublen, char);
2993             prog->subbeg[sublen] = '\0';
2994             prog->suboffset = min;
2995             prog->sublen = sublen;
2996             RXp_MATCH_COPIED_on(prog);
2997         }
2998         prog->subcoffset = prog->suboffset;
2999         if (prog->suboffset && utf8_target) {
3000             /* Convert byte offset to chars.
3001              * XXX ideally should only compute this if @-/@+
3002              * has been seen, a la PL_sawampersand ??? */
3003
3004             /* If there's a direct correspondence between the
3005              * string which we're matching and the original SV,
3006              * then we can use the utf8 len cache associated with
3007              * the SV. In particular, it means that under //g,
3008              * sv_pos_b2u() will use the previously cached
3009              * position to speed up working out the new length of
3010              * subcoffset, rather than counting from the start of
3011              * the string each time. This stops
3012              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3013              * from going quadratic */
3014             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3015                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3016                                                 SV_GMAGIC|SV_CONST_RETURN);
3017             else
3018                 prog->subcoffset = utf8_length((U8*)strbeg,
3019                                     (U8*)(strbeg+prog->suboffset));
3020         }
3021     }
3022     else {
3023         RXp_MATCH_COPY_FREE(prog);
3024         prog->subbeg = strbeg;
3025         prog->suboffset = 0;
3026         prog->subcoffset = 0;
3027         prog->sublen = strend - strbeg;
3028     }
3029 }
3030
3031
3032
3033
3034 /*
3035  - regexec_flags - match a regexp against a string
3036  */
3037 I32
3038 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3039               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3040 /* stringarg: the point in the string at which to begin matching */
3041 /* strend:    pointer to null at end of string */
3042 /* strbeg:    real beginning of string */
3043 /* minend:    end of match must be >= minend bytes after stringarg. */
3044 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3045  *            itself is accessed via the pointers above */
3046 /* data:      May be used for some additional optimizations.
3047               Currently unused. */
3048 /* flags:     For optimizations. See REXEC_* in regexp.h */
3049
3050 {
3051     struct regexp *const prog = ReANY(rx);
3052     char *s;
3053     regnode *c;
3054     char *startpos;
3055     SSize_t minlen;             /* must match at least this many chars */
3056     SSize_t dontbother = 0;     /* how many characters not to try at end */
3057     const bool utf8_target = cBOOL(DO_UTF8(sv));
3058     I32 multiline;
3059     RXi_GET_DECL(prog,progi);
3060     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3061     regmatch_info *const reginfo = &reginfo_buf;
3062     regexp_paren_pair *swap = NULL;
3063     I32 oldsave;
3064     GET_RE_DEBUG_FLAGS_DECL;
3065
3066     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3067     PERL_UNUSED_ARG(data);
3068
3069     /* Be paranoid... */
3070     if (prog == NULL) {
3071         Perl_croak(aTHX_ "NULL regexp parameter");
3072     }
3073
3074     DEBUG_EXECUTE_r(
3075         debug_start_match(rx, utf8_target, stringarg, strend,
3076         "Matching");
3077     );
3078
3079     startpos = stringarg;
3080
3081     /* set these early as they may be used by the HOP macros below */
3082     reginfo->strbeg = strbeg;
3083     reginfo->strend = strend;
3084     reginfo->is_utf8_target = cBOOL(utf8_target);
3085
3086     if (prog->intflags & PREGf_GPOS_SEEN) {
3087         MAGIC *mg;
3088
3089         /* set reginfo->ganch, the position where \G can match */
3090
3091         reginfo->ganch =
3092             (flags & REXEC_IGNOREPOS)
3093             ? stringarg /* use start pos rather than pos() */
3094             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3095               /* Defined pos(): */
3096             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3097             : strbeg; /* pos() not defined; use start of string */
3098
3099         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3100             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3101
3102         /* in the presence of \G, we may need to start looking earlier in
3103          * the string than the suggested start point of stringarg:
3104          * if prog->gofs is set, then that's a known, fixed minimum
3105          * offset, such as
3106          * /..\G/:   gofs = 2
3107          * /ab|c\G/: gofs = 1
3108          * or if the minimum offset isn't known, then we have to go back
3109          * to the start of the string, e.g. /w+\G/
3110          */
3111
3112         if (prog->intflags & PREGf_ANCH_GPOS) {
3113             if (prog->gofs) {
3114                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3115                 if (!startpos ||
3116                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3117                 {
3118                     DEBUG_r(Perl_re_printf( aTHX_
3119                             "fail: ganch-gofs before earliest possible start\n"));
3120                     return 0;
3121                 }
3122             }
3123             else
3124                 startpos = reginfo->ganch;
3125         }
3126         else if (prog->gofs) {
3127             startpos = HOPBACKc(startpos, prog->gofs);
3128             if (!startpos)
3129                 startpos = strbeg;
3130         }
3131         else if (prog->intflags & PREGf_GPOS_FLOAT)
3132             startpos = strbeg;
3133     }
3134
3135     minlen = prog->minlen;
3136     if ((startpos + minlen) > strend || startpos < strbeg) {
3137         DEBUG_r(Perl_re_printf( aTHX_
3138                     "Regex match can't succeed, so not even tried\n"));
3139         return 0;
3140     }
3141
3142     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3143      * which will call destuctors to reset PL_regmatch_state, free higher
3144      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3145      * regmatch_info_aux_eval */
3146
3147     oldsave = PL_savestack_ix;
3148
3149     s = startpos;
3150
3151     if ((prog->extflags & RXf_USE_INTUIT)
3152         && !(flags & REXEC_CHECKED))
3153     {
3154         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3155                                     flags, NULL);
3156         if (!s)
3157             return 0;
3158
3159         if (prog->extflags & RXf_CHECK_ALL) {
3160             /* we can match based purely on the result of INTUIT.
3161              * Set up captures etc just for $& and $-[0]
3162              * (an intuit-only match wont have $1,$2,..) */
3163             assert(!prog->nparens);
3164
3165             /* s/// doesn't like it if $& is earlier than where we asked it to
3166              * start searching (which can happen on something like /.\G/) */
3167             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3168                     && (s < stringarg))
3169             {
3170                 /* this should only be possible under \G */
3171                 assert(prog->intflags & PREGf_GPOS_SEEN);
3172                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3173                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3174                 goto phooey;
3175             }
3176
3177             /* match via INTUIT shouldn't have any captures.
3178              * Let @-, @+, $^N know */
3179             prog->lastparen = prog->lastcloseparen = 0;
3180             RXp_MATCH_UTF8_set(prog, utf8_target);
3181             prog->offs[0].start = s - strbeg;
3182             prog->offs[0].end = utf8_target
3183                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3184                 : s - strbeg + prog->minlenret;
3185             if ( !(flags & REXEC_NOT_FIRST) )
3186                 S_reg_set_capture_string(aTHX_ rx,
3187                                         strbeg, strend,
3188                                         sv, flags, utf8_target);
3189
3190             return 1;
3191         }
3192     }
3193
3194     multiline = prog->extflags & RXf_PMf_MULTILINE;
3195
3196     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3197         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3198                               "String too short [regexec_flags]...\n"));
3199         goto phooey;
3200     }
3201
3202     /* Check validity of program. */
3203     if (UCHARAT(progi->program) != REG_MAGIC) {
3204         Perl_croak(aTHX_ "corrupted regexp program");
3205     }
3206
3207     RXp_MATCH_TAINTED_off(prog);
3208     RXp_MATCH_UTF8_set(prog, utf8_target);
3209
3210     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3211     reginfo->intuit = 0;
3212     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3213     reginfo->warned = FALSE;
3214     reginfo->sv = sv;
3215     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3216     /* see how far we have to get to not match where we matched before */
3217     reginfo->till = stringarg + minend;
3218
3219     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3220         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3221            S_cleanup_regmatch_info_aux has executed (registered by
3222            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3223            magic belonging to this SV.
3224            Not newSVsv, either, as it does not COW.
3225         */
3226         reginfo->sv = newSV(0);
3227         SvSetSV_nosteal(reginfo->sv, sv);
3228         SAVEFREESV(reginfo->sv);
3229     }
3230
3231     /* reserve next 2 or 3 slots in PL_regmatch_state:
3232      * slot N+0: may currently be in use: skip it
3233      * slot N+1: use for regmatch_info_aux struct
3234      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3235      * slot N+3: ready for use by regmatch()
3236      */
3237
3238     {
3239         regmatch_state *old_regmatch_state;
3240         regmatch_slab  *old_regmatch_slab;
3241         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3242
3243         /* on first ever match, allocate first slab */
3244         if (!PL_regmatch_slab) {
3245             Newx(PL_regmatch_slab, 1, regmatch_slab);
3246             PL_regmatch_slab->prev = NULL;
3247             PL_regmatch_slab->next = NULL;
3248             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3249         }
3250
3251         old_regmatch_state = PL_regmatch_state;
3252         old_regmatch_slab  = PL_regmatch_slab;
3253
3254         for (i=0; i <= max; i++) {
3255             if (i == 1)
3256                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3257             else if (i ==2)
3258                 reginfo->info_aux_eval =
3259                 reginfo->info_aux->info_aux_eval =
3260                             &(PL_regmatch_state->u.info_aux_eval);
3261
3262             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3263                 PL_regmatch_state = S_push_slab(aTHX);
3264         }
3265
3266         /* note initial PL_regmatch_state position; at end of match we'll
3267          * pop back to there and free any higher slabs */
3268
3269         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3270         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3271         reginfo->info_aux->poscache = NULL;
3272
3273         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3274
3275         if ((prog->extflags & RXf_EVAL_SEEN))
3276             S_setup_eval_state(aTHX_ reginfo);
3277         else
3278             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3279     }
3280
3281     /* If there is a "must appear" string, look for it. */
3282
3283     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3284         /* We have to be careful. If the previous successful match
3285            was from this regex we don't want a subsequent partially
3286            successful match to clobber the old results.
3287            So when we detect this possibility we add a swap buffer
3288            to the re, and switch the buffer each match. If we fail,
3289            we switch it back; otherwise we leave it swapped.
3290         */
3291         swap = prog->offs;
3292         /* do we need a save destructor here for eval dies? */
3293         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3294         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3295             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3296             0,
3297             PTR2UV(prog),
3298             PTR2UV(swap),
3299             PTR2UV(prog->offs)
3300         ));
3301     }
3302
3303     if (prog->recurse_locinput)
3304         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3305
3306     /* Simplest case: anchored match need be tried only once, or with
3307      * MBOL, only at the beginning of each line.
3308      *
3309      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3310      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3311      * match at the start of the string then it won't match anywhere else
3312      * either; while with /.*.../, if it doesn't match at the beginning,
3313      * the earliest it could match is at the start of the next line */
3314
3315     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3316         char *end;
3317
3318         if (regtry(reginfo, &s))
3319             goto got_it;
3320
3321         if (!(prog->intflags & PREGf_ANCH_MBOL))
3322             goto phooey;
3323
3324         /* didn't match at start, try at other newline positions */
3325
3326         if (minlen)
3327             dontbother = minlen - 1;
3328         end = HOP3c(strend, -dontbother, strbeg) - 1;
3329
3330         /* skip to next newline */
3331
3332         while (s <= end) { /* note it could be possible to match at the end of the string */
3333             /* NB: newlines are the same in unicode as they are in latin */
3334             if (*s++ != '\n')
3335                 continue;
3336             if (prog->check_substr || prog->check_utf8) {
3337             /* note that with PREGf_IMPLICIT, intuit can only fail
3338              * or return the start position, so it's of limited utility.
3339              * Nevertheless, I made the decision that the potential for
3340              * quick fail was still worth it - DAPM */
3341                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3342                 if (!s)
3343                     goto phooey;
3344             }
3345             if (regtry(reginfo, &s))
3346                 goto got_it;
3347         }
3348         goto phooey;
3349     } /* end anchored search */
3350
3351     if (prog->intflags & PREGf_ANCH_GPOS)
3352     {
3353         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3354         assert(prog->intflags & PREGf_GPOS_SEEN);
3355         /* For anchored \G, the only position it can match from is
3356          * (ganch-gofs); we already set startpos to this above; if intuit
3357          * moved us on from there, we can't possibly succeed */
3358         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3359         if (s == startpos && regtry(reginfo, &s))
3360             goto got_it;
3361         goto phooey;
3362     }
3363
3364     /* Messy cases:  unanchored match. */
3365     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3366         /* we have /x+whatever/ */
3367         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3368         char ch;
3369 #ifdef DEBUGGING
3370         int did_match = 0;
3371 #endif
3372         if (utf8_target) {
3373             if (! prog->anchored_utf8) {
3374                 to_utf8_substr(prog);
3375             }
3376             ch = SvPVX_const(prog->anchored_utf8)[0];
3377             REXEC_FBC_SCAN(
3378                 if (*s == ch) {
3379                     DEBUG_EXECUTE_r( did_match = 1 );
3380                     if (regtry(reginfo, &s)) goto got_it;
3381                     s += UTF8SKIP(s);
3382                     while (s < strend && *s == ch)
3383                         s += UTF8SKIP(s);
3384                 }
3385             );
3386
3387         }
3388         else {
3389             if (! prog->anchored_substr) {
3390                 if (! to_byte_substr(prog)) {
3391                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3392                 }
3393             }
3394             ch = SvPVX_const(prog->anchored_substr)[0];
3395             REXEC_FBC_SCAN(
3396                 if (*s == ch) {
3397                     DEBUG_EXECUTE_r( did_match = 1 );
3398                     if (regtry(reginfo, &s)) goto got_it;
3399                     s++;
3400                     while (s < strend && *s == ch)
3401                         s++;
3402                 }
3403             );
3404         }
3405         DEBUG_EXECUTE_r(if (!did_match)
3406                 Perl_re_printf( aTHX_
3407                                   "Did not find anchored character...\n")
3408                );
3409     }
3410     else if (prog->anchored_substr != NULL
3411               || prog->anchored_utf8 != NULL
3412               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3413                   && prog->float_max_offset < strend - s)) {
3414         SV *must;
3415         SSize_t back_max;
3416         SSize_t back_min;
3417         char *last;
3418         char *last1;            /* Last position checked before */
3419 #ifdef DEBUGGING
3420         int did_match = 0;
3421 #endif
3422         if (prog->anchored_substr || prog->anchored_utf8) {
3423             if (utf8_target) {
3424                 if (! prog->anchored_utf8) {
3425                     to_utf8_substr(prog);
3426                 }
3427                 must = prog->anchored_utf8;
3428             }
3429             else {
3430                 if (! prog->anchored_substr) {
3431                     if (! to_byte_substr(prog)) {
3432                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3433                     }
3434                 }
3435                 must = prog->anchored_substr;
3436             }
3437             back_max = back_min = prog->anchored_offset;
3438         } else {
3439             if (utf8_target) {
3440                 if (! prog->float_utf8) {
3441                     to_utf8_substr(prog);
3442                 }
3443                 must = prog->float_utf8;
3444             }
3445             else {
3446                 if (! prog->float_substr) {
3447                     if (! to_byte_substr(prog)) {
3448                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3449                     }
3450                 }
3451                 must = prog->float_substr;
3452             }
3453             back_max = prog->float_max_offset;
3454             back_min = prog->float_min_offset;
3455         }
3456
3457         if (back_min<0) {
3458             last = strend;
3459         } else {
3460             last = HOP3c(strend,        /* Cannot start after this */
3461                   -(SSize_t)(CHR_SVLEN(must)
3462                          - (SvTAIL(must) != 0) + back_min), strbeg);
3463         }
3464         if (s > reginfo->strbeg)
3465             last1 = HOPc(s, -1);
3466         else
3467             last1 = s - 1;      /* bogus */
3468
3469         /* XXXX check_substr already used to find "s", can optimize if
3470            check_substr==must. */
3471         dontbother = 0;
3472         strend = HOPc(strend, -dontbother);
3473         while ( (s <= last) &&
3474                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3475                                   (unsigned char*)strend, must,
3476                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3477             DEBUG_EXECUTE_r( did_match = 1 );
3478             if (HOPc(s, -back_max) > last1) {
3479                 last1 = HOPc(s, -back_min);
3480                 s = HOPc(s, -back_max);
3481             }
3482             else {
3483                 char * const t = (last1 >= reginfo->strbeg)
3484                                     ? HOPc(last1, 1) : last1 + 1;
3485
3486                 last1 = HOPc(s, -back_min);
3487                 s = t;
3488             }
3489             if (utf8_target) {
3490                 while (s <= last1) {
3491                     if (regtry(reginfo, &s))
3492                         goto got_it;
3493                     if (s >= last1) {
3494                         s++; /* to break out of outer loop */
3495                         break;
3496                     }
3497                     s += UTF8SKIP(s);
3498                 }
3499             }
3500             else {
3501                 while (s <= last1) {
3502                     if (regtry(reginfo, &s))
3503                         goto got_it;
3504                     s++;
3505                 }
3506             }
3507         }
3508         DEBUG_EXECUTE_r(if (!did_match) {
3509             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3510                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3511             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3512                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3513                                ? "anchored" : "floating"),
3514                 quoted, RE_SV_TAIL(must));
3515         });
3516         goto phooey;
3517     }
3518     else if ( (c = progi->regstclass) ) {
3519         if (minlen) {
3520             const OPCODE op = OP(progi->regstclass);
3521             /* don't bother with what can't match */
3522             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3523                 strend = HOPc(strend, -(minlen - 1));
3524         }
3525         DEBUG_EXECUTE_r({
3526             SV * const prop = sv_newmortal();
3527             regprop(prog, prop, c, reginfo, NULL);
3528             {
3529                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3530                     s,strend-s,PL_dump_re_max_len);
3531                 Perl_re_printf( aTHX_
3532                     "Matching stclass %.*s against %s (%d bytes)\n",
3533                     (int)SvCUR(prop), SvPVX_const(prop),
3534                      quoted, (int)(strend - s));
3535             }
3536         });
3537         if (find_byclass(prog, c, s, strend, reginfo))
3538             goto got_it;
3539         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3540     }
3541     else {
3542         dontbother = 0;
3543         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3544             /* Trim the end. */
3545             char *last= NULL;
3546             SV* float_real;
3547             STRLEN len;
3548             const char *little;
3549
3550             if (utf8_target) {
3551                 if (! prog->float_utf8) {
3552                     to_utf8_substr(prog);
3553                 }
3554                 float_real = prog->float_utf8;
3555             }
3556             else {
3557                 if (! prog->float_substr) {
3558                     if (! to_byte_substr(prog)) {
3559                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3560                     }
3561                 }
3562                 float_real = prog->float_substr;
3563             }
3564
3565             little = SvPV_const(float_real, len);
3566             if (SvTAIL(float_real)) {
3567                     /* This means that float_real contains an artificial \n on
3568                      * the end due to the presence of something like this:
3569                      * /foo$/ where we can match both "foo" and "foo\n" at the
3570                      * end of the string.  So we have to compare the end of the
3571                      * string first against the float_real without the \n and
3572                      * then against the full float_real with the string.  We
3573                      * have to watch out for cases where the string might be
3574                      * smaller than the float_real or the float_real without
3575                      * the \n. */
3576                     char *checkpos= strend - len;
3577                     DEBUG_OPTIMISE_r(
3578                         Perl_re_printf( aTHX_
3579                             "%sChecking for float_real.%s\n",
3580                             PL_colors[4], PL_colors[5]));
3581                     if (checkpos + 1 < strbeg) {
3582                         /* can't match, even if we remove the trailing \n
3583                          * string is too short to match */
3584                         DEBUG_EXECUTE_r(
3585                             Perl_re_printf( aTHX_
3586                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3587                                 PL_colors[4], PL_colors[5]));
3588                         goto phooey;
3589                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3590                         /* can match, the end of the string matches without the
3591                          * "\n" */
3592                         last = checkpos + 1;
3593                     } else if (checkpos < strbeg) {
3594                         /* cant match, string is too short when the "\n" is
3595                          * included */
3596                         DEBUG_EXECUTE_r(
3597                             Perl_re_printf( aTHX_
3598                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3599                                 PL_colors[4], PL_colors[5]));
3600                         goto phooey;
3601                     } else if (!multiline) {
3602                         /* non multiline match, so compare with the "\n" at the
3603                          * end of the string */
3604                         if (memEQ(checkpos, little, len)) {
3605                             last= checkpos;
3606                         } else {
3607                             DEBUG_EXECUTE_r(
3608                                 Perl_re_printf( aTHX_
3609                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3610                                     PL_colors[4], PL_colors[5]));
3611                             goto phooey;
3612                         }
3613                     } else {
3614                         /* multiline match, so we have to search for a place
3615                          * where the full string is located */
3616                         goto find_last;
3617                     }
3618             } else {
3619                   find_last:
3620                     if (len)
3621                         last = rninstr(s, strend, little, little + len);
3622                     else
3623                         last = strend;  /* matching "$" */
3624             }
3625             if (!last) {
3626                 /* at one point this block contained a comment which was
3627                  * probably incorrect, which said that this was a "should not
3628                  * happen" case.  Even if it was true when it was written I am
3629                  * pretty sure it is not anymore, so I have removed the comment
3630                  * and replaced it with this one. Yves */
3631                 DEBUG_EXECUTE_r(
3632                     Perl_re_printf( aTHX_
3633                         "%sString does not contain required substring, cannot match.%s\n",
3634                         PL_colors[4], PL_colors[5]
3635                     ));
3636                 goto phooey;
3637             }
3638             dontbother = strend - last + prog->float_min_offset;
3639         }
3640         if (minlen && (dontbother < minlen))
3641             dontbother = minlen - 1;
3642         strend -= dontbother;              /* this one's always in bytes! */
3643         /* We don't know much -- general case. */
3644         if (utf8_target) {
3645             for (;;) {
3646                 if (regtry(reginfo, &s))
3647                     goto got_it;
3648                 if (s >= strend)
3649                     break;
3650                 s += UTF8SKIP(s);
3651             };
3652         }
3653         else {
3654             do {
3655                 if (regtry(reginfo, &s))
3656                     goto got_it;
3657             } while (s++ < strend);
3658         }
3659     }
3660
3661     /* Failure. */
3662     goto phooey;
3663
3664   got_it:
3665     /* s/// doesn't like it if $& is earlier than where we asked it to
3666      * start searching (which can happen on something like /.\G/) */
3667     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3668             && (prog->offs[0].start < stringarg - strbeg))
3669     {
3670         /* this should only be possible under \G */
3671         assert(prog->intflags & PREGf_GPOS_SEEN);
3672         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3673             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3674         goto phooey;
3675     }
3676
3677     DEBUG_BUFFERS_r(
3678         if (swap)
3679             Perl_re_exec_indentf( aTHX_
3680                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3681                 0,
3682                 PTR2UV(prog),
3683                 PTR2UV(swap)
3684             );
3685     );
3686     Safefree(swap);
3687
3688     /* clean up; this will trigger destructors that will free all slabs
3689      * above the current one, and cleanup the regmatch_info_aux
3690      * and regmatch_info_aux_eval sructs */
3691
3692     LEAVE_SCOPE(oldsave);
3693
3694     if (RXp_PAREN_NAMES(prog))
3695         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3696
3697     /* make sure $`, $&, $', and $digit will work later */
3698     if ( !(flags & REXEC_NOT_FIRST) )
3699         S_reg_set_capture_string(aTHX_ rx,
3700                                     strbeg, reginfo->strend,
3701                                     sv, flags, utf8_target);
3702
3703     return 1;
3704
3705   phooey:
3706     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3707                           PL_colors[4], PL_colors[5]));
3708
3709     /* clean up; this will trigger destructors that will free all slabs
3710      * above the current one, and cleanup the regmatch_info_aux
3711      * and regmatch_info_aux_eval sructs */
3712
3713     LEAVE_SCOPE(oldsave);
3714
3715     if (swap) {
3716         /* we failed :-( roll it back */
3717         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3718             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3719             0,
3720             PTR2UV(prog),
3721             PTR2UV(prog->offs),
3722             PTR2UV(swap)
3723         ));
3724         Safefree(prog->offs);
3725         prog->offs = swap;
3726     }
3727     return 0;
3728 }
3729
3730
3731 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3732  * Do inc before dec, in case old and new rex are the same */
3733 #define SET_reg_curpm(Re2)                          \
3734     if (reginfo->info_aux_eval) {                   \
3735         (void)ReREFCNT_inc(Re2);                    \
3736         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3737         PM_SETRE((PL_reg_curpm), (Re2));            \
3738     }
3739
3740
3741 /*
3742  - regtry - try match at specific point
3743  */
3744 STATIC bool                     /* 0 failure, 1 success */
3745 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3746 {
3747     CHECKPOINT lastcp;
3748     REGEXP *const rx = reginfo->prog;
3749     regexp *const prog = ReANY(rx);
3750     SSize_t result;
3751 #ifdef DEBUGGING
3752     U32 depth = 0; /* used by REGCP_SET */
3753 #endif
3754     RXi_GET_DECL(prog,progi);
3755     GET_RE_DEBUG_FLAGS_DECL;
3756
3757     PERL_ARGS_ASSERT_REGTRY;
3758
3759     reginfo->cutpoint=NULL;
3760
3761     prog->offs[0].start = *startposp - reginfo->strbeg;
3762     prog->lastparen = 0;
3763     prog->lastcloseparen = 0;
3764
3765     /* XXXX What this code is doing here?!!!  There should be no need
3766        to do this again and again, prog->lastparen should take care of
3767        this!  --ilya*/
3768
3769     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3770      * Actually, the code in regcppop() (which Ilya may be meaning by
3771      * prog->lastparen), is not needed at all by the test suite
3772      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3773      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3774      * Meanwhile, this code *is* needed for the
3775      * above-mentioned test suite tests to succeed.  The common theme
3776      * on those tests seems to be returning null fields from matches.
3777      * --jhi updated by dapm */
3778
3779     /* After encountering a variant of the issue mentioned above I think
3780      * the point Ilya was making is that if we properly unwind whenever
3781      * we set lastparen to a smaller value then we should not need to do
3782      * this every time, only when needed. So if we have tests that fail if
3783      * we remove this, then it suggests somewhere else we are improperly
3784      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3785      * places it is called, and related regcp() routines. - Yves */
3786 #if 1
3787     if (prog->nparens) {
3788         regexp_paren_pair *pp = prog->offs;
3789         I32 i;
3790         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3791             ++pp;
3792             pp->start = -1;
3793             pp->end = -1;
3794         }
3795     }
3796 #endif
3797     REGCP_SET(lastcp);
3798     result = regmatch(reginfo, *startposp, progi->program + 1);
3799     if (result != -1) {
3800         prog->offs[0].end = result;
3801         return 1;
3802     }
3803     if (reginfo->cutpoint)
3804         *startposp= reginfo->cutpoint;
3805     REGCP_UNWIND(lastcp);
3806     return 0;
3807 }
3808
3809
3810 #define sayYES goto yes
3811 #define sayNO goto no
3812 #define sayNO_SILENT goto no_silent
3813
3814 /* we dont use STMT_START/END here because it leads to
3815    "unreachable code" warnings, which are bogus, but distracting. */
3816 #define CACHEsayNO \
3817     if (ST.cache_mask) \
3818        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3819     sayNO
3820
3821 /* this is used to determine how far from the left messages like
3822    'failed...' are printed in regexec.c. It should be set such that
3823    messages are inline with the regop output that created them.
3824 */
3825 #define REPORT_CODE_OFF 29
3826 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3827 #ifdef DEBUGGING
3828 int
3829 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3830 {
3831     va_list ap;
3832     int result;
3833     PerlIO *f= Perl_debug_log;
3834     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3835     va_start(ap, depth);
3836     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3837     result = PerlIO_vprintf(f, fmt, ap);
3838     va_end(ap);
3839     return result;
3840 }
3841 #endif /* DEBUGGING */
3842
3843
3844 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3845 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3846 #define CHRTEST_NOT_A_CP_1 -999
3847 #define CHRTEST_NOT_A_CP_2 -998
3848
3849 /* grab a new slab and return the first slot in it */
3850
3851 STATIC regmatch_state *
3852 S_push_slab(pTHX)
3853 {
3854     regmatch_slab *s = PL_regmatch_slab->next;
3855     if (!s) {
3856         Newx(s, 1, regmatch_slab);
3857         s->prev = PL_regmatch_slab;
3858         s->next = NULL;
3859         PL_regmatch_slab->next = s;
3860     }
3861     PL_regmatch_slab = s;
3862     return SLAB_FIRST(s);
3863 }
3864
3865
3866 /* push a new state then goto it */
3867
3868 #define PUSH_STATE_GOTO(state, node, input) \
3869     pushinput = input; \
3870     scan = node; \
3871     st->resume_state = state; \
3872     goto push_state;
3873
3874 /* push a new state with success backtracking, then goto it */
3875
3876 #define PUSH_YES_STATE_GOTO(state, node, input) \
3877     pushinput = input; \
3878     scan = node; \
3879     st->resume_state = state; \
3880     goto push_yes_state;
3881
3882
3883
3884
3885 /*
3886
3887 regmatch() - main matching routine
3888
3889 This is basically one big switch statement in a loop. We execute an op,
3890 set 'next' to point the next op, and continue. If we come to a point which
3891 we may need to backtrack to on failure such as (A|B|C), we push a
3892 backtrack state onto the backtrack stack. On failure, we pop the top
3893 state, and re-enter the loop at the state indicated. If there are no more
3894 states to pop, we return failure.
3895
3896 Sometimes we also need to backtrack on success; for example /A+/, where
3897 after successfully matching one A, we need to go back and try to
3898 match another one; similarly for lookahead assertions: if the assertion
3899 completes successfully, we backtrack to the state just before the assertion
3900 and then carry on.  In these cases, the pushed state is marked as
3901 'backtrack on success too'. This marking is in fact done by a chain of
3902 pointers, each pointing to the previous 'yes' state. On success, we pop to
3903 the nearest yes state, discarding any intermediate failure-only states.
3904 Sometimes a yes state is pushed just to force some cleanup code to be
3905 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3906 it to free the inner regex.
3907
3908 Note that failure backtracking rewinds the cursor position, while
3909 success backtracking leaves it alone.
3910
3911 A pattern is complete when the END op is executed, while a subpattern
3912 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3913 ops trigger the "pop to last yes state if any, otherwise return true"
3914 behaviour.
3915
3916 A common convention in this function is to use A and B to refer to the two
3917 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3918 the subpattern to be matched possibly multiple times, while B is the entire
3919 rest of the pattern. Variable and state names reflect this convention.
3920
3921 The states in the main switch are the union of ops and failure/success of
3922 substates associated with with that op.  For example, IFMATCH is the op
3923 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3924 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3925 successfully matched A and IFMATCH_A_fail is a state saying that we have
3926 just failed to match A. Resume states always come in pairs. The backtrack
3927 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3928 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3929 on success or failure.
3930
3931 The struct that holds a backtracking state is actually a big union, with
3932 one variant for each major type of op. The variable st points to the
3933 top-most backtrack struct. To make the code clearer, within each
3934 block of code we #define ST to alias the relevant union.
3935
3936 Here's a concrete example of a (vastly oversimplified) IFMATCH
3937 implementation:
3938
3939     switch (state) {
3940     ....
3941
3942 #define ST st->u.ifmatch
3943
3944     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3945         ST.foo = ...; // some state we wish to save
3946         ...
3947         // push a yes backtrack state with a resume value of
3948         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3949         // first node of A:
3950         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3951         // NOTREACHED
3952
3953     case IFMATCH_A: // we have successfully executed A; now continue with B
3954         next = B;
3955         bar = ST.foo; // do something with the preserved value
3956         break;
3957
3958     case IFMATCH_A_fail: // A failed, so the assertion failed
3959         ...;   // do some housekeeping, then ...
3960         sayNO; // propagate the failure
3961
3962 #undef ST
3963
3964     ...
3965     }
3966
3967 For any old-timers reading this who are familiar with the old recursive
3968 approach, the code above is equivalent to:
3969
3970     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3971     {
3972         int foo = ...
3973         ...
3974         if (regmatch(A)) {
3975             next = B;
3976             bar = foo;
3977             break;
3978         }
3979         ...;   // do some housekeeping, then ...
3980         sayNO; // propagate the failure
3981     }
3982
3983 The topmost backtrack state, pointed to by st, is usually free. If you
3984 want to claim it, populate any ST.foo fields in it with values you wish to
3985 save, then do one of
3986
3987         PUSH_STATE_GOTO(resume_state, node, newinput);
3988         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3989
3990 which sets that backtrack state's resume value to 'resume_state', pushes a
3991 new free entry to the top of the backtrack stack, then goes to 'node'.
3992 On backtracking, the free slot is popped, and the saved state becomes the
3993 new free state. An ST.foo field in this new top state can be temporarily
3994 accessed to retrieve values, but once the main loop is re-entered, it
3995 becomes available for reuse.
3996
3997 Note that the depth of the backtrack stack constantly increases during the
3998 left-to-right execution of the pattern, rather than going up and down with
3999 the pattern nesting. For example the stack is at its maximum at Z at the
4000 end of the pattern, rather than at X in the following:
4001
4002     /(((X)+)+)+....(Y)+....Z/
4003
4004 The only exceptions to this are lookahead/behind assertions and the cut,
4005 (?>A), which pop all the backtrack states associated with A before
4006 continuing.
4007
4008 Backtrack state structs are allocated in slabs of about 4K in size.
4009 PL_regmatch_state and st always point to the currently active state,
4010 and PL_regmatch_slab points to the slab currently containing
4011 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4012 allocated, and is never freed until interpreter destruction. When the slab
4013 is full, a new one is allocated and chained to the end. At exit from
4014 regmatch(), slabs allocated since entry are freed.
4015
4016 */
4017
4018
4019 #define DEBUG_STATE_pp(pp)                                  \
4020     DEBUG_STATE_r({                                         \
4021         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4022         Perl_re_printf( aTHX_                                           \
4023             "%*s" pp " %s%s%s%s%s\n",                       \
4024             INDENT_CHARS(depth), "",                        \
4025             PL_reg_name[st->resume_state],                  \
4026             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4027             ((st==yes_state) ? "Y" : ""),                   \
4028             ((st==mark_state) ? "M" : ""),                  \
4029             ((st==yes_state||st==mark_state) ? "]" : "")    \
4030         );                                                  \
4031     });
4032
4033
4034 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4035
4036 #ifdef DEBUGGING
4037
4038 STATIC void
4039 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4040     const char *start, const char *end, const char *blurb)
4041 {
4042     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4043
4044     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4045
4046     if (!PL_colorset)
4047             reginitcolors();
4048     {
4049         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4050             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4051
4052         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4053             start, end - start, PL_dump_re_max_len);
4054
4055         Perl_re_printf( aTHX_
4056             "%s%s REx%s %s against %s\n",
4057                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4058
4059         if (utf8_target||utf8_pat)
4060             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4061                 utf8_pat ? "pattern" : "",
4062                 utf8_pat && utf8_target ? " and " : "",
4063                 utf8_target ? "string" : ""
4064             );
4065     }
4066 }
4067
4068 STATIC void
4069 S_dump_exec_pos(pTHX_ const char *locinput,
4070                       const regnode *scan,
4071                       const char *loc_regeol,
4072                       const char *loc_bostr,
4073                       const char *loc_reg_starttry,
4074                       const bool utf8_target,
4075                       const U32 depth
4076                 )
4077 {
4078     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4079     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4080     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4081     /* The part of the string before starttry has one color
4082        (pref0_len chars), between starttry and current
4083        position another one (pref_len - pref0_len chars),
4084        after the current position the third one.
4085        We assume that pref0_len <= pref_len, otherwise we
4086        decrease pref0_len.  */
4087     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4088         ? (5 + taill) - l : locinput - loc_bostr;
4089     int pref0_len;
4090
4091     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4092
4093     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4094         pref_len++;
4095     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4096     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4097         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4098               ? (5 + taill) - pref_len : loc_regeol - locinput);
4099     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4100         l--;
4101     if (pref0_len < 0)
4102         pref0_len = 0;
4103     if (pref0_len > pref_len)
4104         pref0_len = pref_len;
4105     {
4106         const int is_uni = utf8_target ? 1 : 0;
4107
4108         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4109             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4110
4111         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4112                     (locinput - pref_len + pref0_len),
4113                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4114
4115         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4116                     locinput, loc_regeol - locinput, 10, 0, 1);
4117
4118         const STRLEN tlen=len0+len1+len2;
4119         Perl_re_printf( aTHX_
4120                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4121                     (IV)(locinput - loc_bostr),
4122                     len0, s0,
4123                     len1, s1,
4124                     (docolor ? "" : "> <"),
4125                     len2, s2,
4126                     (int)(tlen > 19 ? 0 :  19 - tlen),
4127                     "",
4128                     depth);
4129     }
4130 }
4131
4132 #endif
4133
4134 /* reg_check_named_buff_matched()
4135  * Checks to see if a named buffer has matched. The data array of
4136  * buffer numbers corresponding to the buffer is expected to reside
4137  * in the regexp->data->data array in the slot stored in the ARG() of
4138  * node involved. Note that this routine doesn't actually care about the
4139  * name, that information is not preserved from compilation to execution.
4140  * Returns the index of the leftmost defined buffer with the given name
4141  * or 0 if non of the buffers matched.
4142  */
4143 STATIC I32
4144 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4145 {
4146     I32 n;
4147     RXi_GET_DECL(rex,rexi);
4148     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4149     I32 *nums=(I32*)SvPVX(sv_dat);
4150
4151     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4152
4153     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4154         if ((I32)rex->lastparen >= nums[n] &&
4155             rex->offs[nums[n]].end != -1)
4156         {
4157             return nums[n];
4158         }
4159     }
4160     return 0;
4161 }
4162
4163
4164 static bool
4165 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4166         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4167 {
4168     /* This function determines if there are one or two characters that match
4169      * the first character of the passed-in EXACTish node <text_node>, and if
4170      * so, returns them in the passed-in pointers.
4171      *
4172      * If it determines that no possible character in the target string can
4173      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4174      * the first character in <text_node> requires UTF-8 to represent, and the
4175      * target string isn't in UTF-8.)
4176      *
4177      * If there are more than two characters that could match the beginning of
4178      * <text_node>, or if more context is required to determine a match or not,
4179      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4180      *
4181      * The motiviation behind this function is to allow the caller to set up
4182      * tight loops for matching.  If <text_node> is of type EXACT, there is
4183      * only one possible character that can match its first character, and so
4184      * the situation is quite simple.  But things get much more complicated if
4185      * folding is involved.  It may be that the first character of an EXACTFish
4186      * node doesn't participate in any possible fold, e.g., punctuation, so it
4187      * can be matched only by itself.  The vast majority of characters that are
4188      * in folds match just two things, their lower and upper-case equivalents.
4189      * But not all are like that; some have multiple possible matches, or match
4190      * sequences of more than one character.  This function sorts all that out.
4191      *
4192      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4193      * loop of trying to match A*, we know we can't exit where the thing
4194      * following it isn't a B.  And something can't be a B unless it is the
4195      * beginning of B.  By putting a quick test for that beginning in a tight
4196      * loop, we can rule out things that can't possibly be B without having to
4197      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4198      * character, we can make a tight loop matching A*, using the outputs of
4199      * this function.
4200      *
4201      * If the target string to match isn't in UTF-8, and there aren't
4202      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4203      * the one or two possible octets (which are characters in this situation)
4204      * that can match.  In all cases, if there is only one character that can
4205      * match, *<c1p> and *<c2p> will be identical.
4206      *
4207      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4208      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4209      * can match the beginning of <text_node>.  They should be declared with at
4210      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4211      * undefined what these contain.)  If one or both of the buffers are
4212      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4213      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4214      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4215      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4216      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4217
4218     const bool utf8_target = reginfo->is_utf8_target;
4219
4220     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4221     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4222     bool use_chrtest_void = FALSE;
4223     const bool is_utf8_pat = reginfo->is_utf8_pat;
4224
4225     /* Used when we have both utf8 input and utf8 output, to avoid converting
4226      * to/from code points */
4227     bool utf8_has_been_setup = FALSE;
4228
4229     dVAR;
4230
4231     U8 *pat = (U8*)STRING(text_node);
4232     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4233
4234     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4235
4236         /* In an exact node, only one thing can be matched, that first
4237          * character.  If both the pat and the target are UTF-8, we can just
4238          * copy the input to the output, avoiding finding the code point of
4239          * that character */
4240         if (!is_utf8_pat) {
4241             c2 = c1 = *pat;
4242         }
4243         else if (utf8_target) {
4244             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4245             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4246             utf8_has_been_setup = TRUE;
4247         }
4248         else {
4249             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4250         }
4251     }
4252     else { /* an EXACTFish node */
4253         U8 *pat_end = pat + STR_LEN(text_node);
4254
4255         /* An EXACTFL node has at least some characters unfolded, because what
4256          * they match is not known until now.  So, now is the time to fold
4257          * the first few of them, as many as are needed to determine 'c1' and
4258          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4259          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4260          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4261          * need to fold as many characters as a single character can fold to,
4262          * so that later we can check if the first ones are such a multi-char
4263          * fold.  But, in such a pattern only locale-problematic characters
4264          * aren't folded, so we can skip this completely if the first character
4265          * in the node isn't one of the tricky ones */
4266         if (OP(text_node) == EXACTFL) {
4267
4268             if (! is_utf8_pat) {
4269                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4270                 {
4271                     folded[0] = folded[1] = 's';
4272                     pat = folded;
4273                     pat_end = folded + 2;
4274                 }
4275             }
4276             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4277                 U8 *s = pat;
4278                 U8 *d = folded;
4279                 int i;
4280
4281                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4282                     if (isASCII(*s)) {
4283                         *(d++) = (U8) toFOLD_LC(*s);
4284                         s++;
4285                     }
4286                     else {
4287                         STRLEN len;
4288                         _toFOLD_utf8_flags(s,
4289                                            pat_end,
4290                                            d,
4291                                            &len,
4292                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4293                         d += len;
4294                         s += UTF8SKIP(s);
4295                     }
4296                 }
4297
4298                 pat = folded;
4299                 pat_end = d;
4300             }
4301         }
4302
4303         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4304              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4305         {
4306             /* Multi-character folds require more context to sort out.  Also
4307              * PL_utf8_foldclosures used below doesn't handle them, so have to
4308              * be handled outside this routine */
4309             use_chrtest_void = TRUE;
4310         }
4311         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4312             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4313             if (c1 > 255) {
4314                 /* Load the folds hash, if not already done */
4315                 SV** listp;
4316                 if (! PL_utf8_foldclosures) {
4317                     _load_PL_utf8_foldclosures();
4318                 }
4319
4320                 /* The fold closures data structure is a hash with the keys
4321                  * being the UTF-8 of every character that is folded to, like
4322                  * 'k', and the values each an array of all code points that
4323                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4324                  * Multi-character folds are not included */
4325                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4326                                         (char *) pat,
4327                                         UTF8SKIP(pat),
4328                                         FALSE))))
4329                 {
4330                     /* Not found in the hash, therefore there are no folds
4331                     * containing it, so there is only a single character that
4332                     * could match */
4333                     c2 = c1;
4334                 }
4335                 else {  /* Does participate in folds */
4336                     AV* list = (AV*) *listp;
4337                     if (av_tindex_skip_len_mg(list) != 1) {
4338
4339                         /* If there aren't exactly two folds to this, it is
4340                          * outside the scope of this function */
4341                         use_chrtest_void = TRUE;
4342                     }
4343                     else {  /* There are two.  Get them */
4344                         SV** c_p = av_fetch(list, 0, FALSE);
4345                         if (c_p == NULL) {
4346                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4347                         }
4348                         c1 = SvUV(*c_p);
4349
4350                         c_p = av_fetch(list, 1, FALSE);
4351                         if (c_p == NULL) {
4352                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4353                         }
4354                         c2 = SvUV(*c_p);
4355
4356                         /* Folds that cross the 255/256 boundary are forbidden
4357                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4358                          * one is ASCIII.  Since the pattern character is above
4359                          * 255, and its only other match is below 256, the only
4360                          * legal match will be to itself.  We have thrown away
4361                          * the original, so have to compute which is the one
4362                          * above 255. */
4363                         if ((c1 < 256) != (c2 < 256)) {
4364                             if ((OP(text_node) == EXACTFL
4365                                  && ! IN_UTF8_CTYPE_LOCALE)
4366                                 || ((OP(text_node) == EXACTFA
4367                                     || OP(text_node) == EXACTFA_NO_TRIE)
4368                                     && (isASCII(c1) || isASCII(c2))))
4369                             {
4370                                 if (c1 < 256) {
4371                                     c1 = c2;
4372                                 }
4373                                 else {
4374                                     c2 = c1;
4375                                 }
4376                             }
4377                         }
4378                     }
4379                 }
4380             }
4381             else /* Here, c1 is <= 255 */
4382                 if (utf8_target
4383                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4384                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4385                     && ((OP(text_node) != EXACTFA
4386                         && OP(text_node) != EXACTFA_NO_TRIE)
4387                         || ! isASCII(c1)))
4388             {
4389                 /* Here, there could be something above Latin1 in the target
4390                  * which folds to this character in the pattern.  All such
4391                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4392                  * than two characters involved in their folds, so are outside
4393                  * the scope of this function */
4394                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4395                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4396                 }
4397                 else {
4398                     use_chrtest_void = TRUE;
4399                 }
4400             }
4401             else { /* Here nothing above Latin1 can fold to the pattern
4402                       character */
4403                 switch (OP(text_node)) {
4404
4405                     case EXACTFL:   /* /l rules */
4406                         c2 = PL_fold_locale[c1];
4407                         break;
4408
4409                     case EXACTF:   /* This node only generated for non-utf8
4410                                     patterns */
4411                         assert(! is_utf8_pat);
4412                         if (! utf8_target) {    /* /d rules */
4413                             c2 = PL_fold[c1];
4414                             break;
4415                         }
4416                         /* FALLTHROUGH */
4417                         /* /u rules for all these.  This happens to work for
4418                         * EXACTFA as nothing in Latin1 folds to ASCII */
4419                     case EXACTFA_NO_TRIE:   /* This node only generated for
4420                                             non-utf8 patterns */
4421                         assert(! is_utf8_pat);
4422                         /* FALLTHROUGH */
4423                     case EXACTFA:
4424                     case EXACTFU_SS:
4425                     case EXACTFU:
4426                         c2 = PL_fold_latin1[c1];
4427                         break;
4428
4429                     default:
4430                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4431                         NOT_REACHED; /* NOTREACHED */
4432                 }
4433             }
4434         }
4435     }
4436
4437     /* Here have figured things out.  Set up the returns */
4438     if (use_chrtest_void) {
4439         *c2p = *c1p = CHRTEST_VOID;
4440     }
4441     else if (utf8_target) {
4442         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4443             uvchr_to_utf8(c1_utf8, c1);
4444             uvchr_to_utf8(c2_utf8, c2);
4445         }
4446
4447         /* Invariants are stored in both the utf8 and byte outputs; Use
4448          * negative numbers otherwise for the byte ones.  Make sure that the
4449          * byte ones are the same iff the utf8 ones are the same */
4450         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4451         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4452                 ? *c2_utf8
4453                 : (c1 == c2)
4454                   ? CHRTEST_NOT_A_CP_1
4455                   : CHRTEST_NOT_A_CP_2;
4456     }
4457     else if (c1 > 255) {
4458        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4459                            can represent */
4460            return FALSE;
4461        }
4462
4463        *c1p = *c2p = c2;    /* c2 is the only representable value */
4464     }
4465     else {  /* c1 is representable; see about c2 */
4466        *c1p = c1;
4467        *c2p = (c2 < 256) ? c2 : c1;
4468     }
4469
4470     return TRUE;
4471 }
4472
4473 STATIC bool
4474 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4475 {
4476     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4477      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4478
4479     PERL_ARGS_ASSERT_ISGCB;
4480
4481     switch (GCB_table[before][after]) {
4482         case GCB_BREAKABLE:
4483             return TRUE;
4484
4485         case GCB_NOBREAK:
4486             return FALSE;
4487
4488         case GCB_RI_then_RI:
4489             {
4490                 int RI_count = 1;
4491                 U8 * temp_pos = (U8 *) curpos;
4492
4493                 /* Do not break within emoji flag sequences. That is, do not
4494                  * break between regional indicator (RI) symbols if there is an
4495                  * odd number of RI characters before the break point.
4496                  *  GB12   sot (RI RI)* RI × RI
4497                  *  GB13 [^RI] (RI RI)* RI × RI */
4498
4499                 while (backup_one_GCB(strbeg,
4500                                     &temp_pos,
4501                                     utf8_target) == GCB_Regional_Indicator)
4502                 {
4503                     RI_count++;
4504                 }
4505
4506                 return RI_count % 2 != 1;
4507             }
4508
4509         case GCB_EX_then_EM:
4510
4511             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4512             {
4513                 U8 * temp_pos = (U8 *) curpos;
4514                 GCB_enum prev;
4515
4516                 do {
4517                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4518                 }
4519                 while (prev == GCB_Extend);
4520
4521                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4522             }
4523
4524         default:
4525             break;
4526     }
4527
4528 #ifdef DEBUGGING
4529     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4530                                   before, after, GCB_table[before][after]);
4531     assert(0);
4532 #endif
4533     return TRUE;
4534 }
4535
4536 STATIC GCB_enum
4537 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4538 {
4539     GCB_enum gcb;
4540
4541     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4542
4543     if (*curpos < strbeg) {
4544         return GCB_EDGE;
4545     }
4546
4547     if (utf8_target) {
4548         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4549         U8 * prev_prev_char_pos;
4550
4551         if (! prev_char_pos) {
4552             return GCB_EDGE;
4553         }
4554
4555         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4556             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4557             *curpos = prev_char_pos;
4558             prev_char_pos = prev_prev_char_pos;
4559         }
4560         else {
4561             *curpos = (U8 *) strbeg;
4562             return GCB_EDGE;
4563         }
4564     }
4565     else {
4566         if (*curpos - 2 < strbeg) {
4567             *curpos = (U8 *) strbeg;
4568             return GCB_EDGE;
4569         }
4570         (*curpos)--;
4571         gcb = getGCB_VAL_CP(*(*curpos - 1));
4572     }
4573
4574     return gcb;
4575 }
4576
4577 /* Combining marks attach to most classes that precede them, but this defines
4578  * the exceptions (from TR14) */
4579 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4580                                      || prev == LB_Mandatory_Break      \
4581                                      || prev == LB_Carriage_Return      \
4582                                      || prev == LB_Line_Feed            \
4583                                      || prev == LB_Next_Line            \
4584                                      || prev == LB_Space                \
4585                                      || prev == LB_ZWSpace))
4586
4587 STATIC bool
4588 S_isLB(pTHX_ LB_enum before,
4589              LB_enum after,
4590              const U8 * const strbeg,
4591              const U8 * const curpos,
4592              const U8 * const strend,
4593              const bool utf8_target)
4594 {
4595     U8 * temp_pos = (U8 *) curpos;
4596     LB_enum prev = before;
4597
4598     /* Is the boundary between 'before' and 'after' line-breakable?
4599      * Most of this is just a table lookup of a generated table from Unicode
4600      * rules.  But some rules require context to decide, and so have to be
4601      * implemented in code */
4602
4603     PERL_ARGS_ASSERT_ISLB;
4604
4605     /* Rule numbers in the comments below are as of Unicode 9.0 */
4606
4607   redo:
4608     before = prev;
4609     switch (LB_table[before][after]) {
4610         case LB_BREAKABLE:
4611             return TRUE;
4612
4613         case LB_NOBREAK:
4614         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4615             return FALSE;
4616
4617         case LB_SP_foo + LB_BREAKABLE:
4618         case LB_SP_foo + LB_NOBREAK:
4619         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4620
4621             /* When we have something following a SP, we have to look at the
4622              * context in order to know what to do.
4623              *
4624              * SP SP should not reach here because LB7: Do not break before
4625              * spaces.  (For two spaces in a row there is nothing that
4626              * overrides that) */
4627             assert(after != LB_Space);
4628
4629             /* Here we have a space followed by a non-space.  Mostly this is a
4630              * case of LB18: "Break after spaces".  But there are complications
4631              * as the handling of spaces is somewhat tricky.  They are in a
4632              * number of rules, which have to be applied in priority order, but
4633              * something earlier in the string can cause a rule to be skipped
4634              * and a lower priority rule invoked.  A prime example is LB7 which
4635              * says don't break before a space.  But rule LB8 (lower priority)
4636              * says that the first break opportunity after a ZW is after any
4637              * span of spaces immediately after it.  If a ZW comes before a SP
4638              * in the input, rule LB8 applies, and not LB7.  Other such rules
4639              * involve combining marks which are rules 9 and 10, but they may
4640              * override higher priority rules if they come earlier in the
4641              * string.  Since we're doing random access into the middle of the
4642              * string, we have to look for rules that should get applied based
4643              * on both string position and priority.  Combining marks do not
4644              * attach to either ZW nor SP, so we don't have to consider them
4645              * until later.
4646              *
4647              * To check for LB8, we have to find the first non-space character
4648              * before this span of spaces */
4649             do {
4650                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4651             }
4652             while (prev == LB_Space);
4653
4654             /* LB8 Break before any character following a zero-width space,
4655              * even if one or more spaces intervene.
4656              *      ZW SP* ÷
4657              * So if we have a ZW just before this span, and to get here this
4658              * is the final space in the span. */
4659             if (prev == LB_ZWSpace) {
4660                 return TRUE;
4661             }
4662
4663             /* Here, not ZW SP+.  There are several rules that have higher
4664              * priority than LB18 and can be resolved now, as they don't depend
4665              * on anything earlier in the string (except ZW, which we have
4666              * already handled).  One of these rules is LB11 Do not break
4667              * before Word joiner, but we have specially encoded that in the
4668              * lookup table so it is caught by the single test below which
4669              * catches the other ones. */
4670             if (LB_table[LB_Space][after] - LB_SP_foo
4671                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4672             {
4673                 return FALSE;
4674             }
4675
4676             /* If we get here, we have to XXX consider combining marks. */
4677             if (prev == LB_Combining_Mark) {
4678
4679                 /* What happens with these depends on the character they
4680                  * follow.  */
4681                 do {
4682                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4683                 }
4684                 while (prev == LB_Combining_Mark);
4685
4686                 /* Most times these attach to and inherit the characteristics
4687                  * of that character, but not always, and when not, they are to
4688                  * be treated as AL by rule LB10. */
4689                 if (! LB_CM_ATTACHES_TO(prev)) {
4690                     prev = LB_Alphabetic;
4691                 }
4692             }
4693
4694             /* Here, we have the character preceding the span of spaces all set
4695              * up.  We follow LB18: "Break after spaces" unless the table shows
4696              * that is overriden */
4697             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4698
4699         case LB_CM_ZWJ_foo:
4700
4701             /* We don't know how to treat the CM except by looking at the first
4702              * non-CM character preceding it.  ZWJ is treated as CM */
4703             do {
4704                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4705             }
4706             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4707
4708             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4709              * attatches to it, then it inherits the behavior of 'prev'.  If it
4710              * doesn't attach, it is to be treated as an AL */
4711             if (! LB_CM_ATTACHES_TO(prev)) {
4712                 prev = LB_Alphabetic;
4713             }
4714
4715             goto redo;
4716
4717         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4718         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4719
4720             /* LB21a Don't break after Hebrew + Hyphen.
4721              * HL (HY | BA) × */
4722
4723             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4724                                                           == LB_Hebrew_Letter)
4725             {
4726                 return FALSE;
4727             }
4728
4729             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4730
4731         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4732         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4733
4734             /* LB25a (PR | PO) × ( OP | HY )? NU */
4735             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4736                 return FALSE;
4737             }
4738
4739             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4740                                                                 == LB_BREAKABLE;
4741
4742         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4743         case LB_SY_or_IS_then_various + LB_NOBREAK:
4744         {
4745             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4746
4747             LB_enum temp = prev;
4748             do {
4749                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4750             }
4751             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4752             if (temp == LB_Numeric) {
4753                 return FALSE;
4754             }
4755
4756             return LB_table[prev][after] - LB_SY_or_IS_then_various
4757                                                                == LB_BREAKABLE;
4758         }
4759
4760         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4761         case LB_various_then_PO_or_PR + LB_NOBREAK:
4762         {
4763             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4764
4765             LB_enum temp = prev;
4766             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4767             {
4768                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4769             }
4770             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4771                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4772             }
4773             if (temp == LB_Numeric) {
4774                 return FALSE;
4775             }
4776             return LB_various_then_PO_or_PR;
4777         }
4778
4779         case LB_RI_then_RI + LB_NOBREAK:
4780         case LB_RI_then_RI + LB_BREAKABLE:
4781             {
4782                 int RI_count = 1;
4783
4784                 /* LB30a Break between two regional indicator symbols if and
4785                  * only if there are an even number of regional indicators
4786                  * preceding the position of the break.
4787                  *
4788                  *    sot (RI RI)* RI × RI
4789                  *  [^RI] (RI RI)* RI × RI */
4790
4791                 while (backup_one_LB(strbeg,
4792                                      &temp_pos,
4793                                      utf8_target) == LB_Regional_Indicator)
4794                 {
4795                     RI_count++;
4796                 }
4797
4798                 return RI_count % 2 == 0;
4799             }
4800
4801         default:
4802             break;
4803     }
4804
4805 #ifdef DEBUGGING
4806     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4807                                   before, after, LB_table[before][after]);
4808     assert(0);
4809 #endif
4810     return TRUE;
4811 }
4812
4813 STATIC LB_enum
4814 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4815 {
4816     LB_enum lb;
4817
4818     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4819
4820     if (*curpos >= strend) {
4821         return LB_EDGE;
4822     }
4823
4824     if (utf8_target) {
4825         *curpos += UTF8SKIP(*curpos);
4826         if (*curpos >= strend) {
4827             return LB_EDGE;
4828         }
4829         lb = getLB_VAL_UTF8(*curpos, strend);
4830     }
4831     else {
4832         (*curpos)++;
4833         if (*curpos >= strend) {
4834             return LB_EDGE;
4835         }
4836         lb = getLB_VAL_CP(**curpos);
4837     }
4838
4839     return lb;
4840 }
4841
4842 STATIC LB_enum
4843 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4844 {
4845     LB_enum lb;
4846
4847     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4848
4849     if (*curpos < strbeg) {
4850         return LB_EDGE;
4851     }
4852
4853     if (utf8_target) {
4854         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4855         U8 * prev_prev_char_pos;
4856
4857         if (! prev_char_pos) {
4858             return LB_EDGE;
4859         }
4860
4861         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4862             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4863             *curpos = prev_char_pos;
4864             prev_char_pos = prev_prev_char_pos;
4865         }
4866         else {
4867             *curpos = (U8 *) strbeg;
4868             return LB_EDGE;
4869         }
4870     }
4871     else {
4872         if (*curpos - 2 < strbeg) {
4873             *curpos = (U8 *) strbeg;
4874             return LB_EDGE;
4875         }
4876         (*curpos)--;
4877         lb = getLB_VAL_CP(*(*curpos - 1));
4878     }
4879
4880     return lb;
4881 }
4882
4883 STATIC bool
4884 S_isSB(pTHX_ SB_enum before,
4885              SB_enum after,
4886              const U8 * const strbeg,
4887              const U8 * const curpos,
4888              const U8 * const strend,
4889              const bool utf8_target)
4890 {
4891     /* returns a boolean indicating if there is a Sentence Boundary Break
4892      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4893
4894     U8 * lpos = (U8 *) curpos;
4895     bool has_para_sep = FALSE;
4896     bool has_sp = FALSE;
4897
4898     PERL_ARGS_ASSERT_ISSB;
4899
4900     /* Break at the start and end of text.
4901         SB1.  sot  ÷
4902         SB2.  ÷  eot
4903       But unstated in Unicode is don't break if the text is empty */
4904     if (before == SB_EDGE || after == SB_EDGE) {
4905         return before != after;
4906     }
4907
4908     /* SB 3: Do not break within CRLF. */
4909     if (before == SB_CR && after == SB_LF) {
4910         return FALSE;
4911     }
4912
4913     /* Break after paragraph separators.  CR and LF are considered
4914      * so because Unicode views text as like word processing text where there
4915      * are no newlines except between paragraphs, and the word processor takes
4916      * care of wrapping without there being hard line-breaks in the text *./
4917        SB4.  Sep | CR | LF  ÷ */
4918     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4919         return TRUE;
4920     }
4921
4922     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4923      * (See Section 6.2, Replacing Ignore Rules.)
4924         SB5.  X (Extend | Format)*  →  X */
4925     if (after == SB_Extend || after == SB_Format) {
4926
4927         /* Implied is that the these characters attach to everything
4928          * immediately prior to them except for those separator-type
4929          * characters.  And the rules earlier have already handled the case
4930          * when one of those immediately precedes the extend char */
4931         return FALSE;
4932     }
4933
4934     if (before == SB_Extend || before == SB_Format) {
4935         U8 * temp_pos = lpos;
4936         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4937         if (   backup != SB_EDGE
4938             && backup != SB_Sep
4939             && backup != SB_CR
4940             && backup != SB_LF)
4941         {
4942             before = backup;
4943             lpos = temp_pos;
4944         }
4945
4946         /* Here, both 'before' and 'backup' are these types; implied is that we
4947          * don't break between them */
4948         if (backup == SB_Extend || backup == SB_Format) {
4949             return FALSE;
4950         }
4951     }
4952
4953     /* Do not break after ambiguous terminators like period, if they are
4954      * immediately followed by a number or lowercase letter, if they are
4955      * between uppercase letters, if the first following letter (optionally
4956      * after certain punctuation) is lowercase, or if they are followed by
4957      * "continuation" punctuation such as comma, colon, or semicolon. For
4958      * example, a period may be an abbreviation or numeric period, and thus may
4959      * not mark the end of a sentence.
4960
4961      * SB6. ATerm  ×  Numeric */
4962     if (before == SB_ATerm && after == SB_Numeric) {
4963         return FALSE;
4964     }
4965
4966     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4967     if (before == SB_ATerm && after == SB_Upper) {
4968         U8 * temp_pos = lpos;
4969         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4970         if (backup == SB_Upper || backup == SB_Lower) {
4971             return FALSE;
4972         }
4973     }
4974
4975     /* The remaining rules that aren't the final one, all require an STerm or
4976      * an ATerm after having backed up over some Close* Sp*, and in one case an
4977      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4978      * So do that backup now, setting flags if either Sp or a paragraph
4979      * separator are found */
4980
4981     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4982         has_para_sep = TRUE;
4983         before = backup_one_SB(strbeg, &lpos, utf8_target);
4984     }
4985
4986     if (before == SB_Sp) {
4987         has_sp = TRUE;
4988         do {
4989             before = backup_one_SB(strbeg, &lpos, utf8_target);
4990         }
4991         while (before == SB_Sp);
4992     }
4993
4994     while (before == SB_Close) {
4995         before = backup_one_SB(strbeg, &lpos, utf8_target);
4996     }
4997
4998     /* The next few rules apply only when the backed-up-to is an ATerm, and in
4999      * most cases an STerm */
5000     if (before == SB_STerm || before == SB_ATerm) {
5001
5002         /* So, here the lhs matches
5003          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5004          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5005          * The rules that apply here are:
5006          *
5007          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5008                                            | LF | STerm | ATerm) )* Lower
5009            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5010            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5011            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5012            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5013          */
5014
5015         /* And all but SB11 forbid having seen a paragraph separator */
5016         if (! has_para_sep) {
5017             if (before == SB_ATerm) {          /* SB8 */
5018                 U8 * rpos = (U8 *) curpos;
5019                 SB_enum later = after;
5020
5021                 while (    later != SB_OLetter
5022                         && later != SB_Upper
5023                         && later != SB_Lower
5024                         && later != SB_Sep
5025                         && later != SB_CR
5026                         && later != SB_LF
5027                         && later != SB_STerm
5028                         && later != SB_ATerm
5029                         && later != SB_EDGE)
5030                 {
5031                     later = advance_one_SB(&rpos, strend, utf8_target);
5032                 }
5033                 if (later == SB_Lower) {
5034                     return FALSE;
5035                 }
5036             }
5037
5038             if (   after == SB_SContinue    /* SB8a */
5039                 || after == SB_STerm
5040                 || after == SB_ATerm)
5041             {
5042                 return FALSE;
5043             }
5044
5045             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5046                 if (   after == SB_Close
5047                     || after == SB_Sp
5048                     || after == SB_Sep
5049                     || after == SB_CR
5050                     || after == SB_LF)
5051                 {
5052                     return FALSE;
5053                 }
5054             }
5055
5056             /* SB10.  This and SB9 could probably be combined some way, but khw
5057              * has decided to follow the Unicode rule book precisely for
5058              * simplified maintenance */
5059             if (   after == SB_Sp
5060                 || after == SB_Sep
5061                 || after == SB_CR
5062                 || after == SB_LF)
5063             {
5064                 return FALSE;
5065             }
5066         }
5067
5068         /* SB11.  */
5069         return TRUE;
5070     }
5071
5072     /* Otherwise, do not break.
5073     SB12.  Any  ×  Any */
5074
5075     return FALSE;
5076 }
5077
5078 STATIC SB_enum
5079 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5080 {
5081     SB_enum sb;
5082
5083     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5084
5085     if (*curpos >= strend) {
5086         return SB_EDGE;
5087     }
5088
5089     if (utf8_target) {
5090         do {
5091             *curpos += UTF8SKIP(*curpos);
5092             if (*curpos >= strend) {
5093                 return SB_EDGE;
5094             }
5095             sb = getSB_VAL_UTF8(*curpos, strend);
5096         } while (sb == SB_Extend || sb == SB_Format);
5097     }
5098     else {
5099         do {
5100             (*curpos)++;
5101             if (*curpos >= strend) {
5102                 return SB_EDGE;
5103             }
5104             sb = getSB_VAL_CP(**curpos);
5105         } while (sb == SB_Extend || sb == SB_Format);
5106     }
5107
5108     return sb;
5109 }
5110
5111 STATIC SB_enum
5112 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5113 {
5114     SB_enum sb;
5115
5116     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5117
5118     if (*curpos < strbeg) {
5119         return SB_EDGE;
5120     }
5121
5122     if (utf8_target) {
5123         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5124         if (! prev_char_pos) {
5125             return SB_EDGE;
5126         }
5127
5128         /* Back up over Extend and Format.  curpos is always just to the right
5129          * of the characater whose value we are getting */
5130         do {
5131             U8 * prev_prev_char_pos;
5132             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5133                                                                       strbeg)))
5134             {
5135                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5136                 *curpos = prev_char_pos;
5137                 prev_char_pos = prev_prev_char_pos;
5138             }
5139             else {
5140                 *curpos = (U8 *) strbeg;
5141                 return SB_EDGE;
5142             }
5143         } while (sb == SB_Extend || sb == SB_Format);
5144     }
5145     else {
5146         do {
5147             if (*curpos - 2 < strbeg) {
5148                 *curpos = (U8 *) strbeg;
5149                 return SB_EDGE;
5150             }
5151             (*curpos)--;
5152             sb = getSB_VAL_CP(*(*curpos - 1));
5153         } while (sb == SB_Extend || sb == SB_Format);
5154     }
5155
5156     return sb;
5157 }
5158
5159 STATIC bool
5160 S_isWB(pTHX_ WB_enum previous,
5161              WB_enum before,
5162              WB_enum after,
5163              const U8 * const strbeg,
5164              const U8 * const curpos,
5165              const U8 * const strend,
5166              const bool utf8_target)
5167 {
5168     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5169      *  a Unicode word break, using their published algorithm, but tailored for
5170      *  Perl by treating spans of white space as one unit.  Context may be
5171      *  needed to make this determination.  If the value for the character
5172      *  before 'before' is known, it is passed as 'previous'; otherwise that
5173      *  should be set to WB_UNKNOWN.  The other input parameters give the
5174      *  boundaries and current position in the matching of the string.  That
5175      *  is, 'curpos' marks the position where the character whose wb value is
5176      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5177
5178     U8 * before_pos = (U8 *) curpos;
5179     U8 * after_pos = (U8 *) curpos;
5180     WB_enum prev = before;
5181     WB_enum next;
5182
5183     PERL_ARGS_ASSERT_ISWB;
5184
5185     /* Rule numbers in the comments below are as of Unicode 9.0 */
5186
5187   redo:
5188     before = prev;
5189     switch (WB_table[before][after]) {
5190         case WB_BREAKABLE:
5191             return TRUE;
5192
5193         case WB_NOBREAK:
5194             return FALSE;
5195
5196         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5197             next = advance_one_WB(&after_pos, strend, utf8_target,
5198                                  FALSE /* Don't skip Extend nor Format */ );
5199             /* A space immediately preceeding an Extend or Format is attached
5200              * to by them, and hence gets separated from previous spaces.
5201              * Otherwise don't break between horizontal white space */
5202             return next == WB_Extend || next == WB_Format;
5203
5204         /* WB4 Ignore Format and Extend characters, except when they appear at
5205          * the beginning of a region of text.  This code currently isn't
5206          * general purpose, but it works as the rules are currently and likely
5207          * to be laid out.  The reason it works is that when 'they appear at
5208          * the beginning of a region of text', the rule is to break before
5209          * them, just like any other character.  Therefore, the default rule
5210          * applies and we don't have to look in more depth.  Should this ever
5211          * change, we would have to have 2 'case' statements, like in the rules
5212          * below, and backup a single character (not spacing over the extend
5213          * ones) and then see if that is one of the region-end characters and
5214          * go from there */
5215         case WB_Ex_or_FO_or_ZWJ_then_foo:
5216             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5217             goto redo;
5218
5219         case WB_DQ_then_HL + WB_BREAKABLE:
5220         case WB_DQ_then_HL + WB_NOBREAK:
5221
5222             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5223
5224             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5225                                                             == WB_Hebrew_Letter)
5226             {
5227                 return FALSE;
5228             }
5229
5230              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5231
5232         case WB_HL_then_DQ + WB_BREAKABLE:
5233         case WB_HL_then_DQ + WB_NOBREAK:
5234
5235             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5236
5237             if (advance_one_WB(&after_pos, strend, utf8_target,
5238                                        TRUE /* Do skip Extend and Format */ )
5239                                                             == WB_Hebrew_Letter)
5240             {
5241                 return FALSE;
5242             }
5243
5244             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5245
5246         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5247         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5248
5249             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5250              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5251
5252             next = advance_one_WB(&after_pos, strend, utf8_target,
5253                                        TRUE /* Do skip Extend and Format */ );
5254
5255             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5256             {
5257                 return FALSE;
5258             }
5259
5260             return WB_table[before][after]
5261                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5262
5263         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5264         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5265
5266             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5267              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5268
5269             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5270             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5271             {
5272                 return FALSE;
5273             }
5274
5275             return WB_table[before][after]
5276                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5277
5278         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5279         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5280
5281             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5282              * */
5283
5284             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5285                                                             == WB_Numeric)
5286             {
5287                 return FALSE;
5288             }
5289
5290             return WB_table[before][after]
5291                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5292
5293         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5294         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5295
5296             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5297
5298             if (advance_one_WB(&after_pos, strend, utf8_target,
5299                                        TRUE /* Do skip Extend and Format */ )
5300                                                             == WB_Numeric)
5301             {
5302                 return FALSE;
5303             }
5304
5305             return WB_table[before][after]
5306                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5307
5308         case WB_RI_then_RI + WB_NOBREAK:
5309         case WB_RI_then_RI + WB_BREAKABLE:
5310             {
5311                 int RI_count = 1;
5312
5313                 /* Do not break within emoji flag sequences. That is, do not
5314                  * break between regional indicator (RI) symbols if there is an
5315                  * odd number of RI characters before the potential break
5316                  * point.
5317                  *
5318                  * WB15   sot (RI RI)* RI × RI
5319                  * WB16 [^RI] (RI RI)* RI × RI */
5320
5321                 while (backup_one_WB(&previous,
5322                                      strbeg,
5323                                      &before_pos,
5324                                      utf8_target) == WB_Regional_Indicator)
5325                 {
5326                     RI_count++;
5327                 }
5328
5329                 return RI_count % 2 != 1;
5330             }
5331
5332         default:
5333             break;
5334     }
5335
5336 #ifdef DEBUGGING
5337     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5338                                   before, after, WB_table[before][after]);
5339     assert(0);
5340 #endif
5341     return TRUE;
5342 }
5343
5344 STATIC WB_enum
5345 S_advance_one_WB(pTHX_ U8 ** curpos,
5346                        const U8 * const strend,
5347                        const bool utf8_target,
5348                        const bool skip_Extend_Format)
5349 {
5350     WB_enum wb;
5351
5352     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5353
5354     if (*curpos >= strend) {
5355         return WB_EDGE;
5356     }
5357
5358     if (utf8_target) {
5359
5360         /* Advance over Extend and Format */
5361         do {
5362             *curpos += UTF8SKIP(*curpos);
5363             if (*curpos >= strend) {
5364                 return WB_EDGE;
5365             }
5366             wb = getWB_VAL_UTF8(*curpos, strend);
5367         } while (    skip_Extend_Format
5368                  && (wb == WB_Extend || wb == WB_Format));
5369     }
5370     else {
5371         do {
5372             (*curpos)++;
5373             if (*curpos >= strend) {
5374                 return WB_EDGE;
5375             }
5376             wb = getWB_VAL_CP(**curpos);
5377         } while (    skip_Extend_Format
5378                  && (wb == WB_Extend || wb == WB_Format));
5379     }
5380
5381     return wb;
5382 }
5383
5384 STATIC WB_enum
5385 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5386 {
5387     WB_enum wb;
5388
5389     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5390
5391     /* If we know what the previous character's break value is, don't have
5392         * to look it up */
5393     if (*previous != WB_UNKNOWN) {
5394         wb = *previous;
5395
5396         /* But we need to move backwards by one */
5397         if (utf8_target) {
5398             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5399             if (! *curpos) {
5400                 *previous = WB_EDGE;
5401                 *curpos = (U8 *) strbeg;
5402             }
5403             else {
5404                 *previous = WB_UNKNOWN;
5405             }
5406         }
5407         else {
5408             (*curpos)--;
5409             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5410         }
5411
5412         /* And we always back up over these three types */
5413         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5414             return wb;
5415         }
5416     }
5417
5418     if (*curpos < strbeg) {
5419         return WB_EDGE;
5420     }
5421
5422     if (utf8_target) {
5423         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5424         if (! prev_char_pos) {
5425             return WB_EDGE;
5426         }
5427
5428         /* Back up over Extend and Format.  curpos is always just to the right
5429          * of the characater whose value we are getting */
5430         do {
5431             U8 * prev_prev_char_pos;
5432             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5433                                                    -1,
5434                                                    strbeg)))
5435             {
5436                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5437                 *curpos = prev_char_pos;
5438                 prev_char_pos = prev_prev_char_pos;
5439             }
5440             else {
5441                 *curpos = (U8 *) strbeg;
5442                 return WB_EDGE;
5443             }
5444         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5445     }
5446     else {
5447         do {
5448             if (*curpos - 2 < strbeg) {
5449                 *curpos = (U8 *) strbeg;
5450                 return WB_EDGE;
5451             }
5452             (*curpos)--;
5453             wb = getWB_VAL_CP(*(*curpos - 1));
5454         } while (wb == WB_Extend || wb == WB_Format);
5455     }
5456
5457     return wb;
5458 }
5459
5460 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5461 (                                                           \
5462     (   ( st )                                         ) && \
5463     (   ( st )->u.eval.close_paren                     ) && \
5464     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5465 )
5466
5467 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5468 (                                                           \
5469     (   ( st )                                         ) && \
5470     (   ( st )->u.eval.close_paren                     ) && \
5471     (   ( expr )                                       ) && \
5472     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5473 )
5474
5475
5476 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5477     (st)->u.eval.close_paren = ( (expr) + 1 )
5478
5479 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5480     (st)->u.eval.close_paren = 0
5481
5482 /* returns -1 on failure, $+[0] on success */
5483 STATIC SSize_t
5484 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5485 {
5486     dVAR;
5487     const bool utf8_target = reginfo->is_utf8_target;
5488     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5489     REGEXP *rex_sv = reginfo->prog;
5490     regexp *rex = ReANY(rex_sv);
5491     RXi_GET_DECL(rex,rexi);
5492     /* the current state. This is a cached copy of PL_regmatch_state */
5493     regmatch_state *st;
5494     /* cache heavy used fields of st in registers */
5495     regnode *scan;
5496     regnode *next;
5497     U32 n = 0;  /* general value; init to avoid compiler warning */
5498     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5499     SSize_t endref = 0; /* offset of end of backref when ln is start */
5500     char *locinput = startpos;
5501     char *pushinput; /* where to continue after a PUSH */
5502     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5503
5504     bool result = 0;        /* return value of S_regmatch */
5505     U32 depth = 0;            /* depth of backtrack stack */
5506     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5507     const U32 max_nochange_depth =
5508         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5509         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5510     regmatch_state *yes_state = NULL; /* state to pop to on success of
5511                                                             subpattern */
5512     /* mark_state piggy backs on the yes_state logic so that when we unwind
5513        the stack on success we can update the mark_state as we go */
5514     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5515     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5516     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5517     U32 state_num;
5518     bool no_final = 0;      /* prevent failure from backtracking? */
5519     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5520     char *startpoint = locinput;
5521     SV *popmark = NULL;     /* are we looking for a mark? */
5522     SV *sv_commit = NULL;   /* last mark name seen in failure */
5523     SV *sv_yes_mark = NULL; /* last mark name we have seen
5524                                during a successful match */
5525     U32 lastopen = 0;       /* last open we saw */
5526     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5527     SV* const oreplsv = GvSVn(PL_replgv);
5528     /* these three flags are set by various ops to signal information to
5529      * the very next op. They have a useful lifetime of exactly one loop
5530      * iteration, and are not preserved or restored by state pushes/pops
5531      */
5532     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5533     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5534     int logical = 0;        /* the following EVAL is:
5535                                 0: (?{...})
5536                                 1: (?(?{...})X|Y)
5537                                 2: (??{...})
5538                                or the following IFMATCH/UNLESSM is:
5539                                 false: plain (?=foo)
5540                                 true:  used as a condition: (?(?=foo))
5541                             */
5542     PAD* last_pad = NULL;
5543     dMULTICALL;
5544     U8 gimme = G_SCALAR;
5545     CV *caller_cv = NULL;       /* who called us */
5546     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5547     U32 maxopenparen = 0;       /* max '(' index seen so far */
5548     int to_complement;  /* Invert the result? */
5549     _char_class_number classnum;
5550     bool is_utf8_pat = reginfo->is_utf8_pat;
5551     bool match = FALSE;
5552     I32 orig_savestack_ix = PL_savestack_ix;
5553     char * script_run_begin = NULL;
5554
5555 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5556 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5557 #  define SOLARIS_BAD_OPTIMIZER
5558     const U32 *pl_charclass_dup = PL_charclass;
5559 #  define PL_charclass pl_charclass_dup
5560 #endif
5561
5562 #ifdef DEBUGGING
5563     GET_RE_DEBUG_FLAGS_DECL;
5564 #endif
5565
5566     /* protect against undef(*^R) */
5567     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5568
5569     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5570     multicall_oldcatch = 0;
5571     PERL_UNUSED_VAR(multicall_cop);
5572
5573     PERL_ARGS_ASSERT_REGMATCH;
5574
5575     st = PL_regmatch_state;
5576
5577     /* Note that nextchr is a byte even in UTF */
5578     SET_nextchr;
5579     scan = prog;
5580
5581     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5582             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5583             Perl_re_printf( aTHX_ "regmatch start\n" );
5584     }));
5585
5586     while (scan != NULL) {
5587         next = scan + NEXT_OFF(scan);
5588         if (next == scan)
5589             next = NULL;
5590         state_num = OP(scan);
5591
5592       reenter_switch:
5593         DEBUG_EXECUTE_r(
5594             if (state_num <= REGNODE_MAX) {
5595                 SV * const prop = sv_newmortal();
5596                 regnode *rnext = regnext(scan);
5597
5598                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5599                 regprop(rex, prop, scan, reginfo, NULL);
5600                 Perl_re_printf( aTHX_
5601                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5602                     INDENT_CHARS(depth), "",
5603                     (IV)(scan - rexi->program),
5604                     SvPVX_const(prop),
5605                     (PL_regkind[OP(scan)] == END || !rnext) ?
5606                         0 : (IV)(rnext - rexi->program));
5607             }
5608         );
5609
5610         to_complement = 0;
5611
5612         SET_nextchr;
5613         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5614
5615         switch (state_num) {
5616         case SBOL: /*  /^../ and /\A../  */
5617             if (locinput == reginfo->strbeg)
5618                 break;
5619             sayNO;
5620
5621         case MBOL: /*  /^../m  */
5622             if (locinput == reginfo->strbeg ||
5623                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5624             {
5625                 break;
5626             }
5627             sayNO;
5628
5629         case GPOS: /*  \G  */
5630             if (locinput == reginfo->ganch)
5631                 break;
5632             sayNO;
5633
5634         case KEEPS: /*   \K  */
5635             /* update the startpoint */
5636             st->u.keeper.val = rex->offs[0].start;
5637             rex->offs[0].start = locinput - reginfo->strbeg;
5638             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5639             NOT_REACHED; /* NOTREACHED */
5640
5641         case KEEPS_next_fail:
5642             /* rollback the start point change */
5643             rex->offs[0].start = st->u.keeper.val;
5644             sayNO_SILENT;
5645             NOT_REACHED; /* NOTREACHED */
5646
5647         case MEOL: /* /..$/m  */
5648             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5649                 sayNO;
5650             break;
5651
5652         case SEOL: /* /..$/  */
5653             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5654                 sayNO;
5655             if (reginfo->strend - locinput > 1)
5656                 sayNO;
5657             break;
5658
5659         case EOS: /*  \z  */
5660             if (!NEXTCHR_IS_EOS)
5661                 sayNO;
5662             break;
5663
5664         case SANY: /*  /./s  */
5665             if (NEXTCHR_IS_EOS)
5666                 sayNO;
5667             goto increment_locinput;
5668
5669         case REG_ANY: /*  /./  */
5670             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5671                 sayNO;
5672             goto increment_locinput;
5673
5674
5675 #undef  ST
5676 #define ST st->u.trie
5677         case TRIEC: /* (ab|cd) with known charclass */
5678             /* In this case the charclass data is available inline so
5679                we can fail fast without a lot of extra overhead.
5680              */
5681             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5682                 DEBUG_EXECUTE_r(
5683                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5684                               depth, PL_colors[4], PL_colors[5])
5685                 );
5686                 sayNO_SILENT;
5687                 NOT_REACHED; /* NOTREACHED */
5688             }
5689             /* FALLTHROUGH */
5690         case TRIE:  /* (ab|cd)  */
5691             /* the basic plan of execution of the trie is:
5692              * At the beginning, run though all the states, and
5693              * find the longest-matching word. Also remember the position
5694              * of the shortest matching word. For example, this pattern:
5695              *    1  2 3 4    5
5696              *    ab|a|x|abcd|abc
5697              * when matched against the string "abcde", will generate
5698              * accept states for all words except 3, with the longest
5699              * matching word being 4, and the shortest being 2 (with
5700              * the position being after char 1 of the string).
5701              *
5702              * Then for each matching word, in word order (i.e. 1,2,4,5),
5703              * we run the remainder of the pattern; on each try setting
5704              * the current position to the character following the word,
5705              * returning to try the next word on failure.
5706              *
5707              * We avoid having to build a list of words at runtime by
5708              * using a compile-time structure, wordinfo[].prev, which
5709              * gives, for each word, the previous accepting word (if any).
5710              * In the case above it would contain the mappings 1->2, 2->0,
5711              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5712              * the longest word (4 above), a list of all words, by
5713              * following the list of prev pointers; this gives us the
5714              * unordered list 4,5,1,2. Then given the current word we have
5715              * just tried, we can go through the list and find the
5716              * next-biggest word to try (so if we just failed on word 2,
5717              * the next in the list is 4).
5718              *
5719              * Since at runtime we don't record the matching position in
5720              * the string for each word, we have to work that out for
5721              * each word we're about to process. The wordinfo table holds
5722              * the character length of each word; given that we recorded
5723              * at the start: the position of the shortest word and its
5724              * length in chars, we just need to move the pointer the
5725              * difference between the two char lengths. Depending on
5726              * Unicode status and folding, that's cheap or expensive.
5727              *
5728              * This algorithm is optimised for the case where are only a
5729              * small number of accept states, i.e. 0,1, or maybe 2.
5730              * With lots of accepts states, and having to try all of them,
5731              * it becomes quadratic on number of accept states to find all
5732              * the next words.
5733              */
5734
5735             {
5736                 /* what type of TRIE am I? (utf8 makes this contextual) */
5737                 DECL_TRIE_TYPE(scan);
5738
5739                 /* what trie are we using right now */
5740                 reg_trie_data * const trie
5741                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5742                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5743                 U32 state = trie->startstate;
5744
5745                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5746                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5747                     if (utf8_target
5748                         && nextchr >= 0 /* guard against negative EOS value in nextchr */
5749                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5750                         && scan->flags == EXACTL)
5751                     {
5752                         /* We only output for EXACTL, as we let the folder
5753                          * output this message for EXACTFLU8 to avoid
5754                          * duplication */
5755                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5756                                                                reginfo->strend);
5757                     }
5758                 }
5759                 if (   trie->bitmap
5760                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5761                 {
5762                     if (trie->states[ state ].wordnum) {
5763                          DEBUG_EXECUTE_r(
5764                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
5765                                           depth, PL_colors[4], PL_colors[5])
5766                         );
5767                         if (!trie->jump)
5768                             break;
5769                     } else {
5770                         DEBUG_EXECUTE_r(
5771                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5772                                           depth, PL_colors[4], PL_colors[5])
5773                         );
5774                         sayNO_SILENT;
5775                    }
5776                 }
5777
5778             {
5779                 U8 *uc = ( U8* )locinput;
5780
5781                 STRLEN len = 0;
5782                 STRLEN foldlen = 0;
5783                 U8 *uscan = (U8*)NULL;
5784                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5785                 U32 charcount = 0; /* how many input chars we have matched */
5786                 U32 accepted = 0; /* have we seen any accepting states? */
5787
5788                 ST.jump = trie->jump;
5789                 ST.me = scan;
5790                 ST.firstpos = NULL;
5791                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5792                 ST.nextword = 0;
5793
5794                 /* fully traverse the TRIE; note the position of the
5795                    shortest accept state and the wordnum of the longest
5796                    accept state */
5797
5798                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5799                     U32 base = trie->states[ state ].trans.base;
5800                     UV uvc = 0;
5801                     U16 charid = 0;
5802                     U16 wordnum;
5803                     wordnum = trie->states[ state ].wordnum;
5804
5805                     if (wordnum) { /* it's an accept state */
5806                         if (!accepted) {
5807                             accepted = 1;
5808                             /* record first match position */
5809                             if (ST.longfold) {
5810                                 ST.firstpos = (U8*)locinput;
5811                                 ST.firstchars = 0;
5812                             }
5813                             else {
5814                                 ST.firstpos = uc;
5815                                 ST.firstchars = charcount;
5816                             }
5817                         }
5818                         if (!ST.nextword || wordnum < ST.nextword)
5819                             ST.nextword = wordnum;
5820                         ST.topword = wordnum;
5821                     }
5822
5823                     DEBUG_TRIE_EXECUTE_r({
5824                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5825                                 /* HERE */
5826                                 PerlIO_printf( Perl_debug_log,
5827                                     "%*s%sState: %4" UVxf " Accepted: %c ",
5828                                     INDENT_CHARS(depth), "", PL_colors[4],
5829                                     (UV)state, (accepted ? 'Y' : 'N'));
5830                     });
5831
5832                     /* read a char and goto next state */
5833                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5834                         I32 offset;
5835                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5836                                              uscan, len, uvc, charid, foldlen,
5837                                              foldbuf, uniflags);
5838                         charcount++;
5839                         if (foldlen>0)
5840                             ST.longfold = TRUE;
5841                         if (charid &&
5842                              ( ((offset =
5843                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5844
5845                              && ((U32)offset < trie->lasttrans)
5846                              && trie->trans[offset].check == state)
5847                         {
5848                             state = trie->trans[offset].next;
5849                         }
5850                         else {
5851                             state = 0;
5852                         }
5853                         uc += len;
5854
5855                     }
5856                     else {
5857                         state = 0;
5858                     }
5859                     DEBUG_TRIE_EXECUTE_r(
5860                         Perl_re_printf( aTHX_
5861                             "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
5862                             charid, uvc, (UV)state, PL_colors[5] );
5863                     );
5864                 }
5865                 if (!accepted)
5866                    sayNO;
5867
5868                 /* calculate total number of accept states */
5869                 {
5870                     U16 w = ST.topword;
5871                     accepted = 0;
5872                     while (w) {
5873                         w = trie->wordinfo[w].prev;
5874                         accepted++;
5875                     }
5876                     ST.accepted = accepted;
5877                 }
5878
5879                 DEBUG_EXECUTE_r(
5880                     Perl_re_exec_indentf( aTHX_  "%sgot %" IVdf " possible matches%s\n",
5881                         depth,
5882                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5883                 );
5884                 goto trie_first_try; /* jump into the fail handler */
5885             }}
5886             NOT_REACHED; /* NOTREACHED */
5887
5888         case TRIE_next_fail: /* we failed - try next alternative */
5889         {
5890             U8 *uc;
5891             if ( ST.jump ) {
5892                 /* undo any captures done in the tail part of a branch,
5893                  * e.g.
5894                  *    /(?:X(.)(.)|Y(.)).../
5895                  * where the trie just matches X then calls out to do the
5896                  * rest of the branch */
5897                 REGCP_UNWIND(ST.cp);
5898                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5899             }
5900             if (!--ST.accepted) {
5901                 DEBUG_EXECUTE_r({
5902                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
5903                         depth,
5904                         PL_colors[4],
5905                         PL_colors[5] );
5906                 });
5907                 sayNO_SILENT;
5908             }
5909             {
5910                 /* Find next-highest word to process.  Note that this code
5911                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5912                 U16 min = 0;
5913                 U16 word;
5914                 U16 const nextword = ST.nextword;
5915                 reg_trie_wordinfo * const wordinfo
5916                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5917                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5918                     if (word > nextword && (!min || word < min))
5919                         min = word;
5920                 }
5921                 ST.nextword = min;
5922             }
5923
5924           trie_first_try:
5925             if (do_cutgroup) {
5926                 do_cutgroup = 0;
5927                 no_final = 0;
5928             }
5929
5930             if ( ST.jump ) {
5931                 ST.lastparen = rex->lastparen;
5932                 ST.lastcloseparen = rex->lastcloseparen;
5933                 REGCP_SET(ST.cp);
5934             }
5935
5936             /* find start char of end of current word */
5937             {
5938                 U32 chars; /* how many chars to skip */
5939                 reg_trie_data * const trie
5940                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5941
5942                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5943                             >=  ST.firstchars);
5944                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5945                             - ST.firstchars;
5946                 uc = ST.firstpos;
5947
5948                 if (ST.longfold) {
5949                     /* the hard option - fold each char in turn and find
5950                      * its folded length (which may be different */
5951                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5952                     STRLEN foldlen;
5953                     STRLEN len;
5954                     UV uvc;
5955                     U8 *uscan;
5956
5957                     while (chars) {
5958                         if (utf8_target) {
5959                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5960                                                     uniflags);
5961                             uc += len;
5962                         }
5963                         else {
5964                             uvc = *uc;
5965                             uc++;
5966                         }
5967                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5968                         uscan = foldbuf;
5969                         while (foldlen) {
5970                             if (!--chars)
5971                                 break;
5972                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5973                                             uniflags);
5974                             uscan += len;
5975                             foldlen -= len;
5976                         }
5977                     }
5978                 }
5979                 else {
5980                     if (utf8_target)
5981                         while (chars--)
5982                             uc += UTF8SKIP(uc);
5983                     else
5984                         uc += chars;
5985                 }
5986             }
5987
5988             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5989                             ? ST.jump[ST.nextword]
5990                             : NEXT_OFF(ST.me));
5991
5992             DEBUG_EXECUTE_r({
5993                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
5994                     depth,
5995                     PL_colors[4],
5996                     ST.nextword,
5997                     PL_colors[5]
5998                     );
5999             });
6000
6001             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6002                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6003                 NOT_REACHED; /* NOTREACHED */
6004             }
6005             /* only one choice left - just continue */
6006             DEBUG_EXECUTE_r({
6007                 AV *const trie_words
6008                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6009                 SV ** const tmp = trie_words
6010                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6011                 SV *sv= tmp ? sv_newmortal() : NULL;
6012
6013                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
6014                     depth, PL_colors[4],
6015                     ST.nextword,
6016                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6017                             PL_colors[0], PL_colors[1],
6018                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6019                         )
6020                     : "not compiled under -Dr",
6021                     PL_colors[5] );
6022             });
6023
6024             locinput = (char*)uc;
6025             continue; /* execute rest of RE */
6026             /* NOTREACHED */
6027         }
6028 #undef  ST
6029
6030         case EXACTL:             /*  /abc/l       */
6031             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6032
6033             /* Complete checking would involve going through every character
6034              * matched by the string to see if any is above latin1.  But the
6035              * comparision otherwise might very well be a fast assembly
6036              * language routine, and I (khw) don't think slowing things down
6037              * just to check for this warning is worth it.  So this just checks
6038              * the first character */
6039             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6040                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6041             }
6042             /* FALLTHROUGH */
6043         case EXACT: {            /*  /abc/        */
6044             char *s = STRING(scan);
6045             ln = STR_LEN(scan);
6046             if (utf8_target != is_utf8_pat) {
6047                 /* The target and the pattern have differing utf8ness. */
6048                 char *l = locinput;
6049                 const char * const e = s + ln;
6050
6051                 if (utf8_target) {
6052                     /* The target is utf8, the pattern is not utf8.
6053                      * Above-Latin1 code points can't match the pattern;
6054                      * invariants match exactly, and the other Latin1 ones need
6055                      * to be downgraded to a single byte in order to do the
6056                      * comparison.  (If we could be confident that the target
6057                      * is not malformed, this could be refactored to have fewer
6058                      * tests by just assuming that if the first bytes match, it
6059                      * is an invariant, but there are tests in the test suite
6060                      * dealing with (??{...}) which violate this) */
6061                     while (s < e) {
6062                         if (l >= reginfo->strend
6063                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6064                         {
6065                             sayNO;
6066                         }
6067                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6068                             if (*l != *s) {
6069                                 sayNO;
6070                             }
6071                             l++;
6072                         }
6073                         else {
6074                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6075                             {
6076                                 sayNO;
6077                             }
6078                             l += 2;
6079                         }
6080                         s++;
6081                     }
6082                 }
6083                 else {
6084                     /* The target is not utf8, the pattern is utf8. */
6085                     while (s < e) {
6086                         if (l >= reginfo->strend
6087                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6088                         {
6089                             sayNO;
6090                         }
6091                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6092                             if (*s != *l) {
6093                                 sayNO;
6094                             }
6095                             s++;
6096                         }
6097                         else {
6098                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6099                             {
6100                                 sayNO;
6101                             }
6102                             s += 2;
6103                         }
6104                         l++;
6105                     }
6106                 }
6107                 locinput = l;
6108             }
6109             else {
6110                 /* The target and the pattern have the same utf8ness. */
6111                 /* Inline the first character, for speed. */
6112                 if (reginfo->strend - locinput < ln
6113                     || UCHARAT(s) != nextchr
6114                     || (ln > 1 && memNE(s, locinput, ln)))
6115                 {
6116                     sayNO;
6117                 }
6118                 locinput += ln;
6119             }
6120             break;
6121             }
6122
6123         case EXACTFL: {          /*  /abc/il      */
6124             re_fold_t folder;
6125             const U8 * fold_array;
6126             const char * s;
6127             U32 fold_utf8_flags;
6128
6129             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6130             folder = foldEQ_locale;
6131             fold_array = PL_fold_locale;
6132             fold_utf8_flags = FOLDEQ_LOCALE;
6133             goto do_exactf;
6134
6135         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6136                                       is effectively /u; hence to match, target
6137                                       must be UTF-8. */
6138             if (! utf8_target) {
6139                 sayNO;
6140             }
6141             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6142                                              | FOLDEQ_S1_FOLDS_SANE;
6143             folder = foldEQ_latin1;
6144             fold_array = PL_fold_latin1;
6145             goto do_exactf;
6146
6147         case EXACTFU_SS:         /*  /\x{df}/iu   */
6148         case EXACTFU:            /*  /abc/iu      */
6149             folder = foldEQ_latin1;
6150             fold_array = PL_fold_latin1;
6151             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6152             goto do_exactf;
6153
6154         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
6155                                    patterns */
6156             assert(! is_utf8_pat);
6157             /* FALLTHROUGH */
6158         case EXACTFA:            /*  /abc/iaa     */
6159             folder = foldEQ_latin1;
6160             fold_array = PL_fold_latin1;
6161             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6162             goto do_exactf;
6163
6164         case EXACTF:             /*  /abc/i    This node only generated for
6165                                                non-utf8 patterns */
6166             assert(! is_utf8_pat);
6167             folder = foldEQ;
6168             fold_array = PL_fold;
6169             fold_utf8_flags = 0;
6170
6171           do_exactf:
6172             s = STRING(scan);
6173             ln = STR_LEN(scan);
6174
6175             if (utf8_target
6176                 || is_utf8_pat
6177                 || state_num == EXACTFU_SS
6178                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6179             {
6180               /* Either target or the pattern are utf8, or has the issue where
6181                * the fold lengths may differ. */
6182                 const char * const l = locinput;
6183                 char *e = reginfo->strend;
6184
6185                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6186                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6187                 {
6188                     sayNO;
6189                 }
6190                 locinput = e;
6191                 break;
6192             }
6193
6194             /* Neither the target nor the pattern are utf8 */
6195             if (UCHARAT(s) != nextchr
6196                 && !NEXTCHR_IS_EOS
6197                 && UCHARAT(s) != fold_array[nextchr])
6198             {
6199                 sayNO;
6200             }
6201             if (reginfo->strend - locinput < ln)
6202                 sayNO;
6203             if (ln > 1 && ! folder(s, locinput, ln))
6204                 sayNO;
6205             locinput += ln;
6206             break;
6207         }
6208
6209         case NBOUNDL: /*  /\B/l  */
6210             to_complement = 1;
6211             /* FALLTHROUGH */
6212
6213         case BOUNDL:  /*  /\b/l  */
6214         {
6215             bool b1, b2;
6216             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6217
6218             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6219                 if (! IN_UTF8_CTYPE_LOCALE) {
6220                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6221                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6222                 }
6223                 goto boundu;
6224             }
6225
6226             if (utf8_target) {
6227                 if (locinput == reginfo->strbeg)
6228                     b1 = isWORDCHAR_LC('\n');
6229                 else {
6230                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6231                                                         (U8*)(reginfo->strbeg)),
6232                                                  (U8*)(reginfo->strend));
6233                 }
6234                 b2 = (NEXTCHR_IS_EOS)
6235                     ? isWORDCHAR_LC('\n')
6236                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6237                                               (U8*) reginfo->strend);
6238             }
6239             else { /* Here the string isn't utf8 */
6240                 b1 = (locinput == reginfo->strbeg)
6241                      ? isWORDCHAR_LC('\n')
6242                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6243                 b2 = (NEXTCHR_IS_EOS)
6244                     ? isWORDCHAR_LC('\n')
6245                     : isWORDCHAR_LC(nextchr);
6246             }
6247             if (to_complement ^ (b1 == b2)) {
6248                 sayNO;
6249             }
6250             break;
6251         }
6252
6253         case NBOUND:  /*  /\B/   */
6254             to_complement = 1;
6255             /* FALLTHROUGH */
6256
6257         case BOUND:   /*  /\b/   */
6258             if (utf8_target) {
6259                 goto bound_utf8;
6260             }
6261             goto bound_ascii_match_only;
6262
6263         case NBOUNDA: /*  /\B/a  */
6264             to_complement = 1;
6265             /* FALLTHROUGH */
6266
6267         case BOUNDA:  /*  /\b/a  */
6268         {
6269             bool b1, b2;
6270
6271           bound_ascii_match_only:
6272             /* Here the string isn't utf8, or is utf8 and only ascii characters
6273              * are to match \w.  In the latter case looking at the byte just
6274              * prior to the current one may be just the final byte of a
6275              * multi-byte character.  This is ok.  There are two cases:
6276              * 1) it is a single byte character, and then the test is doing
6277              *    just what it's supposed to.
6278              * 2) it is a multi-byte character, in which case the final byte is
6279              *    never mistakable for ASCII, and so the test will say it is
6280              *    not a word character, which is the correct answer. */
6281             b1 = (locinput == reginfo->strbeg)
6282                  ? isWORDCHAR_A('\n')
6283                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6284             b2 = (NEXTCHR_IS_EOS)
6285                 ? isWORDCHAR_A('\n')
6286                 : isWORDCHAR_A(nextchr);
6287             if (to_complement ^ (b1 == b2)) {
6288                 sayNO;
6289             }
6290             break;
6291         }
6292
6293         case NBOUNDU: /*  /\B/u  */
6294             to_complement = 1;
6295             /* FALLTHROUGH */
6296
6297         case BOUNDU:  /*  /\b/u  */
6298
6299           boundu:
6300             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6301                 match = FALSE;
6302             }
6303             else if (utf8_target) {
6304               bound_utf8:
6305                 switch((bound_type) FLAGS(scan)) {
6306                     case TRADITIONAL_BOUND:
6307                     {
6308                         bool b1, b2;
6309                         b1 = (locinput == reginfo->strbeg)
6310                              ? 0 /* isWORDCHAR_L1('\n') */
6311                              : isWORDCHAR_utf8_safe(
6312                                                reghop3((U8*)locinput,
6313                                                        -1,
6314                                                        (U8*)(reginfo->strbeg)),
6315                                                     (U8*) reginfo->strend);
6316                         b2 = (NEXTCHR_IS_EOS)
6317                             ? 0 /* isWORDCHAR_L1('\n') */
6318                             : isWORDCHAR_utf8_safe((U8*)locinput,
6319                                                    (U8*) reginfo->strend);
6320                         match = cBOOL(b1 != b2);
6321                         break;
6322                     }
6323                     case GCB_BOUND:
6324                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6325                             match = TRUE; /* GCB always matches at begin and
6326                                              end */
6327                         }
6328                         else {
6329                             /* Find the gcb values of previous and current
6330                              * chars, then see if is a break point */
6331                             match = isGCB(getGCB_VAL_UTF8(
6332                                                 reghop3((U8*)locinput,
6333                                                         -1,
6334                                                         (U8*)(reginfo->strbeg)),
6335                                                 (U8*) reginfo->strend),
6336                                           getGCB_VAL_UTF8((U8*) locinput,
6337                                                         (U8*) reginfo->strend),
6338                                           (U8*) reginfo->strbeg,
6339                                           (U8*) locinput,
6340                                           utf8_target);
6341                         }
6342                         break;
6343
6344                     case LB_BOUND:
6345                         if (locinput == reginfo->strbeg) {
6346                             match = FALSE;
6347                         }
6348                         else if (NEXTCHR_IS_EOS) {
6349                             match = TRUE;
6350                         }
6351                         else {
6352                             match = isLB(getLB_VAL_UTF8(
6353                                                 reghop3((U8*)locinput,
6354                                                         -1,
6355                                                         (U8*)(reginfo->strbeg)),
6356                                                 (U8*) reginfo->strend),
6357                                           getLB_VAL_UTF8((U8*) locinput,
6358                                                         (U8*) reginfo->strend),
6359                                           (U8*) reginfo->strbeg,
6360                                           (U8*) locinput,
6361                                           (U8*) reginfo->strend,
6362                                           utf8_target);
6363                         }
6364                         break;
6365
6366                     case SB_BOUND: /* Always matches at begin and end */
6367                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6368                             match = TRUE;
6369                         }
6370                         else {
6371                             match = isSB(getSB_VAL_UTF8(
6372                                                 reghop3((U8*)locinput,
6373                                                         -1,
6374                                                         (U8*)(reginfo->strbeg)),
6375                                                 (U8*) reginfo->strend),
6376                                           getSB_VAL_UTF8((U8*) locinput,
6377                                                         (U8*) reginfo->strend),
6378                                           (U8*) reginfo->strbeg,
6379                                           (U8*) locinput,
6380                                           (U8*) reginfo->strend,
6381                                           utf8_target);
6382                         }
6383                         break;
6384
6385                     case WB_BOUND:
6386                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6387                             match = TRUE;
6388                         }
6389                         else {
6390                             match = isWB(WB_UNKNOWN,
6391                                          getWB_VAL_UTF8(
6392                                                 reghop3((U8*)locinput,
6393                                                         -1,
6394                                                         (U8*)(reginfo->strbeg)),
6395                                                 (U8*) reginfo->strend),
6396                                           getWB_VAL_UTF8((U8*) locinput,
6397                                                         (U8*) reginfo->strend),
6398                                           (U8*) reginfo->strbeg,
6399                                           (U8*) locinput,
6400                                           (U8*) reginfo->strend,
6401                                           utf8_target);
6402                         }
6403                         break;
6404                 }
6405             }
6406             else {  /* Not utf8 target */
6407                 switch((bound_type) FLAGS(scan)) {
6408                     case TRADITIONAL_BOUND:
6409                     {
6410                         bool b1, b2;
6411                         b1 = (locinput == reginfo->strbeg)
6412                             ? 0 /* isWORDCHAR_L1('\n') */
6413                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6414                         b2 = (NEXTCHR_IS_EOS)
6415                             ? 0 /* isWORDCHAR_L1('\n') */
6416                             : isWORDCHAR_L1(nextchr);
6417                         match = cBOOL(b1 != b2);
6418                         break;
6419                     }
6420
6421                     case GCB_BOUND:
6422                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6423                             match = TRUE; /* GCB always matches at begin and
6424                                              end */
6425                         }
6426                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6427                                    range */
6428                             match =    UCHARAT(locinput - 1) != '\r'
6429                                     || UCHARAT(locinput) != '\n';
6430                         }
6431                         break;
6432
6433                     case LB_BOUND:
6434                         if (locinput == reginfo->strbeg) {
6435                             match = FALSE;
6436                         }
6437                         else if (NEXTCHR_IS_EOS) {
6438                             match = TRUE;
6439                         }
6440                         else {
6441                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6442                                          getLB_VAL_CP(UCHARAT(locinput)),
6443                                          (U8*) reginfo->strbeg,
6444                                          (U8*) locinput,
6445                                          (U8*) reginfo->strend,
6446                                          utf8_target);
6447                         }
6448                         break;
6449
6450                     case SB_BOUND: /* Always matches at begin and end */
6451                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6452                             match = TRUE;
6453                         }
6454                         else {
6455                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6456                                          getSB_VAL_CP(UCHARAT(locinput)),
6457                                          (U8*) reginfo->strbeg,
6458                                          (U8*) locinput,
6459                                          (U8*) reginfo->strend,
6460                                          utf8_target);
6461                         }
6462                         break;
6463
6464                     case WB_BOUND:
6465                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6466                             match = TRUE;
6467                         }
6468                         else {
6469                             match = isWB(WB_UNKNOWN,
6470                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6471                                          getWB_VAL_CP(UCHARAT(locinput)),
6472                                          (U8*) reginfo->strbeg,
6473                                          (U8*) locinput,
6474                                          (U8*) reginfo->strend,
6475                                          utf8_target);
6476                         }
6477                         break;
6478                 }
6479             }
6480
6481             if (to_complement ^ ! match) {
6482                 sayNO;
6483             }
6484             break;
6485
6486         case ANYOFL:  /*  /[abc]/l      */
6487             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6488
6489             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6490             {
6491               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6492             }
6493             /* FALLTHROUGH */
6494         case ANYOFD:  /*   /[abc]/d       */
6495         case ANYOF:  /*   /[abc]/       */
6496             if (NEXTCHR_IS_EOS)
6497                 sayNO;
6498             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6499                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6500                                                                    utf8_target))
6501                     sayNO;
6502                 locinput += UTF8SKIP(locinput);
6503             }
6504             else {
6505                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6506                     sayNO;
6507                 locinput++;
6508             }
6509             break;
6510
6511         case ASCII:
6512             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6513                 sayNO;
6514             }
6515
6516             locinput++;     /* ASCII is always single byte */
6517             break;
6518
6519         case NASCII:
6520             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6521                 sayNO;
6522             }
6523
6524             goto increment_locinput;
6525             break;
6526
6527         /* The argument (FLAGS) to all the POSIX node types is the class number
6528          * */
6529
6530         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6531             to_complement = 1;
6532             /* FALLTHROUGH */
6533
6534         case POSIXL:    /* \w or [:punct:] etc. under /l */
6535             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6536             if (NEXTCHR_IS_EOS)
6537                 sayNO;
6538
6539             /* Use isFOO_lc() for characters within Latin1.  (Note that
6540              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6541              * wouldn't be invariant) */
6542             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6543                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6544                     sayNO;
6545                 }
6546
6547                 locinput++;
6548                 break;
6549             }
6550
6551             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6552                 /* An above Latin-1 code point, or malformed */
6553                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6554                                                        reginfo->strend);
6555                 goto utf8_posix_above_latin1;
6556             }
6557
6558             /* Here is a UTF-8 variant code point below 256 and the target is
6559              * UTF-8 */
6560             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6561                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6562                                             *(locinput + 1))))))
6563             {
6564                 sayNO;
6565             }
6566
6567             goto increment_locinput;
6568
6569         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6570             to_complement = 1;
6571             /* FALLTHROUGH */
6572
6573         case POSIXD:    /* \w or [:punct:] etc. under /d */
6574             if (utf8_target) {
6575                 goto utf8_posix;
6576             }
6577             goto posixa;
6578
6579         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6580
6581             if (NEXTCHR_IS_EOS) {
6582                 sayNO;
6583             }
6584
6585             /* All UTF-8 variants match */
6586             if (! UTF8_IS_INVARIANT(nextchr)) {
6587                 goto increment_locinput;
6588             }
6589
6590             to_complement = 1;
6591             goto join_nposixa;
6592
6593         case POSIXA:    /* \w or [:punct:] etc. under /a */
6594
6595           posixa:
6596             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6597              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6598              * character is a single byte */
6599
6600             if (NEXTCHR_IS_EOS) {
6601                 sayNO;
6602             }
6603
6604           join_nposixa:
6605
6606             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6607                                                                 FLAGS(scan)))))
6608             {
6609                 sayNO;
6610             }
6611
6612             /* Here we are either not in utf8, or we matched a utf8-invariant,
6613              * so the next char is the next byte */
6614             locinput++;
6615             break;
6616
6617         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6618             to_complement = 1;
6619             /* FALLTHROUGH */
6620
6621         case POSIXU:    /* \w or [:punct:] etc. under /u */
6622           utf8_posix:
6623             if (NEXTCHR_IS_EOS) {
6624                 sayNO;
6625             }
6626
6627             /* Use _generic_isCC() for characters within Latin1.  (Note that
6628              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6629              * wouldn't be invariant) */
6630             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6631                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6632                                                            FLAGS(scan)))))
6633                 {
6634                     sayNO;
6635                 }
6636                 locinput++;
6637             }
6638             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6639                 if (! (to_complement
6640                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6641                                                                *(locinput + 1)),
6642                                              FLAGS(scan)))))
6643                 {
6644                     sayNO;
6645                 }
6646                 locinput += 2;
6647             }
6648             else {  /* Handle above Latin-1 code points */
6649               utf8_posix_above_latin1:
6650                 classnum = (_char_class_number) FLAGS(scan);
6651                 if (classnum < _FIRST_NON_SWASH_CC) {
6652
6653                     /* Here, uses a swash to find such code points.  Load if if
6654                      * not done already */
6655                     if (! PL_utf8_swash_ptrs[classnum]) {
6656                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6657                         PL_utf8_swash_ptrs[classnum]
6658                                 = _core_swash_init("utf8",
6659                                         "",
6660                                         &PL_sv_undef, 1, 0,
6661                                         PL_XPosix_ptrs[classnum], &flags);
6662                     }
6663                     if (! (to_complement
6664                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6665                                                (U8 *) locinput, TRUE))))
6666                     {
6667                         sayNO;
6668                     }
6669                 }
6670                 else {  /* Here, uses macros to find above Latin-1 code points */
6671                     switch (classnum) {
6672                         case _CC_ENUM_SPACE:
6673                             if (! (to_complement
6674                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6675                             {
6676                                 sayNO;
6677                             }
6678                             break;
6679                         case _CC_ENUM_BLANK:
6680                             if (! (to_complement
6681                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6682                             {
6683                                 sayNO;
6684                             }
6685                             break;
6686                         case _CC_ENUM_XDIGIT:
6687                             if (! (to_complement
6688                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6689                             {
6690                                 sayNO;
6691                             }
6692                             break;
6693                         case _CC_ENUM_VERTSPACE:
6694                             if (! (to_complement
6695                                             ^ cBOOL(is_VERTWS_high(locinput))))
6696                             {
6697                                 sayNO;
6698                             }
6699                             break;
6700                         default:    /* The rest, e.g. [:cntrl:], can't match
6701                                        above Latin1 */
6702                             if (! to_complement) {
6703                                 sayNO;
6704                             }
6705                             break;
6706                     }
6707                 }
6708                 locinput += UTF8SKIP(locinput);
6709             }
6710             break;
6711
6712         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6713                        a Unicode extended Grapheme Cluster */
6714             if (NEXTCHR_IS_EOS)
6715                 sayNO;
6716             if  (! utf8_target) {
6717
6718                 /* Match either CR LF  or '.', as all the other possibilities
6719                  * require utf8 */
6720                 locinput++;         /* Match the . or CR */
6721                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6722                                        match the LF */
6723                     && locinput < reginfo->strend
6724                     && UCHARAT(locinput) == '\n')
6725                 {
6726                     locinput++;
6727                 }
6728             }
6729             else {
6730
6731                 /* Get the gcb type for the current character */
6732                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6733                                                        (U8*) reginfo->strend);
6734
6735                 /* Then scan through the input until we get to the first
6736                  * character whose type is supposed to be a gcb with the
6737                  * current character.  (There is always a break at the
6738                  * end-of-input) */
6739                 locinput += UTF8SKIP(locinput);
6740                 while (locinput < reginfo->strend) {
6741                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6742                                                          (U8*) reginfo->strend);
6743                     if (isGCB(prev_gcb, cur_gcb,
6744                               (U8*) reginfo->strbeg, (U8*) locinput,
6745                               utf8_target))
6746                     {
6747                         break;
6748                     }
6749
6750                     prev_gcb = cur_gcb;
6751                     locinput += UTF8SKIP(locinput);
6752                 }
6753
6754
6755             }
6756             break;
6757
6758         case NREFFL:  /*  /\g{name}/il  */
6759         {   /* The capture buffer cases.  The ones beginning with N for the
6760                named buffers just convert to the equivalent numbered and
6761                pretend they were called as the corresponding numbered buffer
6762                op.  */
6763             /* don't initialize these in the declaration, it makes C++
6764                unhappy */
6765             const char *s;
6766             char type;
6767             re_fold_t folder;
6768             const U8 *fold_array;
6769             UV utf8_fold_flags;
6770
6771             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6772             folder = foldEQ_locale;
6773             fold_array = PL_fold_locale;
6774             type = REFFL;
6775             utf8_fold_flags = FOLDEQ_LOCALE;
6776             goto do_nref;
6777
6778         case NREFFA:  /*  /\g{name}/iaa  */
6779             folder = foldEQ_latin1;
6780             fold_array = PL_fold_latin1;
6781             type = REFFA;
6782             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6783             goto do_nref;
6784
6785         case NREFFU:  /*  /\g{name}/iu  */
6786             folder = foldEQ_latin1;
6787             fold_array = PL_fold_latin1;
6788             type = REFFU;
6789             utf8_fold_flags = 0;
6790             goto do_nref;
6791
6792         case NREFF:  /*  /\g{name}/i  */
6793             folder = foldEQ;
6794             fold_array = PL_fold;
6795             type = REFF;
6796             utf8_fold_flags = 0;
6797             goto do_nref;
6798
6799         case NREF:  /*  /\g{name}/   */
6800             type = REF;
6801             folder = NULL;
6802             fold_array = NULL;
6803             utf8_fold_flags = 0;
6804           do_nref:
6805
6806             /* For the named back references, find the corresponding buffer
6807              * number */
6808             n = reg_check_named_buff_matched(rex,scan);
6809
6810             if ( ! n ) {
6811                 sayNO;
6812             }
6813             goto do_nref_ref_common;
6814
6815         case REFFL:  /*  /\1/il  */
6816             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6817             folder = foldEQ_locale;
6818             fold_array = PL_fold_locale;
6819             utf8_fold_flags = FOLDEQ_LOCALE;
6820             goto do_ref;
6821
6822         case REFFA:  /*  /\1/iaa  */
6823             folder = foldEQ_latin1;
6824             fold_array = PL_fold_latin1;
6825             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6826             goto do_ref;
6827
6828         case REFFU:  /*  /\1/iu  */
6829             folder = foldEQ_latin1;
6830             fold_array = PL_fold_latin1;
6831             utf8_fold_flags = 0;
6832             goto do_ref;
6833
6834         case REFF:  /*  /\1/i  */
6835             folder = foldEQ;
6836             fold_array = PL_fold;
6837             utf8_fold_flags = 0;
6838             goto do_ref;
6839
6840         case REF:  /*  /\1/    */
6841             folder = NULL;
6842             fold_array = NULL;
6843             utf8_fold_flags = 0;
6844
6845           do_ref:
6846             type = OP(scan);
6847             n = ARG(scan);  /* which paren pair */
6848
6849           do_nref_ref_common:
6850             ln = rex->offs[n].start;
6851             endref = rex->offs[n].end;
6852             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6853             if (rex->lastparen < n || ln == -1 || endref == -1)
6854                 sayNO;                  /* Do not match unless seen CLOSEn. */
6855             if (ln == endref)
6856                 break;
6857
6858             s = reginfo->strbeg + ln;
6859             if (type != REF     /* REF can do byte comparison */
6860                 && (utf8_target || type == REFFU || type == REFFL))
6861             {
6862                 char * limit = reginfo->strend;
6863
6864                 /* This call case insensitively compares the entire buffer
6865                     * at s, with the current input starting at locinput, but
6866                     * not going off the end given by reginfo->strend, and
6867                     * returns in <limit> upon success, how much of the
6868                     * current input was matched */
6869                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
6870                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6871                 {
6872                     sayNO;
6873                 }
6874                 locinput = limit;
6875                 break;
6876             }
6877
6878             /* Not utf8:  Inline the first character, for speed. */
6879             if (!NEXTCHR_IS_EOS &&
6880                 UCHARAT(s) != nextchr &&
6881                 (type == REF ||
6882                  UCHARAT(s) != fold_array[nextchr]))
6883                 sayNO;
6884             ln = endref - ln;
6885             if (locinput + ln > reginfo->strend)
6886                 sayNO;
6887             if (ln > 1 && (type == REF
6888                            ? memNE(s, locinput, ln)
6889                            : ! folder(s, locinput, ln)))
6890                 sayNO;
6891             locinput += ln;
6892             break;
6893         }
6894
6895         case NOTHING: /* null op; e.g. the 'nothing' following
6896                        * the '*' in m{(a+|b)*}' */
6897             break;
6898         case TAIL: /* placeholder while compiling (A|B|C) */
6899             break;
6900
6901 #undef  ST
6902 #define ST st->u.eval
6903 #define CUR_EVAL cur_eval->u.eval
6904
6905         {
6906             SV *ret;
6907             REGEXP *re_sv;
6908             regexp *re;
6909             regexp_internal *rei;
6910             regnode *startpoint;
6911             U32 arg;
6912
6913         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6914             arg= (U32)ARG(scan);
6915             if (cur_eval && cur_eval->locinput == locinput) {
6916                 if ( ++nochange_depth > max_nochange_depth )
6917                     Perl_croak(aTHX_
6918                         "Pattern subroutine nesting without pos change"
6919                         " exceeded limit in regex");
6920             } else {
6921                 nochange_depth = 0;
6922             }
6923             re_sv = rex_sv;
6924             re = rex;
6925             rei = rexi;
6926             startpoint = scan + ARG2L(scan);
6927             EVAL_CLOSE_PAREN_SET( st, arg );
6928             /* Detect infinite recursion
6929              *
6930              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6931              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6932              * So we track the position in the string we are at each time
6933              * we recurse and if we try to enter the same routine twice from
6934              * the same position we throw an error.
6935              */
6936             if ( rex->recurse_locinput[arg] == locinput ) {
6937                 /* FIXME: we should show the regop that is failing as part
6938                  * of the error message. */
6939                 Perl_croak(aTHX_ "Infinite recursion in regex");
6940             } else {
6941                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6942                 rex->recurse_locinput[arg]= locinput;
6943
6944                 DEBUG_r({
6945                     GET_RE_DEBUG_FLAGS_DECL;
6946                     DEBUG_STACK_r({
6947                         Perl_re_exec_indentf( aTHX_
6948                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6949                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6950                         );
6951                     });
6952                 });
6953             }
6954
6955             /* Save all the positions seen so far. */
6956             ST.cp = regcppush(rex, 0, maxopenparen);
6957             REGCP_SET(ST.lastcp);
6958
6959             /* and then jump to the code we share with EVAL */
6960             goto eval_recurse_doit;
6961             /* NOTREACHED */
6962
6963         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
6964             if (cur_eval && cur_eval->locinput==locinput) {
6965                 if ( ++nochange_depth > max_nochange_depth )
6966                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6967             } else {
6968                 nochange_depth = 0;
6969             }
6970             {
6971                 /* execute the code in the {...} */
6972
6973                 dSP;
6974                 IV before;
6975                 OP * const oop = PL_op;
6976                 COP * const ocurcop = PL_curcop;
6977                 OP *nop;
6978                 CV *newcv;
6979
6980                 /* save *all* paren positions */
6981                 regcppush(rex, 0, maxopenparen);
6982                 REGCP_SET(ST.lastcp);
6983
6984                 if (!caller_cv)
6985                     caller_cv = find_runcv(NULL);
6986
6987                 n = ARG(scan);
6988
6989                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6990                     newcv = (ReANY(
6991                                     (REGEXP*)(rexi->data->data[n])
6992                             ))->qr_anoncv;
6993                     nop = (OP*)rexi->data->data[n+1];
6994                 }
6995                 else if (rexi->data->what[n] == 'l') { /* literal code */
6996                     newcv = caller_cv;
6997                     nop = (OP*)rexi->data->data[n];
6998                     assert(CvDEPTH(newcv));
6999                 }
7000                 else {
7001                     /* literal with own CV */
7002                     assert(rexi->data->what[n] == 'L');
7003                     newcv = rex->qr_anoncv;
7004                     nop = (OP*)rexi->data->data[n];
7005                 }
7006
7007                 /* Some notes about MULTICALL and the context and save stacks.
7008                  *
7009                  * In something like
7010                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7011                  * since codeblocks don't introduce a new scope (so that
7012                  * local() etc accumulate), at the end of a successful
7013                  * match there will be a SAVEt_CLEARSV on the savestack
7014                  * for each of $x, $y, $z. If the three code blocks above
7015                  * happen to have come from different CVs (e.g. via
7016                  * embedded qr//s), then we must ensure that during any
7017                  * savestack unwinding, PL_comppad always points to the
7018                  * right pad at each moment. We achieve this by
7019                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7020                  * there is a change of pad.
7021                  * In theory whenever we call a code block, we should
7022                  * push a CXt_SUB context, then pop it on return from
7023                  * that code block. This causes a bit of an issue in that
7024                  * normally popping a context also clears the savestack
7025                  * back to cx->blk_oldsaveix, but here we specifically
7026                  * don't want to clear the save stack on exit from the
7027                  * code block.
7028                  * Also for efficiency we don't want to keep pushing and
7029                  * popping the single SUB context as we backtrack etc.
7030                  * So instead, we push a single context the first time
7031                  * we need, it, then hang onto it until the end of this
7032                  * function. Whenever we encounter a new code block, we
7033                  * update the CV etc if that's changed. During the times
7034                  * in this function where we're not executing a code
7035                  * block, having the SUB context still there is a bit
7036                  * naughty - but we hope that no-one notices.
7037                  * When the SUB context is initially pushed, we fake up
7038                  * cx->blk_oldsaveix to be as if we'd pushed this context
7039                  * on first entry to S_regmatch rather than at some random
7040                  * point during the regexe execution. That way if we
7041                  * croak, popping the context stack will ensure that
7042                  * *everything* SAVEd by this function is undone and then
7043                  * the context popped, rather than e.g., popping the
7044                  * context (and restoring the original PL_comppad) then
7045                  * popping more of the savestack and restoring a bad
7046                  * PL_comppad.
7047                  */
7048
7049                 /* If this is the first EVAL, push a MULTICALL. On
7050                  * subsequent calls, if we're executing a different CV, or
7051                  * if PL_comppad has got messed up from backtracking
7052                  * through SAVECOMPPADs, then refresh the context.
7053                  */
7054                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7055                 {
7056                     U8 flags = (CXp_SUB_RE |
7057                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7058                     SAVECOMPPAD();
7059                     if (last_pushed_cv) {
7060                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7061                     }
7062                     else {
7063                         PUSH_MULTICALL_FLAGS(newcv, flags);
7064                     }
7065                     /* see notes above */
7066                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7067
7068                     last_pushed_cv = newcv;
7069                 }
7070                 else {
7071                     /* these assignments are just to silence compiler
7072                      * warnings */
7073                     multicall_cop = NULL;
7074                 }
7075                 last_pad = PL_comppad;
7076
7077                 /* the initial nextstate you would normally execute
7078                  * at the start of an eval (which would cause error
7079                  * messages to come from the eval), may be optimised
7080                  * away from the execution path in the regex code blocks;
7081                  * so manually set PL_curcop to it initially */
7082                 {
7083                     OP *o = cUNOPx(nop)->op_first;
7084                     assert(o->op_type == OP_NULL);
7085                     if (o->op_targ == OP_SCOPE) {
7086                         o = cUNOPo->op_first;
7087                     }
7088                     else {
7089                         assert(o->op_targ == OP_LEAVE);
7090                         o = cUNOPo->op_first;
7091                         assert(o->op_type == OP_ENTER);
7092                         o = OpSIBLING(o);
7093                     }
7094
7095                     if (o->op_type != OP_STUB) {
7096                         assert(    o->op_type == OP_NEXTSTATE
7097                                 || o->op_type == OP_DBSTATE
7098                                 || (o->op_type == OP_NULL
7099                                     &&  (  o->op_targ == OP_NEXTSTATE
7100                                         || o->op_targ == OP_DBSTATE
7101                                         )
7102                                     )
7103                         );
7104                         PL_curcop = (COP*)o;
7105                     }
7106                 }
7107                 nop = nop->op_next;
7108
7109                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7110                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7111
7112                 rex->offs[0].end = locinput - reginfo->strbeg;
7113                 if (reginfo->info_aux_eval->pos_magic)
7114                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7115                                   reginfo->sv, reginfo->strbeg,
7116                                   locinput - reginfo->strbeg);
7117
7118                 if (sv_yes_mark) {
7119                     SV *sv_mrk = get_sv("REGMARK", 1);
7120                     sv_setsv(sv_mrk, sv_yes_mark);
7121                 }
7122
7123                 /* we don't use MULTICALL here as we want to call the
7124                  * first op of the block of interest, rather than the
7125                  * first op of the sub. Also, we don't want to free
7126                  * the savestack frame */
7127                 before = (IV)(SP-PL_stack_base);
7128                 PL_op = nop;
7129                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7130                 SPAGAIN;
7131                 if ((IV)(SP-PL_stack_base) == before)
7132                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7133                 else {
7134                     ret = POPs;
7135                     PUTBACK;
7136                 }
7137
7138                 /* before restoring everything, evaluate the returned
7139                  * value, so that 'uninit' warnings don't use the wrong
7140                  * PL_op or pad. Also need to process any magic vars
7141                  * (e.g. $1) *before* parentheses are restored */
7142
7143                 PL_op = NULL;
7144
7145                 re_sv = NULL;
7146                 if (logical == 0)        /*   (?{})/   */
7147                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7148                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7149                     sw = cBOOL(SvTRUE_NN(ret));
7150                     logical = 0;
7151                 }
7152                 else {                   /*  /(??{})  */
7153                     /*  if its overloaded, let the regex compiler handle
7154                      *  it; otherwise extract regex, or stringify  */
7155                     if (SvGMAGICAL(ret))
7156                         ret = sv_mortalcopy(ret);
7157                     if (!SvAMAGIC(ret)) {
7158                         SV *sv = ret;
7159                         if (SvROK(sv))
7160                             sv = SvRV(sv);
7161                         if (SvTYPE(sv) == SVt_REGEXP)
7162                             re_sv = (REGEXP*) sv;
7163                         else if (SvSMAGICAL(ret)) {
7164                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7165                             if (mg)
7166                                 re_sv = (REGEXP *) mg->mg_obj;
7167                         }
7168
7169                         /* force any undef warnings here */
7170                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7171                             ret = sv_mortalcopy(ret);
7172                             (void) SvPV_force_nolen(ret);
7173                         }
7174                     }
7175
7176                 }
7177
7178                 /* *** Note that at this point we don't restore
7179                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7180                  * be used again soon. This is safe as long as nothing
7181                  * in the regexp code uses the pad ! */
7182                 PL_op = oop;
7183                 PL_curcop = ocurcop;
7184                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7185                 PL_curpm_under = PL_curpm;
7186                 PL_curpm = PL_reg_curpm;
7187
7188                 if (logical != 2) {
7189                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7190                     /* NOTREACHED */
7191                 }
7192             }
7193
7194                 /* only /(??{})/  from now on */
7195                 logical = 0;
7196                 {
7197                     /* extract RE object from returned value; compiling if
7198                      * necessary */
7199
7200                     if (re_sv) {
7201                         re_sv = reg_temp_copy(NULL, re_sv);
7202                     }
7203                     else {
7204                         U32 pm_flags = 0;
7205
7206                         if (SvUTF8(ret) && IN_BYTES) {
7207                             /* In use 'bytes': make a copy of the octet
7208                              * sequence, but without the flag on */
7209                             STRLEN len;
7210                             const char *const p = SvPV(ret, len);
7211                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7212                         }
7213                         if (rex->intflags & PREGf_USE_RE_EVAL)
7214                             pm_flags |= PMf_USE_RE_EVAL;
7215
7216                         /* if we got here, it should be an engine which
7217                          * supports compiling code blocks and stuff */
7218                         assert(rex->engine && rex->engine->op_comp);
7219                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7220                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7221                                     rex->engine, NULL, NULL,
7222                                     /* copy /msixn etc to inner pattern */
7223                                     ARG2L(scan),
7224                                     pm_flags);
7225
7226                         if (!(SvFLAGS(ret)
7227                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7228                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7229                             /* This isn't a first class regexp. Instead, it's
7230                                caching a regexp onto an existing, Perl visible
7231                                scalar.  */
7232                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7233                         }
7234                     }
7235                     SAVEFREESV(re_sv);
7236                     re = ReANY(re_sv);
7237                 }
7238                 RXp_MATCH_COPIED_off(re);
7239                 re->subbeg = rex->subbeg;
7240                 re->sublen = rex->sublen;
7241                 re->suboffset = rex->suboffset;
7242                 re->subcoffset = rex->subcoffset;
7243                 re->lastparen = 0;
7244                 re->lastcloseparen = 0;
7245                 rei = RXi_GET(re);
7246                 DEBUG_EXECUTE_r(
7247                     debug_start_match(re_sv, utf8_target, locinput,
7248                                     reginfo->strend, "Matching embedded");
7249                 );
7250                 startpoint = rei->program + 1;
7251                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7252                                              * close_paren only for GOSUB */
7253                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7254                 /* Save all the seen positions so far. */
7255                 ST.cp = regcppush(rex, 0, maxopenparen);
7256                 REGCP_SET(ST.lastcp);
7257                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7258                 maxopenparen = 0;
7259                 /* run the pattern returned from (??{...}) */
7260
7261               eval_recurse_doit: /* Share code with GOSUB below this line
7262                             * At this point we expect the stack context to be
7263                             * set up correctly */
7264
7265                 /* invalidate the S-L poscache. We're now executing a
7266                  * different set of WHILEM ops (and their associated
7267                  * indexes) against the same string, so the bits in the
7268                  * cache are meaningless. Setting maxiter to zero forces
7269                  * the cache to be invalidated and zeroed before reuse.
7270                  * XXX This is too dramatic a measure. Ideally we should
7271                  * save the old cache and restore when running the outer
7272                  * pattern again */
7273                 reginfo->poscache_maxiter = 0;
7274
7275                 /* the new regexp might have a different is_utf8_pat than we do */
7276                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7277
7278                 ST.prev_rex = rex_sv;
7279                 ST.prev_curlyx = cur_curlyx;
7280                 rex_sv = re_sv;
7281                 SET_reg_curpm(rex_sv);
7282                 rex = re;
7283                 rexi = rei;
7284                 cur_curlyx = NULL;
7285                 ST.B = next;
7286                 ST.prev_eval = cur_eval;
7287                 cur_eval = st;
7288                 /* now continue from first node in postoned RE */
7289                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7290                 NOT_REACHED; /* NOTREACHED */
7291         }
7292
7293         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7294             /* note: this is called twice; first after popping B, then A */
7295             DEBUG_STACK_r({
7296                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7297                     depth, cur_eval, ST.prev_eval);
7298             });
7299
7300 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7301             if ( cur_eval && CUR_EVAL.close_paren ) {\
7302                 DEBUG_STACK_r({ \
7303                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7304                         depth,    \
7305                         CUR_EVAL.close_paren - 1,\
7306                         cur_eval, \
7307                         VAL);     \
7308                 });               \
7309                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7310             }
7311
7312             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7313
7314             rex_sv = ST.prev_rex;
7315             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7316             SET_reg_curpm(rex_sv);
7317             rex = ReANY(rex_sv);
7318             rexi = RXi_GET(rex);
7319             {
7320                 /* preserve $^R across LEAVE's. See Bug 121070. */
7321                 SV *save_sv= GvSV(PL_replgv);
7322                 SvREFCNT_inc(save_sv);
7323                 regcpblow(ST.cp); /* LEAVE in disguise */
7324                 sv_setsv(GvSV(PL_replgv), save_sv);
7325                 SvREFCNT_dec(save_sv);
7326             }
7327             cur_eval = ST.prev_eval;
7328             cur_curlyx = ST.prev_curlyx;
7329
7330             /* Invalidate cache. See "invalidate" comment above. */
7331             reginfo->poscache_maxiter = 0;
7332             if ( nochange_depth )
7333                 nochange_depth--;
7334
7335             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7336             sayYES;
7337
7338
7339         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7340             REGCP_UNWIND(ST.lastcp);
7341             sayNO;
7342
7343         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7344             /* note: this is called twice; first after popping B, then A */
7345             DEBUG_STACK_r({
7346                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7347                     depth, cur_eval, ST.prev_eval);
7348             });
7349
7350             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7351
7352             rex_sv = ST.prev_rex;
7353             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7354             SET_reg_curpm(rex_sv);
7355             rex = ReANY(rex_sv);
7356             rexi = RXi_GET(rex);
7357
7358             REGCP_UNWIND(ST.lastcp);
7359             regcppop(rex, &maxopenparen);
7360             cur_eval = ST.prev_eval;
7361             cur_curlyx = ST.prev_curlyx;
7362
7363             /* Invalidate cache. See "invalidate" comment above. */
7364             reginfo->poscache_maxiter = 0;
7365             if ( nochange_depth )
7366                 nochange_depth--;
7367
7368             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7369             sayNO_SILENT;
7370 #undef ST
7371
7372         case OPEN: /*  (  */
7373             n = ARG(scan);  /* which paren pair */
7374             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7375             if (n > maxopenparen)
7376                 maxopenparen = n;
7377             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7378                 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7379                 depth,
7380                 PTR2UV(rex),
7381                 PTR2UV(rex->offs),
7382                 (UV)n,
7383                 (IV)rex->offs[n].start_tmp,
7384                 (UV)maxopenparen
7385             ));
7386             lastopen = n;
7387             break;
7388
7389         case SROPEN: /*  (*SCRIPT_RUN:  */
7390             script_run_begin = locinput;
7391             break;
7392
7393 /* XXX really need to log other places start/end are set too */
7394 #define CLOSE_CAPTURE                                                      \
7395     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7396     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7397     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7398         "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
7399         depth,                                                             \
7400         PTR2UV(rex),                                                       \
7401         PTR2UV(rex->offs),                                                 \
7402         (UV)n,                                                             \
7403         (IV)rex->offs[n].start,                                            \
7404         (IV)rex->offs[n].end                                               \
7405     ))
7406
7407         case CLOSE:  /*  )  */
7408             n = ARG(scan);  /* which paren pair */
7409             CLOSE_CAPTURE;
7410             if (n > rex->lastparen)
7411                 rex->lastparen = n;
7412             rex->lastcloseparen = n;
7413             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7414                 goto fake_end;
7415
7416             break;
7417
7418         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7419           {
7420             /* Checks that every character in the sequence started by SROPEN
7421              * and ending here is one of three scripts: Common, Inherited, and
7422              * possibly one other.  Additionally all decimal digits must come
7423              * from the same consecutive sequence of 10.
7424              *
7425              * Basically, it looks at each character in the sequence to see if
7426              * the above conditions are met; if not it fails.  It uses an
7427              * inversion map to find the enum corresponding to the script of
7428              * each character.  But this is complicated by the fact that a few
7429              * code points can be in any of several scripts.  The data has been
7430              * constructed so that there are additional enum values (all
7431              * negative) for these situations.  The absolute value of those is
7432              * an index into another table which contains pointers to auxiliary
7433              * tables for each such situation.  Each aux array lists all the
7434              * scripts for the given situation.  There is another, parallel,
7435              * table that gives the number of entries in each aux table.  These
7436              * are all defined in charclass_invlists.h */
7437
7438             /* XXX Here are the additional things UTS 39 says could be done:
7439              * Mark Chinese strings as “mixed script” if they contain both
7440              * simplified (S) and traditional (T) Chinese characters, using the
7441              * Unihan data in the Unicode Character Database [UCD].  The
7442              * criterion can only be applied if the language of the string is
7443              * known to be Chinese. So, for example, the string
7444              * “写真だけの結婚式 ” is Japanese, and should not be marked as
7445              * mixed script because of a mixture of S and T characters.
7446              * Testing for whether a character is S or T needs to be based not
7447              * on whether the character has a S or T variant , but whether the
7448              * character is an S or T variant. khw notes that the sample
7449              * contains a Hiragana character, and it is unclear if absence of
7450              * any foreign script marks the script as "Chinese"
7451              *
7452              * Forbid sequences of the same nonspacing mark
7453              *
7454              * Check to see that all the characters are in the sets of exemplar
7455              * characters for at least one language in the Unicode Common
7456              * Locale Data Repository [CLDR]. */
7457
7458
7459             /* Things that match /\d/u */
7460             SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
7461             UV * decimals_array = invlist_array(decimals_invlist);
7462
7463             /* What code point is the digit '0' of the script run? */
7464             UV zero_of_run = 0;
7465             SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
7466             SCX_enum script_of_char = SCX_INVALID;
7467
7468             /* If the script remains not fully determined from iteration to
7469              * iteration, this is the current intersection of the possiblities.
7470              * */
7471             SCX_enum * intersection = NULL;
7472             PERL_UINT_FAST8_T intersection_len = 0;
7473
7474             const U8 * s = (U8 *) script_run_begin;
7475             const U8 * strend = (U8 *) locinput;
7476
7477             assert(s);
7478
7479             /* Look at each character in the sequence */
7480             while (s < strend) {
7481                 UV cp;
7482
7483                 /* The code allows all scripts to use the ASCII digits.  This
7484                  * is because they are used in commerce even in scripts that
7485                  * have their own set.  Hence any ASCII ones found are ok,
7486                  * unless a digit from another set has already been
7487                  * encountered.  (The other digit ranges in Common are not
7488                  * similarly blessed */
7489                 if (UNLIKELY(isDIGIT(*s))) {
7490                     if (zero_of_run > 0) {
7491                         if (zero_of_run != '0') {
7492                             Safefree(intersection);
7493                             sayNO;
7494                         }
7495                     }
7496                     else {
7497                         zero_of_run = '0';
7498                     }
7499                     s++;
7500                     continue;
7501                 }
7502
7503                 /* Here, isn't an ASCII digit.  Find the code point of the
7504                  * character */
7505                 if (utf8_target && ! UTF8_IS_INVARIANT(*s)) {
7506                     Size_t len;
7507                     cp = valid_utf8_to_uvchr((U8 *) s, &len);
7508                     s += len;
7509                 }
7510                 else {
7511                     cp = *(s++);
7512                 }
7513
7514                 /* If is within the range [+0 .. +9] of the script's zero, it
7515                  * also is a digit in that script.  We can skip the rest of
7516                  * this code for this character. */
7517                 if (UNLIKELY(   zero_of_run > 0
7518                              && cp >= zero_of_run
7519                              && cp - zero_of_run <= 9))
7520                 {
7521                     continue;
7522                 }
7523
7524                 /* Find the character's script.  The correct values are
7525                  * hard-coded here for small-enough code points. */
7526                 if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
7527                                        unlikely to change */
7528                     if (       cp > 255
7529                         || (   isALPHA_L1(cp)
7530                             && LIKELY(cp != MICRO_SIGN_NATIVE)))
7531                     {
7532                         script_of_char = SCX_Latin;
7533                     }
7534                     else {
7535                         script_of_char = SCX_Common;
7536                     }
7537                 }
7538                 else {
7539                     script_of_char = _Perl_SCX_invmap[
7540                            _invlist_search(PL_SCX_invlist, cp)];
7541                 }
7542
7543                 /* We arbitrarily accept a single unassigned character, but not
7544                  * in combination with anything else, and not a run of them. */
7545                 if (   UNLIKELY(script_of_run == SCX_Unknown)
7546                     || UNLIKELY(   script_of_run != SCX_INVALID
7547                                 && script_of_char == SCX_Unknown))
7548                 {
7549                     Safefree(intersection);
7550                     sayNO;
7551                 }
7552
7553                 if (UNLIKELY(script_of_char == SCX_Unknown)) {
7554                         script_of_run = SCX_Unknown;
7555                         continue;
7556                 }
7557
7558                 /* We accept 'inherited' script characters currently even at
7559                  * the beginning.  (We know that no characters in Inherited
7560                  * are digits, or we'd have to check for that) */
7561                 if (UNLIKELY(script_of_char == SCX_Inherited)) {
7562                     continue;
7563                 }
7564
7565                 /* If unknown, the run's script is set to the char's */
7566                 if (UNLIKELY(script_of_run == SCX_INVALID)) {
7567                     script_of_run = script_of_char;
7568                 }
7569
7570                 /* All decimal digits must be from the same sequence of 10.
7571                  * Above, we handled any ASCII digits without descending to
7572                  * here.  We also handled the case where we already knew what
7573                  * digit sequence is the one to use, and the character is in
7574                  * that sequence.  Now that we know the script, we can use
7575                  * script_zeros[] to directly find which sequence the script
7576                  * uses, except in a few cases it returns 0 */
7577                 if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
7578                     zero_of_run = script_zeros[script_of_char];
7579                 }
7580
7581                 /* Now we can see if the script of the character is the same as
7582                  * that of the run, or 'Common' which is considered to be in
7583                  * every script */
7584                 if (LIKELY(   script_of_char == script_of_run
7585                            || script_of_char == SCX_Common))
7586                 {   /* By far the most common case */
7587                     goto scripts_match;
7588                 }
7589
7590 #ifndef HAS_SCX_AUX_TABLES
7591
7592                 /* Too early a Unicode version to have a code point belonging
7593                  * to more than one script, so, if the scripts don't exactly
7594                  * match, fail */
7595                 Safefree(intersection);
7596                 sayNO;
7597
7598 #else
7599
7600                 /* Here there is no exact match between the character's script
7601                  * and the run's.  Negative script numbers signify that the
7602                  * value may be any of several scripts, and we need to look at
7603                  * auxiliary information to make our deterimination.  But if
7604                  * both are non-negative, we can fail now */
7605                 if (LIKELY(script_of_char >= 0)) {
7606                     const SCX_enum * search_in;
7607                     PERL_UINT_FAST8_T search_in_len;
7608                     PERL_UINT_FAST8_T i;
7609
7610                     if (LIKELY(script_of_run >= 0)) {
7611                         Safefree(intersection);
7612                         sayNO;
7613                     }
7614
7615                     /* Use any previously constructed set of possible scripts.
7616                      * */
7617                     if (intersection) {
7618                         search_in = intersection;
7619                         search_in_len = intersection_len;
7620                     }
7621                     else {
7622                         search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
7623                         search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
7624                     }
7625
7626                     for (i = 0; i < search_in_len; i++) {
7627                         if (search_in[i] == script_of_char) {
7628                             script_of_run = script_of_char;
7629                             goto scripts_match;
7630                         }
7631                     }
7632
7633                     Safefree(intersection);
7634                     sayNO;
7635                 }
7636                 else if (LIKELY(script_of_run >= 0)) {
7637                     /* script of character could be one of several, but run is
7638                      * a single script */
7639                     const SCX_enum * search_in
7640                                         = SCX_AUX_TABLE_ptrs[-script_of_char];
7641                     const PERL_UINT_FAST8_T search_in_len
7642                                      = SCX_AUX_TABLE_lengths[-script_of_char];
7643                     PERL_UINT_FAST8_T i;
7644
7645                     for (i = 0; i < search_in_len; i++) {
7646                         if (search_in[i] == script_of_run) {
7647                             script_of_char = script_of_run;
7648                             goto scripts_match;
7649                         }
7650                     }
7651
7652                     Safefree(intersection);
7653                     sayNO;
7654                 }
7655                 else {
7656                     /* Both run and char could be in one of several scripts.
7657                      * If the intersection is empty, then this character isn't
7658                      * in this script run.  Otherwise, we need to calculate the
7659                      * intersection to use for future iterations of the loop,
7660                      * unless we are already at the final character */
7661                     const SCX_enum * search_char
7662                                         = SCX_AUX_TABLE_ptrs[-script_of_char];
7663                     const PERL_UINT_FAST8_T char_len
7664                                      = SCX_AUX_TABLE_lengths[-script_of_char];
7665                     const SCX_enum * search_run;
7666                     PERL_UINT_FAST8_T run_len;
7667
7668                     SCX_enum * new_overlap = NULL;
7669                     PERL_UINT_FAST8_T i, j;
7670
7671                     if (intersection) {
7672                         search_run = intersection;
7673                         run_len = intersection_len;
7674                     }
7675                     else {
7676                         search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
7677                         run_len = SCX_AUX_TABLE_lengths[-script_of_run];
7678                     }
7679
7680                     intersection_len = 0;
7681
7682                     for (i = 0; i < run_len; i++) {
7683                         for (j = 0; j < char_len; j++) {
7684                             if (search_run[i] == search_char[j]) {
7685
7686                                 /* Here, the script at i,j matches.  That means
7687                                  * this character is in the run.  But continue
7688                                  * on to find the complete intersection, for
7689                                  * the next loop iteration, and for the digit
7690                                  * check after it.
7691                                  *
7692                                  * On the first found common script, we malloc
7693                                  * space for the intersection list for the worst
7694                                  * case of the intersection, which is the
7695                                  * minimum of the number of scripts remaining
7696                                  * in each set. */
7697                                 if (intersection_len == 0) {
7698                                     Newx(new_overlap,
7699                                          MIN(run_len - i, char_len - j),
7700                                          SCX_enum);
7701                                 }
7702                                 new_overlap[intersection_len++] = search_run[i];
7703                             }
7704                         }
7705                     }
7706
7707                     /* Here we've looked through everything.  If they have no
7708                      * scripts in common, not a run */
7709                     if (intersection_len == 0) {
7710                         Safefree(intersection);
7711                         sayNO;
7712                     }
7713
7714
7715                     /* If there is only a single script in common, set to that.
7716                      * Otherwise, use the intersection going forward */
7717                     Safefree(intersection);
7718                     if (intersection_len == 1) {
7719                         script_of_run = script_of_char = new_overlap[0];
7720                         Safefree(new_overlap);
7721                     }
7722                     else {
7723                         intersection = new_overlap;
7724                     }
7725                 }
7726
7727 #endif
7728
7729           scripts_match: ;
7730
7731                 /* Here, the script of the character is compatible with that of
7732                  * the run.  Either they match exactly, or one or both can be
7733                  * any of several scripts, and the intersection is not empty.
7734                  * If the character is not a decimal digit, we are done with
7735                  * it.  Otherwise, it could still fail if it is from a
7736                  * different set of 10 than seen already (or we may not have
7737                  * seen any, and we need to set the sequence).  If we have
7738                  * determined a single script and that script only has one set
7739                  * of digits (almost all scripts are like that), then this
7740                  * isn't a problem, as any digit must come from the same
7741                  * sequence.  The only scripts that have multiple sequences
7742                  * have been constructed to be 0 in 'script_zeros[]'.
7743                  *
7744                  * Here we check if it is a digit. */
7745                 if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
7746                     && (   (          zero_of_run == 0
7747                             || (  (   script_of_char >= 0
7748                                    && script_zeros[script_of_char] == 0)
7749                                 ||    intersection))))
7750                 {
7751                     SSize_t range_zero_index;
7752                     range_zero_index = _invlist_search(decimals_invlist, cp);
7753                     if (   LIKELY(range_zero_index >= 0)
7754                         && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
7755                     {
7756                         UV range_zero = decimals_array[range_zero_index];
7757                         if (zero_of_run) {
7758                             if (zero_of_run != range_zero) {
7759                                 Safefree(intersection);
7760                                 sayNO;
7761                             }
7762                         }
7763                         else {
7764                             zero_of_run = range_zero;
7765                         }
7766                     }
7767                 }
7768             } /* end of looping through CLOSESR text */
7769
7770             Safefree(intersection);
7771             break;
7772           }
7773
7774         case ACCEPT:  /*  (*ACCEPT)  */
7775             if (scan->flags)
7776                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7777             if (ARG2L(scan)){
7778                 regnode *cursor;
7779                 for (cursor=scan;
7780                      cursor && OP(cursor)!=END;
7781                      cursor=regnext(cursor))
7782                 {
7783                     if ( OP(cursor)==CLOSE ){
7784                         n = ARG(cursor);
7785                         if ( n <= lastopen ) {
7786                             CLOSE_CAPTURE;
7787                             if (n > rex->lastparen)
7788                                 rex->lastparen = n;
7789                             rex->lastcloseparen = n;
7790                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7791                                 break;
7792                         }
7793                     }
7794                 }
7795             }
7796             goto fake_end;
7797             /* NOTREACHED */
7798
7799         case GROUPP:  /*  (?(1))  */
7800             n = ARG(scan);  /* which paren pair */
7801             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7802             break;
7803
7804         case NGROUPP:  /*  (?(<name>))  */
7805             /* reg_check_named_buff_matched returns 0 for no match */
7806             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7807             break;
7808
7809         case INSUBP:   /*  (?(R))  */
7810             n = ARG(scan);
7811             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7812              * of SCAN is already set up as matches a eval.close_paren */
7813             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7814             break;
7815
7816         case DEFINEP:  /*  (?(DEFINE))  */
7817             sw = 0;
7818             break;
7819
7820         case IFTHEN:   /*  (?(cond)A|B)  */
7821             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7822             if (sw)
7823                 next = NEXTOPER(NEXTOPER(scan));
7824             else {
7825                 next = scan + ARG(scan);
7826                 if (OP(next) == IFTHEN) /* Fake one. */
7827                     next = NEXTOPER(NEXTOPER(next));
7828             }
7829             break;
7830
7831         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7832             logical = scan->flags;
7833             break;
7834
7835 /*******************************************************************
7836
7837 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7838 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7839 STAR/PLUS/CURLY/CURLYN are used instead.)
7840
7841 A*B is compiled as <CURLYX><A><WHILEM><B>
7842
7843 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7844 state, which contains the current count, initialised to -1. It also sets
7845 cur_curlyx to point to this state, with any previous value saved in the
7846 state block.
7847
7848 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7849 since the pattern may possibly match zero times (i.e. it's a while {} loop
7850 rather than a do {} while loop).
7851
7852 Each entry to WHILEM represents a successful match of A. The count in the
7853 CURLYX block is incremented, another WHILEM state is pushed, and execution
7854 passes to A or B depending on greediness and the current count.
7855
7856 For example, if matching against the string a1a2a3b (where the aN are
7857 substrings that match /A/), then the match progresses as follows: (the
7858 pushed states are interspersed with the bits of strings matched so far):
7859
7860     <CURLYX cnt=-1>
7861     <CURLYX cnt=0><WHILEM>
7862     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7863     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7864     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7865     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7866
7867 (Contrast this with something like CURLYM, which maintains only a single
7868 backtrack state:
7869
7870     <CURLYM cnt=0> a1
7871     a1 <CURLYM cnt=1> a2
7872     a1 a2 <CURLYM cnt=2> a3
7873     a1 a2 a3 <CURLYM cnt=3> b
7874 )
7875
7876 Each WHILEM state block marks a point to backtrack to upon partial failure
7877 of A or B, and also contains some minor state data related to that
7878 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7879 overall state, such as the count, and pointers to the A and B ops.
7880
7881 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7882 must always point to the *current* CURLYX block, the rules are:
7883
7884 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7885 and set cur_curlyx to point the new block.
7886
7887 When popping the CURLYX block after a successful or unsuccessful match,
7888 restore the previous cur_curlyx.
7889
7890 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7891 to the outer one saved in the CURLYX block.
7892
7893 When popping the WHILEM block after a successful or unsuccessful B match,
7894 restore the previous cur_curlyx.
7895
7896 Here's an example for the pattern (AI* BI)*BO
7897 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7898
7899 cur_
7900 curlyx backtrack stack
7901 ------ ---------------
7902 NULL
7903 CO     <CO prev=NULL> <WO>
7904 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7905 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7906 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7907
7908 At this point the pattern succeeds, and we work back down the stack to
7909 clean up, restoring as we go:
7910
7911 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7912 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7913 CO     <CO prev=NULL> <WO>
7914 NULL
7915
7916 *******************************************************************/
7917
7918 #define ST st->u.curlyx
7919
7920         case CURLYX:    /* start of /A*B/  (for complex A) */
7921         {
7922             /* No need to save/restore up to this paren */
7923             I32 parenfloor = scan->flags;
7924
7925             assert(next); /* keep Coverity happy */
7926             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7927                 next += ARG(next);
7928
7929             /* XXXX Probably it is better to teach regpush to support
7930                parenfloor > maxopenparen ... */
7931             if (parenfloor > (I32)rex->lastparen)
7932                 parenfloor = rex->lastparen; /* Pessimization... */
7933
7934             ST.prev_curlyx= cur_curlyx;
7935             cur_curlyx = st;
7936             ST.cp = PL_savestack_ix;
7937
7938             /* these fields contain the state of the current curly.
7939              * they are accessed by subsequent WHILEMs */
7940             ST.parenfloor = parenfloor;
7941             ST.me = scan;
7942             ST.B = next;
7943             ST.minmod = minmod;
7944             minmod = 0;
7945             ST.count = -1;      /* this will be updated by WHILEM */
7946             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7947
7948             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7949             NOT_REACHED; /* NOTREACHED */
7950         }
7951
7952         case CURLYX_end: /* just finished matching all of A*B */
7953             cur_curlyx = ST.prev_curlyx;
7954             sayYES;
7955             NOT_REACHED; /* NOTREACHED */
7956
7957         case CURLYX_end_fail: /* just failed to match all of A*B */
7958             regcpblow(ST.cp);
7959             cur_curlyx = ST.prev_curlyx;
7960             sayNO;
7961             NOT_REACHED; /* NOTREACHED */
7962
7963
7964 #undef ST
7965 #define ST st->u.whilem
7966
7967         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7968         {
7969             /* see the discussion above about CURLYX/WHILEM */
7970             I32 n;
7971             int min, max;
7972             regnode *A;
7973
7974             assert(cur_curlyx); /* keep Coverity happy */
7975
7976             min = ARG1(cur_curlyx->u.curlyx.me);
7977             max = ARG2(cur_curlyx->u.curlyx.me);
7978             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7979             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7980             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7981             ST.cache_offset = 0;
7982             ST.cache_mask = 0;
7983
7984
7985             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7986                   depth, (long)n, min, max)
7987             );
7988
7989             /* First just match a string of min A's. */
7990
7991             if (n < min) {
7992                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7993                 cur_curlyx->u.curlyx.lastloc = locinput;
7994                 REGCP_SET(ST.lastcp);
7995
7996                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7997                 NOT_REACHED; /* NOTREACHED */
7998             }
7999
8000             /* If degenerate A matches "", assume A done. */
8001
8002             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8003                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
8004                    depth)
8005                 );
8006                 goto do_whilem_B_max;
8007             }
8008
8009             /* super-linear cache processing.
8010              *
8011              * The idea here is that for certain types of CURLYX/WHILEM -
8012              * principally those whose upper bound is infinity (and
8013              * excluding regexes that have things like \1 and other very
8014              * non-regular expresssiony things), then if a pattern like
8015              * /....A*.../ fails and we backtrack to the WHILEM, then we
8016              * make a note that this particular WHILEM op was at string
8017              * position 47 (say) when the rest of pattern failed. Then, if
8018              * we ever find ourselves back at that WHILEM, and at string
8019              * position 47 again, we can just fail immediately rather than
8020              * running the rest of the pattern again.
8021              *
8022              * This is very handy when patterns start to go
8023              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8024              * with a combinatorial explosion of backtracking.
8025              *
8026              * The cache is implemented as a bit array, with one bit per
8027              * string byte position per WHILEM op (up to 16) - so its
8028              * between 0.25 and 2x the string size.
8029              *
8030              * To avoid allocating a poscache buffer every time, we do an
8031              * initially countdown; only after we have  executed a WHILEM
8032              * op (string-length x #WHILEMs) times do we allocate the
8033              * cache.
8034              *
8035              * The top 4 bits of scan->flags byte say how many different
8036              * relevant CURLLYX/WHILEM op pairs there are, while the
8037              * bottom 4-bits is the identifying index number of this
8038              * WHILEM.
8039              */
8040
8041             if (scan->flags) {
8042
8043                 if (!reginfo->poscache_maxiter) {
8044                     /* start the countdown: Postpone detection until we
8045                      * know the match is not *that* much linear. */
8046                     reginfo->poscache_maxiter
8047                         =    (reginfo->strend - reginfo->strbeg + 1)
8048                            * (scan->flags>>4);
8049                     /* possible overflow for long strings and many CURLYX's */
8050                     if (reginfo->poscache_maxiter < 0)
8051                         reginfo->poscache_maxiter = I32_MAX;
8052                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8053                 }
8054
8055                 if (reginfo->poscache_iter-- == 0) {
8056                     /* initialise cache */
8057                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8058                     regmatch_info_aux *const aux = reginfo->info_aux;
8059                     if (aux->poscache) {
8060                         if ((SSize_t)reginfo->poscache_size < size) {
8061                             Renew(aux->poscache, size, char);
8062                             reginfo->poscache_size = size;
8063                         }
8064                         Zero(aux->poscache, size, char);
8065                     }
8066                     else {
8067                         reginfo->poscache_size = size;
8068                         Newxz(aux->poscache, size, char);
8069                     }
8070                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8071       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
8072                               PL_colors[4], PL_colors[5])
8073                     );
8074                 }
8075
8076                 if (reginfo->poscache_iter < 0) {
8077                     /* have we already failed at this position? */
8078                     SSize_t offset, mask;
8079
8080                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8081                     offset  = (scan->flags & 0xf) - 1
8082                                 +   (locinput - reginfo->strbeg)
8083                                   * (scan->flags>>4);
8084                     mask    = 1 << (offset % 8);
8085                     offset /= 8;
8086                     if (reginfo->info_aux->poscache[offset] & mask) {
8087                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
8088                             depth)
8089                         );
8090                         cur_curlyx->u.curlyx.count--;
8091                         sayNO; /* cache records failure */
8092                     }
8093                     ST.cache_offset = offset;
8094                     ST.cache_mask   = mask;
8095                 }
8096             }
8097
8098             /* Prefer B over A for minimal matching. */
8099
8100             if (cur_curlyx->u.curlyx.minmod) {
8101                 ST.save_curlyx = cur_curlyx;
8102                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8103                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8104                                     locinput);
8105                 NOT_REACHED; /* NOTREACHED */
8106             }
8107
8108             /* Prefer A over B for maximal matching. */
8109
8110             if (n < max) { /* More greed allowed? */
8111                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8112                             maxopenparen);
8113                 cur_curlyx->u.curlyx.lastloc = locinput;
8114                 REGCP_SET(ST.lastcp);
8115                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
8116                 NOT_REACHED; /* NOTREACHED */
8117             }
8118             goto do_whilem_B_max;
8119         }
8120         NOT_REACHED; /* NOTREACHED */
8121
8122         case WHILEM_B_min: /* just matched B in a minimal match */
8123         case WHILEM_B_max: /* just matched B in a maximal match */
8124             cur_curlyx = ST.save_curlyx;
8125             sayYES;
8126             NOT_REACHED; /* NOTREACHED */
8127
8128         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8129             cur_curlyx = ST.save_curlyx;
8130             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8131             cur_curlyx->u.curlyx.count--;
8132             CACHEsayNO;
8133             NOT_REACHED; /* NOTREACHED */
8134
8135         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8136             REGCP_UNWIND(ST.lastcp);
8137             regcppop(rex, &maxopenparen);
8138             /* FALLTHROUGH */
8139         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8140             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8141             cur_curlyx->u.curlyx.count--;
8142             CACHEsayNO;
8143             NOT_REACHED; /* NOTREACHED */
8144
8145         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8146             REGCP_UNWIND(ST.lastcp);
8147             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8148             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
8149                 depth)
8150             );
8151           do_whilem_B_max:
8152             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8153                 && ckWARN(WARN_REGEXP)
8154                 && !reginfo->warned)
8155             {
8156                 reginfo->warned = TRUE;
8157                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8158                      "Complex regular subexpression recursion limit (%d) "
8159                      "exceeded",
8160                      REG_INFTY - 1);
8161             }
8162
8163             /* now try B */
8164             ST.save_curlyx = cur_curlyx;
8165             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8166             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8167                                 locinput);
8168             NOT_REACHED; /* NOTREACHED */
8169
8170         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8171             cur_curlyx = ST.save_curlyx;
8172
8173             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8174                 /* Maximum greed exceeded */
8175                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8176                     && ckWARN(WARN_REGEXP)
8177                     && !reginfo->warned)
8178                 {
8179                     reginfo->warned     = TRUE;
8180                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8181                         "Complex regular subexpression recursion "
8182                         "limit (%d) exceeded",
8183                         REG_INFTY - 1);
8184                 }
8185                 cur_curlyx->u.curlyx.count--;
8186                 CACHEsayNO;
8187             }
8188
8189             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
8190             );
8191             /* Try grabbing another A and see if it helps. */
8192             cur_curlyx->u.curlyx.lastloc = locinput;
8193             PUSH_STATE_GOTO(WHILEM_A_min,
8194                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8195                 locinput);
8196             NOT_REACHED; /* NOTREACHED */
8197
8198 #undef  ST
8199 #define ST st->u.branch
8200
8201         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8202             next = scan + ARG(scan);
8203             if (next == scan)
8204                 next = NULL;
8205             scan = NEXTOPER(scan);
8206             /* FALLTHROUGH */
8207
8208         case BRANCH:        /*  /(...|A|...)/ */
8209             scan = NEXTOPER(scan); /* scan now points to inner node */
8210             ST.lastparen = rex->lastparen;
8211             ST.lastcloseparen = rex->lastcloseparen;
8212             ST.next_branch = next;
8213             REGCP_SET(ST.cp);
8214
8215             /* Now go into the branch */
8216             if (has_cutgroup) {
8217                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8218             } else {
8219                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8220             }
8221             NOT_REACHED; /* NOTREACHED */
8222
8223         case CUTGROUP:  /*  /(*THEN)/  */
8224             sv_yes_mark = st->u.mark.mark_name = scan->flags
8225                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8226                 : NULL;
8227             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8228             NOT_REACHED; /* NOTREACHED */
8229
8230         case CUTGROUP_next_fail:
8231             do_cutgroup = 1;
8232             no_final = 1;
8233             if (st->u.mark.mark_name)
8234                 sv_commit = st->u.mark.mark_name;
8235             sayNO;
8236             NOT_REACHED; /* NOTREACHED */
8237
8238         case BRANCH_next:
8239             sayYES;
8240             NOT_REACHED; /* NOTREACHED */
8241
8242         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8243             if (do_cutgroup) {
8244                 do_cutgroup = 0;
8245                 no_final = 0;
8246             }
8247             REGCP_UNWIND(ST.cp);
8248             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8249             scan = ST.next_branch;
8250             /* no more branches? */
8251             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8252                 DEBUG_EXECUTE_r({
8253                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8254                         depth,
8255                         PL_colors[4],
8256                         PL_colors[5] );
8257                 });
8258                 sayNO_SILENT;
8259             }
8260             continue; /* execute next BRANCH[J] op */
8261             /* NOTREACHED */
8262
8263         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8264             minmod = 1;
8265             break;
8266
8267 #undef  ST
8268 #define ST st->u.curlym
8269
8270         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8271
8272             /* This is an optimisation of CURLYX that enables us to push
8273              * only a single backtracking state, no matter how many matches
8274              * there are in {m,n}. It relies on the pattern being constant
8275              * length, with no parens to influence future backrefs
8276              */
8277
8278             ST.me = scan;
8279             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8280
8281             ST.lastparen      = rex->lastparen;
8282             ST.lastcloseparen = rex->lastcloseparen;
8283
8284             /* if paren positive, emulate an OPEN/CLOSE around A */
8285             if (ST.me->flags) {
8286                 U32 paren = ST.me->flags;
8287                 if (paren > maxopenparen)
8288                     maxopenparen = paren;
8289                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8290             }
8291             ST.A = scan;
8292             ST.B = next;
8293             ST.alen = 0;
8294             ST.count = 0;
8295             ST.minmod = minmod;
8296             minmod = 0;
8297             ST.c1 = CHRTEST_UNINIT;
8298             REGCP_SET(ST.cp);
8299
8300             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8301                 goto curlym_do_B;
8302
8303           curlym_do_A: /* execute the A in /A{m,n}B/  */
8304             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8305             NOT_REACHED; /* NOTREACHED */
8306
8307         case CURLYM_A: /* we've just matched an A */
8308             ST.count++;
8309             /* after first match, determine A's length: u.curlym.alen */
8310             if (ST.count == 1) {
8311                 if (reginfo->is_utf8_target) {
8312                     char *s = st->locinput;
8313                     while (s < locinput) {
8314                         ST.alen++;
8315                         s += UTF8SKIP(s);
8316                     }
8317                 }
8318                 else {
8319                     ST.alen = locinput - st->locinput;
8320                 }
8321                 if (ST.alen == 0)
8322                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8323             }
8324             DEBUG_EXECUTE_r(
8325                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8326                           depth, (IV) ST.count, (IV)ST.alen)
8327             );
8328
8329             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8330                 goto fake_end;
8331
8332             {
8333                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8334                 if ( max == REG_INFTY || ST.count < max )
8335                     goto curlym_do_A; /* try to match another A */
8336             }
8337             goto curlym_do_B; /* try to match B */
8338
8339         case CURLYM_A_fail: /* just failed to match an A */
8340             REGCP_UNWIND(ST.cp);
8341
8342
8343             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8344                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8345                 sayNO;
8346
8347           curlym_do_B: /* execute the B in /A{m,n}B/  */
8348             if (ST.c1 == CHRTEST_UNINIT) {
8349                 /* calculate c1 and c2 for possible match of 1st char
8350                  * following curly */
8351                 ST.c1 = ST.c2 = CHRTEST_VOID;
8352                 assert(ST.B);
8353                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8354                     regnode *text_node = ST.B;
8355                     if (! HAS_TEXT(text_node))
8356                         FIND_NEXT_IMPT(text_node);
8357                     /* this used to be
8358
8359                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8360
8361                         But the former is redundant in light of the latter.
8362
8363                         if this changes back then the macro for
8364                         IS_TEXT and friends need to change.
8365                      */
8366                     if (PL_regkind[OP(text_node)] == EXACT) {
8367                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8368                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8369                            reginfo))
8370                         {
8371                             sayNO;
8372                         }
8373                     }
8374                 }
8375             }
8376
8377             DEBUG_EXECUTE_r(
8378                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8379                     depth, (IV)ST.count)
8380                 );
8381             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8382                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8383                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8384                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8385                     {
8386                         /* simulate B failing */
8387                         DEBUG_OPTIMISE_r(
8388                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8389                                 depth,
8390                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8391                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8392                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8393                         );
8394                         state_num = CURLYM_B_fail;
8395                         goto reenter_switch;
8396                     }
8397                 }
8398                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8399                     /* simulate B failing */
8400                     DEBUG_OPTIMISE_r(
8401                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8402                             depth,
8403                             (int) nextchr, ST.c1, ST.c2)
8404                     );
8405                     state_num = CURLYM_B_fail;
8406                     goto reenter_switch;
8407                 }
8408             }
8409
8410             if (ST.me->flags) {
8411                 /* emulate CLOSE: mark current A as captured */
8412                 I32 paren = ST.me->flags;
8413                 if (ST.count) {
8414                     rex->offs[paren].start
8415                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
8416                     rex->offs[paren].end = locinput - reginfo->strbeg;
8417                     if ((U32)paren > rex->lastparen)
8418                         rex->lastparen = paren;
8419                     rex->lastcloseparen = paren;
8420                 }
8421                 else
8422                     rex->offs[paren].end = -1;
8423
8424                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8425                 {
8426                     if (ST.count)
8427                         goto fake_end;
8428                     else
8429                         sayNO;
8430                 }
8431             }
8432
8433             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8434             NOT_REACHED; /* NOTREACHED */
8435
8436         case CURLYM_B_fail: /* just failed to match a B */
8437             REGCP_UNWIND(ST.cp);
8438             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8439             if (ST.minmod) {
8440                 I32 max = ARG2(ST.me);
8441                 if (max != REG_INFTY && ST.count == max)
8442                     sayNO;
8443                 goto curlym_do_A; /* try to match a further A */
8444             }
8445             /* backtrack one A */
8446             if (ST.count == ARG1(ST.me) /* min */)
8447                 sayNO;
8448             ST.count--;
8449             SET_locinput(HOPc(locinput, -ST.alen));
8450             goto curlym_do_B; /* try to match B */
8451
8452 #undef ST
8453 #define ST st->u.curly
8454
8455 #define CURLY_SETPAREN(paren, success) \
8456     if (paren) { \
8457         if (success) { \
8458             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
8459             rex->offs[paren].end = locinput - reginfo->strbeg; \
8460             if (paren > rex->lastparen) \
8461                 rex->lastparen = paren; \
8462             rex->lastcloseparen = paren; \
8463         } \
8464         else { \
8465             rex->offs[paren].end = -1; \
8466             rex->lastparen      = ST.lastparen; \
8467             rex->lastcloseparen = ST.lastcloseparen; \
8468         } \
8469     }
8470
8471         case STAR:              /*  /A*B/ where A is width 1 char */
8472             ST.paren = 0;
8473             ST.min = 0;
8474             ST.max = REG_INFTY;
8475             scan = NEXTOPER(scan);
8476             goto repeat;
8477
8478         case PLUS:              /*  /A+B/ where A is width 1 char */
8479             ST.paren = 0;
8480             ST.min = 1;
8481             ST.max = REG_INFTY;
8482             scan = NEXTOPER(scan);
8483             goto repeat;
8484
8485         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8486             ST.paren = scan->flags;     /* Which paren to set */
8487             ST.lastparen      = rex->lastparen;
8488             ST.lastcloseparen = rex->lastcloseparen;
8489             if (ST.paren > maxopenparen)
8490                 maxopenparen = ST.paren;
8491             ST.min = ARG1(scan);  /* min to match */
8492             ST.max = ARG2(scan);  /* max to match */
8493             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8494             {
8495                 ST.min=1;
8496                 ST.max=1;
8497             }
8498             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8499             goto repeat;
8500
8501         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8502             ST.paren = 0;
8503             ST.min = ARG1(scan);  /* min to match */
8504             ST.max = ARG2(scan);  /* max to match */
8505             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8506           repeat:
8507             /*
8508             * Lookahead to avoid useless match attempts
8509             * when we know what character comes next.
8510             *
8511             * Used to only do .*x and .*?x, but now it allows
8512             * for )'s, ('s and (?{ ... })'s to be in the way
8513             * of the quantifier and the EXACT-like node.  -- japhy
8514             */
8515
8516             assert(ST.min <= ST.max);
8517             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8518                 ST.c1 = ST.c2 = CHRTEST_VOID;
8519             }
8520             else {
8521                 regnode *text_node = next;
8522
8523                 if (! HAS_TEXT(text_node))
8524                     FIND_NEXT_IMPT(text_node);
8525
8526                 if (! HAS_TEXT(text_node))
8527                     ST.c1 = ST.c2 = CHRTEST_VOID;
8528                 else {
8529                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8530                         ST.c1 = ST.c2 = CHRTEST_VOID;
8531                     }
8532                     else {
8533
8534                     /*  Currently we only get here when
8535
8536                         PL_rekind[OP(text_node)] == EXACT
8537
8538                         if this changes back then the macro for IS_TEXT and
8539                         friends need to change. */
8540                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8541                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8542                            reginfo))
8543                         {
8544                             sayNO;
8545                         }
8546                     }
8547                 }
8548             }
8549
8550             ST.A = scan;
8551             ST.B = next;
8552             if (minmod) {
8553                 char *li = locinput;
8554                 minmod = 0;
8555                 if (ST.min &&
8556                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8557                             < ST.min)
8558                     sayNO;
8559                 SET_locinput(li);
8560                 ST.count = ST.min;
8561                 REGCP_SET(ST.cp);
8562                 if (ST.c1 == CHRTEST_VOID)
8563                     goto curly_try_B_min;
8564
8565                 ST.oldloc = locinput;
8566
8567                 /* set ST.maxpos to the furthest point along the
8568                  * string that could possibly match */
8569                 if  (ST.max == REG_INFTY) {
8570                     ST.maxpos = reginfo->strend - 1;
8571                     if (utf8_target)
8572                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8573                             ST.maxpos--;
8574                 }
8575                 else if (utf8_target) {
8576                     int m = ST.max - ST.min;
8577                     for (ST.maxpos = locinput;
8578                          m >0 && ST.maxpos < reginfo->strend; m--)
8579                         ST.maxpos += UTF8SKIP(ST.maxpos);
8580                 }
8581                 else {
8582                     ST.maxpos = locinput + ST.max - ST.min;
8583                     if (ST.maxpos >= reginfo->strend)
8584                         ST.maxpos = reginfo->strend - 1;
8585                 }
8586                 goto curly_try_B_min_known;
8587
8588             }
8589             else {
8590                 /* avoid taking address of locinput, so it can remain
8591                  * a register var */
8592                 char *li = locinput;
8593                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8594                 if (ST.count < ST.min)
8595                     sayNO;
8596                 SET_locinput(li);
8597                 if ((ST.count > ST.min)
8598                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8599                 {
8600                     /* A{m,n} must come at the end of the string, there's
8601                      * no point in backing off ... */
8602                     ST.min = ST.count;
8603                     /* ...except that $ and \Z can match before *and* after
8604                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8605                        We may back off by one in this case. */
8606                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8607                         ST.min--;
8608                 }
8609                 REGCP_SET(ST.cp);
8610                 goto curly_try_B_max;
8611             }
8612             NOT_REACHED; /* NOTREACHED */
8613
8614         case CURLY_B_min_known_fail:
8615             /* failed to find B in a non-greedy match where c1,c2 valid */
8616
8617             REGCP_UNWIND(ST.cp);
8618             if (ST.paren) {
8619                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8620             }
8621             /* Couldn't or didn't -- move forward. */
8622             ST.oldloc = locinput;
8623             if (utf8_target)
8624                 locinput += UTF8SKIP(locinput);
8625             else
8626                 locinput++;
8627             ST.count++;
8628           curly_try_B_min_known:
8629              /* find the next place where 'B' could work, then call B */
8630             {
8631                 int n;
8632                 if (utf8_target) {
8633                     n = (ST.oldloc == locinput) ? 0 : 1;
8634                     if (ST.c1 == ST.c2) {
8635                         /* set n to utf8_distance(oldloc, locinput) */
8636                         while (locinput <= ST.maxpos
8637                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8638                         {
8639                             locinput += UTF8SKIP(locinput);
8640                             n++;
8641                         }
8642                     }
8643                     else {
8644                         /* set n to utf8_distance(oldloc, locinput) */
8645                         while (locinput <= ST.maxpos
8646                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8647                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8648                         {
8649                             locinput += UTF8SKIP(locinput);
8650                             n++;
8651                         }
8652                     }
8653                 }
8654                 else {  /* Not utf8_target */
8655                     if (ST.c1 == ST.c2) {
8656                         locinput = (char *) memchr(locinput,
8657                                                    ST.c1,
8658                                                    ST.maxpos + 1 - locinput);
8659                         if (! locinput) {
8660                             locinput = ST.maxpos + 1;
8661                         }
8662                     }
8663                     else {
8664                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8665
8666                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8667                             while (   locinput <= ST.maxpos
8668                                    && UCHARAT(locinput) != ST.c1
8669                                    && UCHARAT(locinput) != ST.c2)
8670                             {
8671                                 locinput++;
8672                             }
8673                         }
8674                         else {
8675                             /* If c1 and c2 only differ by a single bit, we can
8676                              * avoid a conditional each time through the loop,
8677                              * at the expense of a little preliminary setup and
8678                              * an extra mask each iteration.  By masking out
8679                              * that bit, we match exactly two characters, c1
8680                              * and c2, and so we don't have to test for both.
8681                              * On both ASCII and EBCDIC platforms, most of the
8682                              * ASCII-range and Latin1-range folded equivalents
8683                              * differ only in a single bit, so this is actually
8684                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8685                              * 0x61). */
8686                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8687                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8688                             while (   locinput <= ST.maxpos
8689                                    && (UCHARAT(locinput) & c1_c2_mask)
8690                                                                 != c1_masked)
8691                             {
8692                                 locinput++;
8693                             }
8694                         }
8695                     }
8696                     n = locinput - ST.oldloc;
8697                 }
8698                 if (locinput > ST.maxpos)
8699                     sayNO;
8700                 if (n) {
8701                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8702                      * at b; check that everything between oldloc and
8703                      * locinput matches */
8704                     char *li = ST.oldloc;
8705                     ST.count += n;
8706                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8707                         sayNO;
8708                     assert(n == REG_INFTY || locinput == li);
8709                 }
8710                 CURLY_SETPAREN(ST.paren, ST.count);
8711                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8712                     goto fake_end;
8713                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8714             }
8715             NOT_REACHED; /* NOTREACHED */
8716
8717         case CURLY_B_min_fail:
8718             /* failed to find B in a non-greedy match where c1,c2 invalid */
8719
8720             REGCP_UNWIND(ST.cp);
8721             if (ST.paren) {
8722                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8723             }
8724             /* failed -- move forward one */
8725             {
8726                 char *li = locinput;
8727                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8728                     sayNO;
8729                 }
8730                 locinput = li;
8731             }
8732             {
8733                 ST.count++;
8734                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8735                         ST.count > 0)) /* count overflow ? */
8736                 {
8737                   curly_try_B_min:
8738                     CURLY_SETPAREN(ST.paren, ST.count);
8739                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8740                         goto fake_end;
8741                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8742                 }
8743             }
8744             sayNO;
8745             NOT_REACHED; /* NOTREACHED */
8746
8747           curly_try_B_max:
8748             /* a successful greedy match: now try to match B */
8749             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8750                 goto fake_end;
8751             {
8752                 bool could_match = locinput < reginfo->strend;
8753
8754                 /* If it could work, try it. */
8755                 if (ST.c1 != CHRTEST_VOID && could_match) {
8756                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8757                     {
8758                         could_match = memEQ(locinput,
8759                                             ST.c1_utf8,
8760                                             UTF8SKIP(locinput))
8761                                     || memEQ(locinput,
8762                                              ST.c2_utf8,
8763                                              UTF8SKIP(locinput));
8764                     }
8765                     else {
8766                         could_match = UCHARAT(locinput) == ST.c1
8767                                       || UCHARAT(locinput) == ST.c2;
8768                     }
8769                 }
8770                 if (ST.c1 == CHRTEST_VOID || could_match) {
8771                     CURLY_SETPAREN(ST.paren, ST.count);
8772                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8773                     NOT_REACHED; /* NOTREACHED */
8774                 }
8775             }
8776             /* FALLTHROUGH */
8777
8778         case CURLY_B_max_fail:
8779             /* failed to find B in a greedy match */
8780
8781             REGCP_UNWIND(ST.cp);
8782             if (ST.paren) {
8783                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8784             }
8785             /*  back up. */
8786             if (--ST.count < ST.min)
8787                 sayNO;
8788             locinput = HOPc(locinput, -1);
8789             goto curly_try_B_max;
8790
8791 #undef ST
8792
8793         case END: /*  last op of main pattern  */
8794           fake_end:
8795             if (cur_eval) {
8796                 /* we've just finished A in /(??{A})B/; now continue with B */
8797                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8798                 st->u.eval.prev_rex = rex_sv;           /* inner */
8799
8800                 /* Save *all* the positions. */
8801                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8802                 rex_sv = CUR_EVAL.prev_rex;
8803                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8804                 SET_reg_curpm(rex_sv);
8805                 rex = ReANY(rex_sv);
8806                 rexi = RXi_GET(rex);
8807
8808                 st->u.eval.prev_curlyx = cur_curlyx;
8809                 cur_curlyx = CUR_EVAL.prev_curlyx;
8810
8811                 REGCP_SET(st->u.eval.lastcp);
8812
8813                 /* Restore parens of the outer rex without popping the
8814                  * savestack */
8815                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8816
8817                 st->u.eval.prev_eval = cur_eval;
8818                 cur_eval = CUR_EVAL.prev_eval;
8819                 DEBUG_EXECUTE_r(
8820                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8821                                       depth, cur_eval););
8822                 if ( nochange_depth )
8823                     nochange_depth--;
8824
8825                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8826
8827                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8828                                     locinput); /* match B */
8829             }
8830
8831             if (locinput < reginfo->till) {
8832                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8833                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8834                                       PL_colors[4],
8835                                       (long)(locinput - startpos),
8836                                       (long)(reginfo->till - startpos),
8837                                       PL_colors[5]));
8838
8839                 sayNO_SILENT;           /* Cannot match: too short. */
8840             }
8841             sayYES;                     /* Success! */
8842
8843         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8844             DEBUG_EXECUTE_r(
8845             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8846                 depth, PL_colors[4], PL_colors[5]));
8847             sayYES;                     /* Success! */
8848
8849 #undef  ST
8850 #define ST st->u.ifmatch
8851
8852         {
8853             char *newstart;
8854
8855         case SUSPEND:   /* (?>A) */
8856             ST.wanted = 1;
8857             newstart = locinput;
8858             goto do_ifmatch;
8859
8860         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8861             ST.wanted = 0;
8862             goto ifmatch_trivial_fail_test;
8863
8864         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8865             ST.wanted = 1;
8866           ifmatch_trivial_fail_test:
8867             if (scan->flags) {
8868                 char * const s = HOPBACKc(locinput, scan->flags);
8869                 if (!s) {
8870                     /* trivial fail */
8871                     if (logical) {
8872                         logical = 0;
8873                         sw = 1 - cBOOL(ST.wanted);
8874                     }
8875                     else if (ST.wanted)
8876                         sayNO;
8877                     next = scan + ARG(scan);
8878                     if (next == scan)
8879                         next = NULL;
8880                     break;
8881                 }
8882                 newstart = s;
8883             }
8884             else
8885                 newstart = locinput;
8886
8887           do_ifmatch:
8888             ST.me = scan;
8889             ST.logical = logical;
8890             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8891
8892             /* execute body of (?...A) */
8893             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8894             NOT_REACHED; /* NOTREACHED */
8895         }
8896
8897         case IFMATCH_A_fail: /* body of (?...A) failed */
8898             ST.wanted = !ST.wanted;
8899             /* FALLTHROUGH */
8900
8901         case IFMATCH_A: /* body of (?...A) succeeded */
8902             if (ST.logical) {
8903                 sw = cBOOL(ST.wanted);
8904             }
8905             else if (!ST.wanted)
8906                 sayNO;
8907
8908             if (OP(ST.me) != SUSPEND) {
8909                 /* restore old position except for (?>...) */
8910                 locinput = st->locinput;
8911             }
8912             scan = ST.me + ARG(ST.me);
8913             if (scan == ST.me)
8914                 scan = NULL;
8915             continue; /* execute B */
8916
8917 #undef ST
8918
8919         case LONGJMP: /*  alternative with many branches compiles to
8920                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8921             next = scan + ARG(scan);
8922             if (next == scan)
8923                 next = NULL;
8924             break;
8925
8926         case COMMIT:  /*  (*COMMIT)  */
8927             reginfo->cutpoint = reginfo->strend;
8928             /* FALLTHROUGH */
8929
8930         case PRUNE:   /*  (*PRUNE)   */
8931             if (scan->flags)
8932                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8933             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8934             NOT_REACHED; /* NOTREACHED */
8935
8936         case COMMIT_next_fail:
8937             no_final = 1;
8938             /* FALLTHROUGH */
8939             sayNO;
8940             NOT_REACHED; /* NOTREACHED */
8941
8942         case OPFAIL:   /* (*FAIL)  */
8943             if (scan->flags)
8944                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8945             if (logical) {
8946                 /* deal with (?(?!)X|Y) properly,
8947                  * make sure we trigger the no branch
8948                  * of the trailing IFTHEN structure*/
8949                 sw= 0;
8950                 break;
8951             } else {
8952                 sayNO;
8953             }
8954             NOT_REACHED; /* NOTREACHED */
8955
8956 #define ST st->u.mark
8957         case MARKPOINT: /*  (*MARK:foo)  */
8958             ST.prev_mark = mark_state;
8959             ST.mark_name = sv_commit = sv_yes_mark
8960                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8961             mark_state = st;
8962             ST.mark_loc = locinput;
8963             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8964             NOT_REACHED; /* NOTREACHED */
8965
8966         case MARKPOINT_next:
8967             mark_state = ST.prev_mark;
8968             sayYES;
8969             NOT_REACHED; /* NOTREACHED */
8970
8971         case MARKPOINT_next_fail:
8972             if (popmark && sv_eq(ST.mark_name,popmark))
8973             {
8974                 if (ST.mark_loc > startpoint)
8975                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8976                 popmark = NULL; /* we found our mark */
8977                 sv_commit = ST.mark_name;
8978
8979                 DEBUG_EXECUTE_r({
8980                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%" SVf "...%s\n",
8981                             depth,
8982                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8983                 });
8984             }
8985             mark_state = ST.prev_mark;
8986             sv_yes_mark = mark_state ?
8987                 mark_state->u.mark.mark_name : NULL;
8988             sayNO;
8989             NOT_REACHED; /* NOTREACHED */
8990
8991         case SKIP:  /*  (*SKIP)  */
8992             if (!scan->flags) {
8993                 /* (*SKIP) : if we fail we cut here*/
8994                 ST.mark_name = NULL;
8995                 ST.mark_loc = locinput;
8996                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8997             } else {
8998                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8999                    otherwise do nothing.  Meaning we need to scan
9000                  */
9001                 regmatch_state *cur = mark_state;
9002                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9003
9004                 while (cur) {
9005                     if ( sv_eq( cur->u.mark.mark_name,
9006                                 find ) )
9007                     {
9008                         ST.mark_name = find;
9009                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
9010                     }
9011                     cur = cur->u.mark.prev_mark;
9012                 }
9013             }
9014             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9015             break;
9016
9017         case SKIP_next_fail:
9018             if (ST.mark_name) {
9019                 /* (*CUT:NAME) - Set up to search for the name as we
9020                    collapse the stack*/
9021                 popmark = ST.mark_name;
9022             } else {
9023                 /* (*CUT) - No name, we cut here.*/
9024                 if (ST.mark_loc > startpoint)
9025                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9026                 /* but we set sv_commit to latest mark_name if there
9027                    is one so they can test to see how things lead to this
9028                    cut */
9029                 if (mark_state)
9030                     sv_commit=mark_state->u.mark.mark_name;
9031             }
9032             no_final = 1;
9033             sayNO;
9034             NOT_REACHED; /* NOTREACHED */
9035 #undef ST
9036
9037         case LNBREAK: /* \R */
9038             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
9039                 locinput += n;
9040             } else
9041                 sayNO;
9042             break;
9043
9044         default:
9045             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9046                           PTR2UV(scan), OP(scan));
9047             Perl_croak(aTHX_ "regexp memory corruption");
9048
9049         /* this is a point to jump to in order to increment
9050          * locinput by one character */
9051           increment_locinput:
9052             assert(!NEXTCHR_IS_EOS);
9053             if (utf8_target) {
9054                 locinput += PL_utf8skip[nextchr];
9055                 /* locinput is allowed to go 1 char off the end (signifying
9056                  * EOS), but not 2+ */
9057                 if (locinput > reginfo->strend)
9058                     sayNO;
9059             }
9060             else
9061                 locinput++;
9062             break;
9063
9064         } /* end switch */
9065
9066         /* switch break jumps here */
9067         scan = next; /* prepare to execute the next op and ... */
9068         continue;    /* ... jump back to the top, reusing st */
9069         /* NOTREACHED */
9070
9071       push_yes_state:
9072         /* push a state that backtracks on success */
9073         st->u.yes.prev_yes_state = yes_state;
9074         yes_state = st;
9075         /* FALLTHROUGH */
9076       push_state:
9077         /* push a new regex state, then continue at scan  */
9078         {
9079             regmatch_state *newst;
9080
9081             DEBUG_STACK_r({
9082                 regmatch_state *cur = st;
9083                 regmatch_state *curyes = yes_state;
9084                 U32 i;
9085                 regmatch_slab *slab = PL_regmatch_slab;
9086                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9087                     if (cur < SLAB_FIRST(slab)) {
9088                         slab = slab->prev;
9089                         cur = SLAB_LAST(slab);
9090                     }
9091                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9092                         depth,
9093                         i ? "    " : "push",
9094                         depth - i, PL_reg_name[cur->resume_state],
9095                         (curyes == cur) ? "yes" : ""
9096                     );
9097                     if (curyes == cur)
9098                         curyes = cur->u.yes.prev_yes_state;
9099                 }
9100             } else
9101                 DEBUG_STATE_pp("push")
9102             );
9103             depth++;
9104             st->locinput = locinput;
9105             newst = st+1;
9106             if (newst >  SLAB_LAST(PL_regmatch_slab))
9107                 newst = S_push_slab(aTHX);
9108             PL_regmatch_state = newst;
9109
9110             locinput = pushinput;
9111             st = newst;
9112             continue;
9113             /* NOTREACHED */
9114         }
9115     }
9116 #ifdef SOLARIS_BAD_OPTIMIZER
9117 #  undef PL_charclass
9118 #endif
9119
9120     /*
9121     * We get here only if there's trouble -- normally "case END" is
9122     * the terminating point.
9123     */
9124     Perl_croak(aTHX_ "corrupted regexp pointers");
9125     NOT_REACHED; /* NOTREACHED */
9126
9127   yes:
9128     if (yes_state) {
9129         /* we have successfully completed a subexpression, but we must now
9130          * pop to the state marked by yes_state and continue from there */
9131         assert(st != yes_state);
9132 #ifdef DEBUGGING
9133         while (st != yes_state) {
9134             st--;
9135             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9136                 PL_regmatch_slab = PL_regmatch_slab->prev;
9137                 st = SLAB_LAST(PL_regmatch_slab);
9138             }
9139             DEBUG_STATE_r({
9140                 if (no_final) {
9141                     DEBUG_STATE_pp("pop (no final)");
9142                 } else {
9143                     DEBUG_STATE_pp("pop (yes)");
9144                 }
9145             });
9146             depth--;
9147         }
9148 #else
9149         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9150             || yes_state > SLAB_LAST(PL_regmatch_slab))
9151         {
9152             /* not in this slab, pop slab */
9153             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9154             PL_regmatch_slab = PL_regmatch_slab->prev;
9155             st = SLAB_LAST(PL_regmatch_slab);
9156         }
9157         depth -= (st - yes_state);
9158 #endif
9159         st = yes_state;
9160         yes_state = st->u.yes.prev_yes_state;
9161         PL_regmatch_state = st;
9162
9163         if (no_final)
9164             locinput= st->locinput;
9165         state_num = st->resume_state + no_final;
9166         goto reenter_switch;
9167     }
9168
9169     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9170                           PL_colors[4], PL_colors[5]));
9171
9172     if (reginfo->info_aux_eval) {
9173         /* each successfully executed (?{...}) block does the equivalent of
9174          *   local $^R = do {...}
9175          * When popping the save stack, all these locals would be undone;
9176          * bypass this by setting the outermost saved $^R to the latest
9177          * value */
9178         /* I dont know if this is needed or works properly now.
9179          * see code related to PL_replgv elsewhere in this file.
9180          * Yves
9181          */
9182         if (oreplsv != GvSV(PL_replgv))
9183             sv_setsv(oreplsv, GvSV(PL_replgv));
9184     }
9185     result = 1;
9186     goto final_exit;
9187
9188   no:
9189     DEBUG_EXECUTE_r(
9190         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9191             depth,
9192             PL_colors[4], PL_colors[5])
9193         );
9194
9195   no_silent:
9196     if (no_final) {
9197         if (yes_state) {
9198             goto yes;
9199         } else {
9200             goto final_exit;
9201         }
9202     }
9203     if (depth) {
9204         /* there's a previous state to backtrack to */
9205         st--;
9206         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9207             PL_regmatch_slab = PL_regmatch_slab->prev;
9208             st = SLAB_LAST(PL_regmatch_slab);
9209         }
9210         PL_regmatch_state = st;
9211         locinput= st->locinput;
9212
9213         DEBUG_STATE_pp("pop");
9214         depth--;
9215         if (yes_state == st)
9216             yes_state = st->u.yes.prev_yes_state;
9217
9218         state_num = st->resume_state + 1; /* failure = success + 1 */
9219         PERL_ASYNC_CHECK();
9220         goto reenter_switch;
9221     }
9222     result = 0;
9223
9224   final_exit:
9225     if (rex->intflags & PREGf_VERBARG_SEEN) {
9226         SV *sv_err = get_sv("REGERROR", 1);
9227         SV *sv_mrk = get_sv("REGMARK", 1);
9228         if (result) {
9229             sv_commit = &PL_sv_no;
9230             if (!sv_yes_mark)
9231                 sv_yes_mark = &PL_sv_yes;
9232         } else {
9233             if (!sv_commit)
9234                 sv_commit = &PL_sv_yes;
9235             sv_yes_mark = &PL_sv_no;
9236         }
9237         assert(sv_err);
9238         assert(sv_mrk);
9239         sv_setsv(sv_err, sv_commit);
9240         sv_setsv(sv_mrk, sv_yes_mark);
9241     }
9242
9243
9244     if (last_pushed_cv) {
9245         dSP;
9246         /* see "Some notes about MULTICALL" above */
9247         POP_MULTICALL;
9248         PERL_UNUSED_VAR(SP);
9249     }
9250     else
9251         LEAVE_SCOPE(orig_savestack_ix);
9252
9253     assert(!result ||  locinput - reginfo->strbeg >= 0);
9254     return result ?  locinput - reginfo->strbeg : -1;
9255 }
9256
9257 /*
9258  - regrepeat - repeatedly match something simple, report how many
9259  *
9260  * What 'simple' means is a node which can be the operand of a quantifier like
9261  * '+', or {1,3}
9262  *
9263  * startposp - pointer a pointer to the start position.  This is updated
9264  *             to point to the byte following the highest successful
9265  *             match.
9266  * p         - the regnode to be repeatedly matched against.
9267  * reginfo   - struct holding match state, such as strend
9268  * max       - maximum number of things to match.
9269  * depth     - (for debugging) backtracking depth.
9270  */
9271 STATIC I32
9272 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9273             regmatch_info *const reginfo, I32 max _pDEPTH)
9274 {
9275     char *scan;     /* Pointer to current position in target string */
9276     I32 c;
9277     char *loceol = reginfo->strend;   /* local version */
9278     I32 hardcount = 0;  /* How many matches so far */
9279     bool utf8_target = reginfo->is_utf8_target;
9280     unsigned int to_complement = 0;  /* Invert the result? */
9281     UV utf8_flags;
9282     _char_class_number classnum;
9283
9284     PERL_ARGS_ASSERT_REGREPEAT;
9285
9286     scan = *startposp;
9287     if (max == REG_INFTY)
9288         max = I32_MAX;
9289     else if (! utf8_target && loceol - scan > max)
9290         loceol = scan + max;
9291
9292     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9293      * to the maximum of how far we should go in it (leaving it set to the real
9294      * end, if the maximum permissible would take us beyond that).  This allows
9295      * us to make the loop exit condition that we haven't gone past <loceol> to
9296      * also mean that we haven't exceeded the max permissible count, saving a
9297      * test each time through the loop.  But it assumes that the OP matches a
9298      * single byte, which is true for most of the OPs below when applied to a
9299      * non-UTF-8 target.  Those relatively few OPs that don't have this
9300      * characteristic will have to compensate.
9301      *
9302      * There is no adjustment for UTF-8 targets, as the number of bytes per
9303      * character varies.  OPs will have to test both that the count is less
9304      * than the max permissible (using <hardcount> to keep track), and that we
9305      * are still within the bounds of the string (using <loceol>.  A few OPs
9306      * match a single byte no matter what the encoding.  They can omit the max
9307      * test if, for the UTF-8 case, they do the adjustment that was skipped
9308      * above.
9309      *
9310      * Thus, the code above sets things up for the common case; and exceptional
9311      * cases need extra work; the common case is to make sure <scan> doesn't
9312      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9313      * count doesn't exceed the maximum permissible */
9314
9315     switch (OP(p)) {
9316     case REG_ANY:
9317         if (utf8_target) {
9318             while (scan < loceol && hardcount < max && *scan != '\n') {
9319                 scan += UTF8SKIP(scan);
9320                 hardcount++;
9321             }
9322         } else {
9323             while (scan < loceol && *scan != '\n')
9324                 scan++;
9325         }
9326         break;
9327     case SANY:
9328         if (utf8_target) {
9329             while (scan < loceol && hardcount < max) {
9330                 scan += UTF8SKIP(scan);
9331                 hardcount++;
9332             }
9333         }
9334         else
9335             scan = loceol;
9336         break;
9337     case EXACTL:
9338         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9339         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9340             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9341         }
9342         /* FALLTHROUGH */
9343     case EXACT:
9344         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9345
9346         c = (U8)*STRING(p);
9347
9348         /* Can use a simple loop if the pattern char to match on is invariant
9349          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9350          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9351          * true iff it doesn't matter if the argument is in UTF-8 or not */
9352         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9353             if (utf8_target && loceol - scan > max) {
9354                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9355                  * since here, to match at all, 1 char == 1 byte */
9356                 loceol = scan + max;
9357             }
9358             while (scan < loceol && UCHARAT(scan) == c) {
9359                 scan++;
9360             }
9361         }
9362         else if (reginfo->is_utf8_pat) {
9363             if (utf8_target) {
9364                 STRLEN scan_char_len;
9365
9366                 /* When both target and pattern are UTF-8, we have to do
9367                  * string EQ */
9368                 while (hardcount < max
9369                        && scan < loceol
9370                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9371                        && memEQ(scan, STRING(p), scan_char_len))
9372                 {
9373                     scan += scan_char_len;
9374                     hardcount++;
9375                 }
9376             }
9377             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9378
9379                 /* Target isn't utf8; convert the character in the UTF-8
9380                  * pattern to non-UTF8, and do a simple loop */
9381                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9382                 while (scan < loceol && UCHARAT(scan) == c) {
9383                     scan++;
9384                 }
9385             } /* else pattern char is above Latin1, can't possibly match the
9386                  non-UTF-8 target */
9387         }
9388         else {
9389
9390             /* Here, the string must be utf8; pattern isn't, and <c> is
9391              * different in utf8 than not, so can't compare them directly.
9392              * Outside the loop, find the two utf8 bytes that represent c, and
9393              * then look for those in sequence in the utf8 string */
9394             U8 high = UTF8_TWO_BYTE_HI(c);
9395             U8 low = UTF8_TWO_BYTE_LO(c);
9396
9397             while (hardcount < max
9398                     && scan + 1 < loceol
9399                     && UCHARAT(scan) == high
9400                     && UCHARAT(scan + 1) == low)
9401             {
9402                 scan += 2;
9403                 hardcount++;
9404             }
9405         }
9406         break;
9407
9408     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
9409         assert(! reginfo->is_utf8_pat);
9410         /* FALLTHROUGH */
9411     case EXACTFA:
9412         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9413         goto do_exactf;
9414
9415     case EXACTFL:
9416         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9417         utf8_flags = FOLDEQ_LOCALE;
9418         goto do_exactf;
9419
9420     case EXACTF:   /* This node only generated for non-utf8 patterns */
9421         assert(! reginfo->is_utf8_pat);
9422         utf8_flags = 0;
9423         goto do_exactf;
9424
9425     case EXACTFLU8:
9426         if (! utf8_target) {
9427             break;
9428         }
9429         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9430                                     | FOLDEQ_S2_FOLDS_SANE;
9431         goto do_exactf;
9432
9433     case EXACTFU_SS:
9434     case EXACTFU:
9435         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9436
9437       do_exactf: {
9438         int c1, c2;
9439         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9440
9441         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9442
9443         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9444                                         reginfo))
9445         {
9446             if (c1 == CHRTEST_VOID) {
9447                 /* Use full Unicode fold matching */
9448                 char *tmpeol = reginfo->strend;
9449                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9450                 while (hardcount < max
9451                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9452                                              STRING(p), NULL, pat_len,
9453                                              reginfo->is_utf8_pat, utf8_flags))
9454                 {
9455                     scan = tmpeol;
9456                     tmpeol = reginfo->strend;
9457                     hardcount++;
9458                 }
9459             }
9460             else if (utf8_target) {
9461                 if (c1 == c2) {
9462                     while (scan < loceol
9463                            && hardcount < max
9464                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9465                     {
9466                         scan += UTF8SKIP(scan);
9467                         hardcount++;
9468                     }
9469                 }
9470                 else {
9471                     while (scan < loceol
9472                            && hardcount < max
9473                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9474                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9475                     {
9476                         scan += UTF8SKIP(scan);
9477                         hardcount++;
9478                     }
9479                 }
9480             }
9481             else if (c1 == c2) {
9482                 while (scan < loceol && UCHARAT(scan) == c1) {
9483                     scan++;
9484                 }
9485             }
9486             else {
9487                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9488                  * a conditional each time through the loop if the characters
9489                  * differ only in a single bit, as is the usual situation */
9490                 U8 c1_c2_bits_differing = c1 ^ c2;
9491
9492                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9493                     U8 c1_masked = c1 & ~ c1_c2_bits_differing;
9494                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9495
9496                     while (   scan < loceol
9497                            && (UCHARAT(scan) & c1_c2_mask) == c1_masked)
9498                     {
9499                         scan++;
9500                     }
9501                 }
9502                 else {
9503                     while (    scan < loceol
9504                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9505                     {
9506                         scan++;
9507                     }
9508                 }
9509             }
9510         }
9511         break;
9512     }
9513     case ANYOFL:
9514         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9515
9516         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9517             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9518         }
9519         /* FALLTHROUGH */
9520     case ANYOFD:
9521     case ANYOF:
9522         if (utf8_target) {
9523             while (hardcount < max
9524                    && scan < loceol
9525                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9526             {
9527                 scan += UTF8SKIP(scan);
9528                 hardcount++;
9529             }
9530         }
9531         else if (ANYOF_FLAGS(p)) {
9532             while (scan < loceol
9533                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9534                 scan++;
9535         }
9536         else {
9537             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9538                 scan++;
9539         }
9540         break;
9541
9542     case ASCII:
9543         if (utf8_target && loceol - scan > max) {
9544
9545             /* We didn't adjust <loceol> at the beginning of this routine
9546              * because is UTF-8, but it is actually ok to do so, since here, to
9547              * match, 1 char == 1 byte. */
9548             loceol = scan + max;
9549         }
9550
9551         scan = find_next_non_ascii(scan, loceol, utf8_target);
9552         break;
9553
9554     case NASCII:
9555         if (utf8_target) {
9556             while (     hardcount < max
9557                    &&   scan < loceol
9558                    && ! isASCII_utf8_safe(scan, loceol))
9559             {
9560                 scan += UTF8SKIP(scan);
9561                 hardcount++;
9562             }
9563         }
9564         else {
9565             scan = find_next_ascii(scan, loceol, utf8_target);
9566         }
9567         break;
9568
9569     /* The argument (FLAGS) to all the POSIX node types is the class number */
9570
9571     case NPOSIXL:
9572         to_complement = 1;
9573         /* FALLTHROUGH */
9574
9575     case POSIXL:
9576         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9577         if (! utf8_target) {
9578             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9579                                                                    *scan)))
9580             {
9581                 scan++;
9582             }
9583         } else {
9584             while (hardcount < max && scan < loceol
9585                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9586                                                                   (U8 *) scan)))
9587             {
9588                 scan += UTF8SKIP(scan);
9589                 hardcount++;
9590             }
9591         }
9592         break;
9593
9594     case POSIXD:
9595         if (utf8_target) {
9596             goto utf8_posix;
9597         }
9598         /* FALLTHROUGH */
9599
9600     case POSIXA:
9601         if (utf8_target && loceol - scan > max) {
9602
9603             /* We didn't adjust <loceol> at the beginning of this routine
9604              * because is UTF-8, but it is actually ok to do so, since here, to
9605              * match, 1 char == 1 byte. */
9606             loceol = scan + max;
9607         }
9608         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9609             scan++;
9610         }
9611         break;
9612
9613     case NPOSIXD:
9614         if (utf8_target) {
9615             to_complement = 1;
9616             goto utf8_posix;
9617         }
9618         /* FALLTHROUGH */
9619
9620     case NPOSIXA:
9621         if (! utf8_target) {
9622             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9623                 scan++;
9624             }
9625         }
9626         else {
9627
9628             /* The complement of something that matches only ASCII matches all
9629              * non-ASCII, plus everything in ASCII that isn't in the class. */
9630             while (hardcount < max && scan < loceol
9631                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9632                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9633             {
9634                 scan += UTF8SKIP(scan);
9635                 hardcount++;
9636             }
9637         }
9638         break;
9639
9640     case NPOSIXU:
9641         to_complement = 1;
9642         /* FALLTHROUGH */
9643
9644     case POSIXU:
9645         if (! utf8_target) {
9646             while (scan < loceol && to_complement
9647                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9648             {
9649                 scan++;
9650             }
9651         }
9652         else {
9653           utf8_posix:
9654             classnum = (_char_class_number) FLAGS(p);
9655             if (classnum < _FIRST_NON_SWASH_CC) {
9656
9657                 /* Here, a swash is needed for above-Latin1 code points.
9658                  * Process as many Latin1 code points using the built-in rules.
9659                  * Go to another loop to finish processing upon encountering
9660                  * the first Latin1 code point.  We could do that in this loop
9661                  * as well, but the other way saves having to test if the swash
9662                  * has been loaded every time through the loop: extra space to
9663                  * save a test. */
9664                 while (hardcount < max && scan < loceol) {
9665                     if (UTF8_IS_INVARIANT(*scan)) {
9666                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9667                                                                    classnum))))
9668                         {
9669                             break;
9670                         }
9671                         scan++;
9672                     }
9673                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9674                         if (! (to_complement
9675                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9676                                                                      *(scan + 1)),
9677                                                     classnum))))
9678                         {
9679                             break;
9680                         }
9681                         scan += 2;
9682                     }
9683                     else {
9684                         goto found_above_latin1;
9685                     }
9686
9687                     hardcount++;
9688                 }
9689             }
9690             else {
9691                 /* For these character classes, the knowledge of how to handle
9692                  * every code point is compiled in to Perl via a macro.  This
9693                  * code is written for making the loops as tight as possible.
9694                  * It could be refactored to save space instead */
9695                 switch (classnum) {
9696                     case _CC_ENUM_SPACE:
9697                         while (hardcount < max
9698                                && scan < loceol
9699                                && (to_complement
9700                                    ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9701                         {
9702                             scan += UTF8SKIP(scan);
9703                             hardcount++;
9704                         }
9705                         break;
9706                     case _CC_ENUM_BLANK:
9707                         while (hardcount < max
9708                                && scan < loceol
9709                                && (to_complement
9710                                     ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9711                         {
9712                             scan += UTF8SKIP(scan);
9713                             hardcount++;
9714                         }
9715                         break;
9716                     case _CC_ENUM_XDIGIT:
9717                         while (hardcount < max
9718                                && scan < loceol
9719                                && (to_complement
9720                                    ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9721                         {
9722                             scan += UTF8SKIP(scan);
9723                             hardcount++;
9724                         }
9725                         break;
9726                     case _CC_ENUM_VERTSPACE:
9727                         while (hardcount < max
9728                                && scan < loceol
9729                                && (to_complement
9730                                    ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9731                         {
9732                             scan += UTF8SKIP(scan);
9733                             hardcount++;
9734                         }
9735                         break;
9736                     case _CC_ENUM_CNTRL:
9737                         while (hardcount < max
9738                                && scan < loceol
9739                                && (to_complement
9740                                    ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9741                         {
9742                             scan += UTF8SKIP(scan);
9743                             hardcount++;
9744                         }
9745                         break;
9746                     default:
9747                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9748                 }
9749             }
9750         }
9751         break;
9752
9753       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9754
9755         /* Load the swash if not already present */
9756         if (! PL_utf8_swash_ptrs[classnum]) {
9757             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9758             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9759                                         "utf8",
9760                                         "",
9761                                         &PL_sv_undef, 1, 0,
9762                                         PL_XPosix_ptrs[classnum], &flags);
9763         }
9764
9765         while (hardcount < max && scan < loceol
9766                && to_complement ^ cBOOL(_generic_utf8_safe(
9767                                        classnum,
9768                                        scan,
9769                                        loceol,
9770                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9771                                                    (U8 *) scan,
9772                                                    TRUE))))
9773         {
9774             scan += UTF8SKIP(scan);
9775             hardcount++;
9776         }
9777         break;
9778
9779     case LNBREAK:
9780         if (utf8_target) {
9781             while (hardcount < max && scan < loceol &&
9782                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9783                 scan += c;
9784                 hardcount++;
9785             }
9786         } else {
9787             /* LNBREAK can match one or two latin chars, which is ok, but we
9788              * have to use hardcount in this situation, and throw away the
9789              * adjustment to <loceol> done before the switch statement */
9790             loceol = reginfo->strend;
9791             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9792                 scan+=c;
9793                 hardcount++;
9794             }
9795         }
9796         break;
9797
9798     case BOUNDL:
9799     case NBOUNDL:
9800         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9801         /* FALLTHROUGH */
9802     case BOUND:
9803     case BOUNDA:
9804     case BOUNDU:
9805     case EOS:
9806     case GPOS:
9807     case KEEPS:
9808     case NBOUND:
9809     case NBOUNDA:
9810     case NBOUNDU:
9811     case OPFAIL:
9812     case SBOL:
9813     case SEOL:
9814         /* These are all 0 width, so match right here or not at all. */
9815         break;
9816
9817     default:
9818         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9819         NOT_REACHED; /* NOTREACHED */
9820
9821     }
9822
9823     if (hardcount)
9824         c = hardcount;
9825     else
9826         c = scan - *startposp;
9827     *startposp = scan;
9828
9829     DEBUG_r({
9830         GET_RE_DEBUG_FLAGS_DECL;
9831         DEBUG_EXECUTE_r({
9832             SV * const prop = sv_newmortal();
9833             regprop(prog, prop, p, reginfo, NULL);
9834             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9835                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9836         });
9837     });
9838
9839     return(c);
9840 }
9841
9842
9843 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9844 /*
9845 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9846 create a copy so that changes the caller makes won't change the shared one.
9847 If <altsvp> is non-null, will return NULL in it, for back-compat.
9848  */
9849 SV *
9850 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9851 {
9852     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9853
9854     if (altsvp) {
9855         *altsvp = NULL;
9856     }
9857
9858     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9859 }
9860
9861 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9862
9863 /*
9864  - reginclass - determine if a character falls into a character class
9865
9866   n is the ANYOF-type regnode
9867   p is the target string
9868   p_end points to one byte beyond the end of the target string
9869   utf8_target tells whether p is in UTF-8.
9870
9871   Returns true if matched; false otherwise.
9872
9873   Note that this can be a synthetic start class, a combination of various
9874   nodes, so things you think might be mutually exclusive, such as locale,
9875   aren't.  It can match both locale and non-locale
9876
9877  */
9878
9879 STATIC bool
9880 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9881 {
9882     dVAR;
9883     const char flags = ANYOF_FLAGS(n);
9884     bool match = FALSE;
9885     UV c = *p;
9886
9887     PERL_ARGS_ASSERT_REGINCLASS;
9888
9889     /* If c is not already the code point, get it.  Note that
9890      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9891     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9892         STRLEN c_len = 0;
9893         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9894         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9895         if (c_len == (STRLEN)-1) {
9896             _force_out_malformed_utf8_message(p, p_end,
9897                                               utf8n_flags,
9898                                               1 /* 1 means die */ );
9899             NOT_REACHED; /* NOTREACHED */
9900         }
9901         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9902             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9903         }
9904     }
9905
9906     /* If this character is potentially in the bitmap, check it */
9907     if (c < NUM_ANYOF_CODE_POINTS) {
9908         if (ANYOF_BITMAP_TEST(n, c))
9909             match = TRUE;
9910         else if ((flags
9911                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9912                   && OP(n) == ANYOFD
9913                   && ! utf8_target
9914                   && ! isASCII(c))
9915         {
9916             match = TRUE;
9917         }
9918         else if (flags & ANYOF_LOCALE_FLAGS) {
9919             if ((flags & ANYOFL_FOLD)
9920                 && c < 256
9921                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9922             {
9923                 match = TRUE;
9924             }
9925             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9926                      && c < 256
9927             ) {
9928
9929                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9930                  * if the class includes the Posix character class given by
9931                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9932                  * complemented Posix class given by int(bit/2).  So we loop
9933                  * through the bits, each time changing whether we complement
9934                  * the result or not.  Suppose for the sake of illustration
9935                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9936                  * is set, it means there is a match for this ANYOF node if the
9937                  * character is in the class given by the expression (0 / 2 = 0
9938                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9939                  * and since 'to_complement' is 0, the result will stay TRUE,
9940                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9941                  * bit 1 is 1.  That means there is a match if the character
9942                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9943                  * but will on bit 1.  On the second iteration 'to_complement'
9944                  * will be 1, so the exclusive or will reverse things, so we
9945                  * are testing for \W.  On the third iteration, 'to_complement'
9946                  * will be 0, and we would be testing for \s; the fourth
9947                  * iteration would test for \S, etc.
9948                  *
9949                  * Note that this code assumes that all the classes are closed
9950                  * under folding.  For example, if a character matches \w, then
9951                  * its fold does too; and vice versa.  This should be true for
9952                  * any well-behaved locale for all the currently defined Posix
9953                  * classes, except for :lower: and :upper:, which are handled
9954                  * by the pseudo-class :cased: which matches if either of the
9955                  * other two does.  To get rid of this assumption, an outer
9956                  * loop could be used below to iterate over both the source
9957                  * character, and its fold (if different) */
9958
9959                 int count = 0;
9960                 int to_complement = 0;
9961
9962                 while (count < ANYOF_MAX) {
9963                     if (ANYOF_POSIXL_TEST(n, count)
9964                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9965                     {
9966                         match = TRUE;
9967                         break;
9968                     }
9969                     count++;
9970                     to_complement ^= 1;
9971                 }
9972             }
9973         }
9974     }
9975
9976
9977     /* If the bitmap didn't (or couldn't) match, and something outside the
9978      * bitmap could match, try that. */
9979     if (!match) {
9980         if (c >= NUM_ANYOF_CODE_POINTS
9981             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9982         {
9983             match = TRUE;       /* Everything above the bitmap matches */
9984         }
9985             /* Here doesn't match everything above the bitmap.  If there is
9986              * some information available beyond the bitmap, we may find a
9987              * match in it.  If so, this is most likely because the code point
9988              * is outside the bitmap range.  But rarely, it could be because of
9989              * some other reason.  If so, various flags are set to indicate
9990              * this possibility.  On ANYOFD nodes, there may be matches that
9991              * happen only when the target string is UTF-8; or for other node
9992              * types, because runtime lookup is needed, regardless of the
9993              * UTF-8ness of the target string.  Finally, under /il, there may
9994              * be some matches only possible if the locale is a UTF-8 one. */
9995         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9996                  && (   c >= NUM_ANYOF_CODE_POINTS
9997                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9998                          && (   UNLIKELY(OP(n) != ANYOFD)
9999                              || (utf8_target && ! isASCII_uni(c)
10000 #                               if NUM_ANYOF_CODE_POINTS > 256
10001                                                                  && c < 256
10002 #                               endif
10003                                 )))
10004                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10005                          && IN_UTF8_CTYPE_LOCALE)))
10006         {
10007             SV* only_utf8_locale = NULL;
10008             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10009                                                        &only_utf8_locale, NULL);
10010             if (sw) {
10011                 U8 utf8_buffer[2];
10012                 U8 * utf8_p;
10013                 if (utf8_target) {
10014                     utf8_p = (U8 *) p;
10015                 } else { /* Convert to utf8 */
10016                     utf8_p = utf8_buffer;
10017                     append_utf8_from_native_byte(*p, &utf8_p);
10018                     utf8_p = utf8_buffer;
10019                 }
10020
10021                 if (swash_fetch(sw, utf8_p, TRUE)) {
10022                     match = TRUE;
10023                 }
10024             }
10025             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10026                 match = _invlist_contains_cp(only_utf8_locale, c);
10027             }
10028         }
10029
10030         if (UNICODE_IS_SUPER(c)
10031             && (flags
10032                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10033             && OP(n) != ANYOFD
10034             && ckWARN_d(WARN_NON_UNICODE))
10035         {
10036             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10037                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10038         }
10039     }
10040
10041 #if ANYOF_INVERT != 1
10042     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10043      * use it, don't */
10044 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10045 #endif
10046
10047     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10048     return (flags & ANYOF_INVERT) ^ match;
10049 }
10050
10051 STATIC U8 *
10052 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10053 {
10054     /* return the position 'off' UTF-8 characters away from 's', forward if
10055      * 'off' >= 0, backwards if negative.  But don't go outside of position
10056      * 'lim', which better be < s  if off < 0 */
10057
10058     PERL_ARGS_ASSERT_REGHOP3;
10059
10060     if (off >= 0) {
10061         while (off-- && s < lim) {
10062             /* XXX could check well-formedness here */
10063             U8 *new_s = s + UTF8SKIP(s);
10064             if (new_s > lim) /* lim may be in the middle of a long character */
10065                 return s;
10066             s = new_s;
10067         }
10068     }
10069     else {
10070         while (off++ && s > lim) {
10071             s--;
10072             if (UTF8_IS_CONTINUED(*s)) {
10073                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10074                     s--;
10075                 if (! UTF8_IS_START(*s)) {
10076                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10077                 }
10078             }
10079             /* XXX could check well-formedness here */
10080         }
10081     }
10082     return s;
10083 }
10084
10085 STATIC U8 *
10086 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10087 {
10088     PERL_ARGS_ASSERT_REGHOP4;
10089
10090     if (off >= 0) {
10091         while (off-- && s < rlim) {
10092             /* XXX could check well-formedness here */
10093             s += UTF8SKIP(s);
10094         }
10095     }
10096     else {
10097         while (off++ && s > llim) {
10098             s--;
10099             if (UTF8_IS_CONTINUED(*s)) {
10100                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10101                     s--;
10102                 if (! UTF8_IS_START(*s)) {
10103                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10104                 }
10105             }
10106             /* XXX could check well-formedness here */
10107         }
10108     }
10109     return s;
10110 }
10111
10112 /* like reghop3, but returns NULL on overrun, rather than returning last
10113  * char pos */
10114
10115 STATIC U8 *
10116 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10117 {
10118     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10119
10120     if (off >= 0) {
10121         while (off-- && s < lim) {
10122             /* XXX could check well-formedness here */
10123             s += UTF8SKIP(s);
10124         }
10125         if (off >= 0)
10126             return NULL;
10127     }
10128     else {
10129         while (off++ && s > lim) {
10130             s--;
10131             if (UTF8_IS_CONTINUED(*s)) {
10132                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10133                     s--;
10134                 if (! UTF8_IS_START(*s)) {
10135                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10136                 }
10137             }
10138             /* XXX could check well-formedness here */
10139         }
10140         if (off <= 0)
10141             return NULL;
10142     }
10143     return s;
10144 }
10145
10146
10147 /* when executing a regex that may have (?{}), extra stuff needs setting
10148    up that will be visible to the called code, even before the current
10149    match has finished. In particular:
10150
10151    * $_ is localised to the SV currently being matched;
10152    * pos($_) is created if necessary, ready to be updated on each call-out
10153      to code;
10154    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10155      isn't set until the current pattern is successfully finished), so that
10156      $1 etc of the match-so-far can be seen;
10157    * save the old values of subbeg etc of the current regex, and  set then
10158      to the current string (again, this is normally only done at the end
10159      of execution)
10160 */
10161
10162 static void
10163 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10164 {
10165     MAGIC *mg;
10166     regexp *const rex = ReANY(reginfo->prog);
10167     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10168
10169     eval_state->rex = rex;
10170
10171     if (reginfo->sv) {
10172         /* Make $_ available to executed code. */
10173         if (reginfo->sv != DEFSV) {
10174             SAVE_DEFSV;
10175             DEFSV_set(reginfo->sv);
10176         }
10177
10178         if (!(mg = mg_find_mglob(reginfo->sv))) {
10179             /* prepare for quick setting of pos */
10180             mg = sv_magicext_mglob(reginfo->sv);
10181             mg->mg_len = -1;
10182         }
10183         eval_state->pos_magic = mg;
10184         eval_state->pos       = mg->mg_len;
10185         eval_state->pos_flags = mg->mg_flags;
10186     }
10187     else
10188         eval_state->pos_magic = NULL;
10189
10190     if (!PL_reg_curpm) {
10191         /* PL_reg_curpm is a fake PMOP that we can attach the current
10192          * regex to and point PL_curpm at, so that $1 et al are visible
10193          * within a /(?{})/. It's just allocated once per interpreter the
10194          * first time its needed */
10195         Newxz(PL_reg_curpm, 1, PMOP);
10196 #ifdef USE_ITHREADS
10197         {
10198             SV* const repointer = &PL_sv_undef;
10199             /* this regexp is also owned by the new PL_reg_curpm, which
10200                will try to free it.  */
10201             av_push(PL_regex_padav, repointer);
10202             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10203             PL_regex_pad = AvARRAY(PL_regex_padav);
10204         }
10205 #endif
10206     }
10207     SET_reg_curpm(reginfo->prog);
10208     eval_state->curpm = PL_curpm;
10209     PL_curpm_under = PL_curpm;
10210     PL_curpm = PL_reg_curpm;
10211     if (RXp_MATCH_COPIED(rex)) {
10212         /*  Here is a serious problem: we cannot rewrite subbeg,
10213             since it may be needed if this match fails.  Thus
10214             $` inside (?{}) could fail... */
10215         eval_state->subbeg     = rex->subbeg;
10216         eval_state->sublen     = rex->sublen;
10217         eval_state->suboffset  = rex->suboffset;
10218         eval_state->subcoffset = rex->subcoffset;
10219 #ifdef PERL_ANY_COW
10220         eval_state->saved_copy = rex->saved_copy;
10221 #endif
10222         RXp_MATCH_COPIED_off(rex);
10223     }
10224     else
10225         eval_state->subbeg = NULL;
10226     rex->subbeg = (char *)reginfo->strbeg;
10227     rex->suboffset = 0;
10228     rex->subcoffset = 0;
10229     rex->sublen = reginfo->strend - reginfo->strbeg;
10230 }
10231
10232
10233 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10234
10235 static void
10236 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10237 {
10238     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10239     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10240     regmatch_slab *s;
10241
10242     Safefree(aux->poscache);
10243
10244     if (eval_state) {
10245
10246         /* undo the effects of S_setup_eval_state() */
10247
10248         if (eval_state->subbeg) {
10249             regexp * const rex = eval_state->rex;
10250             rex->subbeg     = eval_state->subbeg;
10251             rex->sublen     = eval_state->sublen;
10252             rex->suboffset  = eval_state->suboffset;
10253             rex->subcoffset = eval_state->subcoffset;
10254 #ifdef PERL_ANY_COW
10255             rex->saved_copy = eval_state->saved_copy;
10256 #endif
10257             RXp_MATCH_COPIED_on(rex);
10258         }
10259         if (eval_state->pos_magic)
10260         {
10261             eval_state->pos_magic->mg_len = eval_state->pos;
10262             eval_state->pos_magic->mg_flags =
10263                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10264                | (eval_state->pos_flags & MGf_BYTES);
10265         }
10266
10267         PL_curpm = eval_state->curpm;
10268     }
10269
10270     PL_regmatch_state = aux->old_regmatch_state;
10271     PL_regmatch_slab  = aux->old_regmatch_slab;
10272
10273     /* free all slabs above current one - this must be the last action
10274      * of this function, as aux and eval_state are allocated within
10275      * slabs and may be freed here */
10276
10277     s = PL_regmatch_slab->next;
10278     if (s) {
10279         PL_regmatch_slab->next = NULL;
10280         while (s) {
10281             regmatch_slab * const osl = s;
10282             s = s->next;
10283             Safefree(osl);
10284         }
10285     }
10286 }
10287
10288
10289 STATIC void
10290 S_to_utf8_substr(pTHX_ regexp *prog)
10291 {
10292     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10293      * on the converted value */
10294
10295     int i = 1;
10296
10297     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10298
10299     do {
10300         if (prog->substrs->data[i].substr
10301             && !prog->substrs->data[i].utf8_substr) {
10302             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10303             prog->substrs->data[i].utf8_substr = sv;
10304             sv_utf8_upgrade(sv);
10305             if (SvVALID(prog->substrs->data[i].substr)) {
10306                 if (SvTAIL(prog->substrs->data[i].substr)) {
10307                     /* Trim the trailing \n that fbm_compile added last
10308                        time.  */
10309                     SvCUR_set(sv, SvCUR(sv) - 1);
10310                     /* Whilst this makes the SV technically "invalid" (as its
10311                        buffer is no longer followed by "\0") when fbm_compile()
10312                        adds the "\n" back, a "\0" is restored.  */
10313                     fbm_compile(sv, FBMcf_TAIL);
10314                 } else
10315                     fbm_compile(sv, 0);
10316             }
10317             if (prog->substrs->data[i].substr == prog->check_substr)
10318                 prog->check_utf8 = sv;
10319         }
10320     } while (i--);
10321 }
10322
10323 STATIC bool
10324 S_to_byte_substr(pTHX_ regexp *prog)
10325 {
10326     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10327      * on the converted value; returns FALSE if can't be converted. */
10328
10329     int i = 1;
10330
10331     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10332
10333     do {
10334         if (prog->substrs->data[i].utf8_substr
10335             && !prog->substrs->data[i].substr) {
10336             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10337             if (! sv_utf8_downgrade(sv, TRUE)) {
10338                 return FALSE;
10339             }
10340             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10341                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10342                     /* Trim the trailing \n that fbm_compile added last
10343                         time.  */
10344                     SvCUR_set(sv, SvCUR(sv) - 1);
10345                     fbm_compile(sv, FBMcf_TAIL);
10346                 } else
10347                     fbm_compile(sv, 0);
10348             }
10349             prog->substrs->data[i].substr = sv;
10350             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10351                 prog->check_substr = sv;
10352         }
10353     } while (i--);
10354
10355     return TRUE;
10356 }
10357
10358 #ifndef PERL_IN_XSUB_RE
10359
10360 bool
10361 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10362 {
10363     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10364      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10365      * the larger string bounded by 'strbeg' and 'strend'.
10366      *
10367      * 'cp' needs to be assigned (if not a future version of the Unicode
10368      * Standard could make it something that combines with adjacent characters,
10369      * so code using it would then break), and there has to be a GCB break
10370      * before and after the character. */
10371
10372     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10373     const U8 * prev_cp_start;
10374
10375     PERL_ARGS_ASSERT__IS_GRAPHEME;
10376
10377     /* Unassigned code points are forbidden */
10378     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10379                                     _invlist_search(PL_Assigned_invlist, cp))))
10380     {
10381         return FALSE;
10382     }
10383
10384     cp_gcb_val = getGCB_VAL_CP(cp);
10385
10386     /* Find the GCB value of the previous code point in the input */
10387     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10388     if (UNLIKELY(prev_cp_start == s)) {
10389         prev_cp_gcb_val = GCB_EDGE;
10390     }
10391     else {
10392         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10393     }
10394
10395     /* And check that is a grapheme boundary */
10396     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10397                 TRUE /* is UTF-8 encoded */ ))
10398     {
10399         return FALSE;
10400     }
10401
10402     /* Similarly verify there is a break between the current character and the
10403      * following one */
10404     s += UTF8SKIP(s);
10405     if (s >= strend) {
10406         next_cp_gcb_val = GCB_EDGE;
10407     }
10408     else {
10409         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10410     }
10411
10412     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10413 }
10414
10415 #endif
10416
10417
10418
10419 /*
10420  * ex: set ts=8 sts=4 sw=4 et:
10421  */