regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 /* Returns a boolean as to whether the input unsigned number is a power of 2
 100  * (2**0, 2**1, etc).  In other words if it has just a single bit set.
 101  * If not, subtracting 1 would leave the uppermost bit set, so the & would
 102  * yield non-zero */
 103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
 104
 105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 106     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 107     goto target;                                                         \
 108 } STMT_END
 109
 110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 111
 112 #ifndef STATIC
 113 #define STATIC  static
 114 #endif
 115
 116 /* Valid only if 'c', the character being looke-up, is an invariant under
 117  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 118  * everything matchable is straight forward in the bitmap */
 119 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 120                                 ? reginclass(prog,p,c,c+1,u)                \
 121                                 : ANYOF_BITMAP_TEST(p,*(c)))
 122
 123 /*
 124  * Forwards.
 125  */
 126
 127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 128
 129 #define HOPc(pos,off) \
 130         (char *)(reginfo->is_utf8_target \
 131             ? reghop3((U8*)pos, off, \
 132                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 133             : (U8*)(pos + off))
 134
 135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 136 #define HOPBACK3(pos, off, lim) \
 137         (reginfo->is_utf8_target                          \
 138             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 139             : (pos - off >= lim)                                 \
 140                 ? (U8*)pos - off                                 \
 141                 : NULL)
 142
 143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 144
 145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 147
 148 /* lim must be +ve. Returns NULL on overshoot */
 149 #define HOPMAYBE3(pos,off,lim) \
 150         (reginfo->is_utf8_target                        \
 151             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 152             : ((U8*)pos + off <= lim)                   \
 153                 ? (U8*)pos + off                        \
 154                 : NULL)
 155
 156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 157  * off must be >=0; args should be vars rather than expressions */
 158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 159     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 160     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 162
 163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 164     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 165     : (U8*)(pos + off))
 166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 167
 168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 169 #define NEXTCHR_IS_EOS (nextchr < 0)
 170
 171 #define SET_nextchr \
 172     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 173
 174 #define SET_locinput(p) \
 175     locinput = (p);  \
 176     SET_nextchr
 177
 178
 179 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 180         if (!swash_ptr) {                                                     \
 181             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 182             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 183                                          1, 0, invlist, &flags);              \
 184             assert(swash_ptr);                                                \
 185         }                                                                     \
 186     } STMT_END
 187
 188 /* If in debug mode, we test that a known character properly matches */
 189 #ifdef DEBUGGING
 190 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 191                                           property_name,                      \
 192                                           invlist,                            \
 193                                           utf8_char_in_property)              \
 194         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 195         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 196 #else
 197 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 198                                           property_name,                      \
 199                                           invlist,                            \
 200                                           utf8_char_in_property)              \
 201         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 202 #endif
 203
 204 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 205                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 206                                         "",                                   \
 207                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 208                                         LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
 209
 210 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 211 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 212
 213 /* for use after a quantifier and before an EXACT-like node -- japhy */
 214 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 215  *
 216  * NOTE that *nothing* that affects backtracking should be in here, specifically
 217  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 218  * node that is in between two EXACT like nodes when ascertaining what the required
 219  * "follow" character is. This should probably be moved to regex compile time
 220  * although it may be done at run time beause of the REF possibility - more
 221  * investigation required. -- demerphq
 222 */
 223 #define JUMPABLE(rn) (                                                             \
 224     OP(rn) == OPEN ||                                                              \
 225     (OP(rn) == CLOSE &&                                                            \
 226      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 227     OP(rn) == EVAL ||                                                              \
 228     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 229     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 230     OP(rn) == KEEPS ||                                                             \
 231     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 232 )
 233 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 234
 235 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 236
 237 #if 0
 238 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 239    we don't need this definition.  XXX These are now out-of-sync*/
 240 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 241 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFAA || OP(rn)==EXACTFAA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 242 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 243
 244 #else
 245 /* ... so we use this as its faster. */
 246 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 247 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFAA || OP(rn) == EXACTFAA_NO_TRIE)
 248 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 249 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 250
 251 #endif
 252
 253 /*
 254   Search for mandatory following text node; for lookahead, the text must
 255   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 256 */
 257 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 258     while (JUMPABLE(rn)) { \
 259         const OPCODE type = OP(rn); \
 260         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 261             rn = NEXTOPER(NEXTOPER(rn)); \
 262         else if (type == PLUS) \
 263             rn = NEXTOPER(rn); \
 264         else if (type == IFMATCH) \
 265             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 266         else rn += NEXT_OFF(rn); \
 267     } \
 268 } STMT_END
 269
 270 #define SLAB_FIRST(s) (&(s)->states[0])
 271 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 272
 273 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 274 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 275 static regmatch_state * S_push_slab(pTHX);
 276
 277 #define REGCP_PAREN_ELEMS 3
 278 #define REGCP_OTHER_ELEMS 3
 279 #define REGCP_FRAME_ELEMS 1
 280 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 281  * are needed for the regexp context stack bookkeeping. */
 282
 283 STATIC CHECKPOINT
 284 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 285 {
 286     const int retval = PL_savestack_ix;
 287     const int paren_elems_to_push =
 288                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 289     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 290     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 291     I32 p;
 292     GET_RE_DEBUG_FLAGS_DECL;
 293
 294     PERL_ARGS_ASSERT_REGCPPUSH;
 295
 296     if (paren_elems_to_push < 0)
 297         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 298                    (int)paren_elems_to_push, (int)maxopenparen,
 299                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 300
 301     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 302         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 303                    " out of range (%lu-%ld)",
 304                    total_elems,
 305                    (unsigned long)maxopenparen,
 306                    (long)parenfloor);
 307
 308     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 309
 310     DEBUG_BUFFERS_r(
 311         if ((int)maxopenparen > (int)parenfloor)
 312             Perl_re_exec_indentf( aTHX_
 313                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 314                 depth,
 315                 PTR2UV(rex),
 316                 PTR2UV(rex->offs)
 317             );
 318     );
 319     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 320 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 321         SSPUSHIV(rex->offs[p].end);
 322         SSPUSHIV(rex->offs[p].start);
 323         SSPUSHINT(rex->offs[p].start_tmp);
 324         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 325             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 326             depth,
 327             (UV)p,
 328             (IV)rex->offs[p].start,
 329             (IV)rex->offs[p].start_tmp,
 330             (IV)rex->offs[p].end
 331         ));
 332     }
 333 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 334     SSPUSHINT(maxopenparen);
 335     SSPUSHINT(rex->lastparen);
 336     SSPUSHINT(rex->lastcloseparen);
 337     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 338
 339     return retval;
 340 }
 341
 342 /* These are needed since we do not localize EVAL nodes: */
 343 #define REGCP_SET(cp)                                           \
 344     DEBUG_STATE_r(                                              \
 345         Perl_re_exec_indentf( aTHX_                             \
 346             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 347             depth, (IV)PL_savestack_ix                          \
 348         )                                                       \
 349     );                                                          \
 350     cp = PL_savestack_ix
 351
 352 #define REGCP_UNWIND(cp)                                        \
 353     DEBUG_STATE_r(                                              \
 354         if (cp != PL_savestack_ix)                              \
 355             Perl_re_exec_indentf( aTHX_                         \
 356                 "Clearing an EVAL scope, savestack=%"           \
 357                 IVdf "..%" IVdf "\n",                           \
 358                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 359             )                                                   \
 360     );                                                          \
 361     regcpblow(cp)
 362
 363 #define UNWIND_PAREN(lp, lcp)               \
 364     for (n = rex->lastparen; n > lp; n--)   \
 365         rex->offs[n].end = -1;              \
 366     rex->lastparen = n;                     \
 367     rex->lastcloseparen = lcp;
 368
 369
 370 STATIC void
 371 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 372 {
 373     UV i;
 374     U32 paren;
 375     GET_RE_DEBUG_FLAGS_DECL;
 376
 377     PERL_ARGS_ASSERT_REGCPPOP;
 378
 379     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 380     i = SSPOPUV;
 381     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 382     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 383     rex->lastcloseparen = SSPOPINT;
 384     rex->lastparen = SSPOPINT;
 385     *maxopenparen_p = SSPOPINT;
 386
 387     i -= REGCP_OTHER_ELEMS;
 388     /* Now restore the parentheses context. */
 389     DEBUG_BUFFERS_r(
 390         if (i || rex->lastparen + 1 <= rex->nparens)
 391             Perl_re_exec_indentf( aTHX_
 392                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 393                 depth,
 394                 PTR2UV(rex),
 395                 PTR2UV(rex->offs)
 396             );
 397     );
 398     paren = *maxopenparen_p;
 399     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 400         SSize_t tmps;
 401         rex->offs[paren].start_tmp = SSPOPINT;
 402         rex->offs[paren].start = SSPOPIV;
 403         tmps = SSPOPIV;
 404         if (paren <= rex->lastparen)
 405             rex->offs[paren].end = tmps;
 406         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 407             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 408             depth,
 409             (UV)paren,
 410             (IV)rex->offs[paren].start,
 411             (IV)rex->offs[paren].start_tmp,
 412             (IV)rex->offs[paren].end,
 413             (paren > rex->lastparen ? "(skipped)" : ""));
 414         );
 415         paren--;
 416     }
 417 #if 1
 418     /* It would seem that the similar code in regtry()
 419      * already takes care of this, and in fact it is in
 420      * a better location to since this code can #if 0-ed out
 421      * but the code in regtry() is needed or otherwise tests
 422      * requiring null fields (pat.t#187 and split.t#{13,14}
 423      * (as of patchlevel 7877)  will fail.  Then again,
 424      * this code seems to be necessary or otherwise
 425      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 426      * --jhi updated by dapm */
 427     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 428         if (i > *maxopenparen_p)
 429             rex->offs[i].start = -1;
 430         rex->offs[i].end = -1;
 431         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 432             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 433             depth,
 434             (UV)i,
 435             (i > *maxopenparen_p) ? "-1" : "  "
 436         ));
 437     }
 438 #endif
 439 }
 440
 441 /* restore the parens and associated vars at savestack position ix,
 442  * but without popping the stack */
 443
 444 STATIC void
 445 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 446 {
 447     I32 tmpix = PL_savestack_ix;
 448     PERL_ARGS_ASSERT_REGCP_RESTORE;
 449
 450     PL_savestack_ix = ix;
 451     regcppop(rex, maxopenparen_p);
 452     PL_savestack_ix = tmpix;
 453 }
 454
 455 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 456
 457 #ifndef PERL_IN_XSUB_RE
 458
 459 bool
 460 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 461 {
 462     /* Returns a boolean as to whether or not 'character' is a member of the
 463      * Posix character class given by 'classnum' that should be equivalent to a
 464      * value in the typedef '_char_class_number'.
 465      *
 466      * Ideally this could be replaced by a just an array of function pointers
 467      * to the C library functions that implement the macros this calls.
 468      * However, to compile, the precise function signatures are required, and
 469      * these may vary from platform to to platform.  To avoid having to figure
 470      * out what those all are on each platform, I (khw) am using this method,
 471      * which adds an extra layer of function call overhead (unless the C
 472      * optimizer strips it away).  But we don't particularly care about
 473      * performance with locales anyway. */
 474
 475     switch ((_char_class_number) classnum) {
 476         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 477         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 478         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 479         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 480         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 481                                         || isUPPER_LC(character);
 482         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 483         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 484         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 485         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 486         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 487         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 488         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 489         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 490         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 491         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 492         default:    /* VERTSPACE should never occur in locales */
 493             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 494     }
 495
 496     NOT_REACHED; /* NOTREACHED */
 497     return FALSE;
 498 }
 499
 500 #endif
 501
 502 STATIC bool
 503 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 504 {
 505     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 506      * 'character' is a member of the Posix character class given by 'classnum'
 507      * that should be equivalent to a value in the typedef
 508      * '_char_class_number'.
 509      *
 510      * This just calls isFOO_lc on the code point for the character if it is in
 511      * the range 0-255.  Outside that range, all characters use Unicode
 512      * rules, ignoring any locale.  So use the Unicode function if this class
 513      * requires a swash, and use the Unicode macro otherwise. */
 514
 515     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 516
 517     if (UTF8_IS_INVARIANT(*character)) {
 518         return isFOO_lc(classnum, *character);
 519     }
 520     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 521         return isFOO_lc(classnum,
 522                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 523     }
 524
 525     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 526
 527     if (classnum < _FIRST_NON_SWASH_CC) {
 528
 529         /* Initialize the swash unless done already */
 530         if (! PL_utf8_swash_ptrs[classnum]) {
 531             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 532             PL_utf8_swash_ptrs[classnum] =
 533                     _core_swash_init("utf8",
 534                                      "",
 535                                      &PL_sv_undef, 1, 0,
 536                                      PL_XPosix_ptrs[classnum], &flags);
 537         }
 538
 539         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 540                                  character,
 541                                  TRUE /* is UTF */ ));
 542     }
 543
 544     switch ((_char_class_number) classnum) {
 545         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 546         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 547         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 548         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 549         default:                 break;
 550     }
 551
 552     return FALSE; /* Things like CNTRL are always below 256 */
 553 }
 554
 555 STATIC char *
 556 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 557 {
 558     /* Returns the position of the first ASCII byte in the sequence between 's'
 559      * and 'send-1' inclusive; returns 'send' if none found */
 560
 561     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 562
 563 #ifndef EBCDIC
 564
 565     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 566
 567                             /* This term is wordsize if subword; 0 if not */
 568                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 569
 570                             /* 'offset' */
 571                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 572     {
 573
 574         /* Process per-byte until reach word boundary.  XXX This loop could be
 575          * eliminated if we knew that this platform had fast unaligned reads */
 576         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 577             if (isASCII(*s)) {
 578                 return s;
 579             }
 580             s++;    /* khw didn't bother creating a separate loop for
 581                        utf8_target */
 582         }
 583
 584         /* Here, we know we have at least one full word to process.  Process
 585          * per-word as long as we have at least a full word left */
 586         do {
 587             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 588             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 589
 590 #if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 591    || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 592
 593                 s += _variant_byte_number(complemented);
 594                 return s;
 595
 596 #else   /* If weird byte order, drop into next loop to do byte-at-a-time
 597            checks. */
 598
 599                 break;
 600 #endif
 601             }
 602
 603             s += PERL_WORDSIZE;
 604
 605         } while (s + PERL_WORDSIZE <= send);
 606     }
 607
 608 #endif
 609
 610     /* Process per-character */
 611     if (utf8_target) {
 612         while (s < send) {
 613             if (isASCII(*s)) {
 614                 return s;
 615             }
 616             s += UTF8SKIP(s);
 617         }
 618     }
 619     else {
 620         while (s < send) {
 621             if (isASCII(*s)) {
 622                 return s;
 623             }
 624             s++;
 625         }
 626     }
 627
 628     return s;
 629 }
 630
 631 STATIC char *
 632 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 633 {
 634     /* Returns the position of the first non-ASCII byte in the sequence between
 635      * 's' and 'send-1' inclusive; returns 'send' if none found */
 636
 637 #ifdef EBCDIC
 638
 639     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 640
 641     if (utf8_target) {
 642         while (s < send) {
 643             if ( ! isASCII(*s)) {
 644                 return s;
 645             }
 646             s += UTF8SKIP(s);
 647         }
 648     }
 649     else {
 650         while (s < send) {
 651             if ( ! isASCII(*s)) {
 652                 return s;
 653             }
 654             s++;
 655         }
 656     }
 657
 658     return s;
 659
 660 #else
 661
 662     const U8 * next_non_ascii = NULL;
 663
 664     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 665     PERL_UNUSED_ARG(utf8_target);
 666
 667     /* On ASCII platforms invariants and ASCII are identical, so if the string
 668      * is entirely invariants, there is no non-ASCII character */
 669     return (is_utf8_invariant_string_loc((U8 *) s,
 670                                          (STRLEN) (send - s),
 671                                          &next_non_ascii))
 672             ? (char *) send
 673             : (char *) next_non_ascii;
 674
 675 #endif
 676
 677 }
 678
 679 STATIC U8 *
 680 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 681 {
 682     /* Returns the position of the first byte in the sequence between 's' and
 683      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 684      * */
 685
 686     PERL_ARGS_ASSERT_FIND_SPAN_END;
 687
 688     assert(send >= s);
 689
 690     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 691                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 692                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 693     {
 694         PERL_UINTMAX_T span_word;
 695
 696         /* Process per-byte until reach word boundary.  XXX This loop could be
 697          * eliminated if we knew that this platform had fast unaligned reads */
 698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 699             if (*s != span_byte) {
 700                 return s;
 701             }
 702             s++;
 703         }
 704
 705         /* Create a word filled with the bytes we are spanning */
 706         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 707
 708         /* Process per-word as long as we have at least a full word left */
 709         do {
 710
 711             /* Keep going if the whole word is composed of 'span_byte's */
 712             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 713                 s += PERL_WORDSIZE;
 714                 continue;
 715             }
 716
 717             /* Here, at least one byte in the word isn't 'span_byte'.  This xor
 718              * leaves 1 bits only in those non-matching bytes */
 719             span_word ^= * (PERL_UINTMAX_T *) s;
 720
 721             /* Make sure the upper bit of each non-matching byte is set.  This
 722              * makes each such byte look like an ASCII platform variant byte */
 723             span_word |= span_word << 1;
 724             span_word |= span_word << 2;
 725             span_word |= span_word << 4;
 726
 727             /* That reduces the problem to what this function solves */
 728             return s + _variant_byte_number(span_word);
 729
 730         } while (s + PERL_WORDSIZE <= send);
 731     }
 732
 733     /* Process the straggler bytes beyond the final word boundary */
 734     while (s < send) {
 735         if (*s != span_byte) {
 736             return s;
 737         }
 738         s++;
 739     }
 740
 741     return s;
 742 }
 743
 744 STATIC U8 *
 745 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 746 {
 747     /* Returns the position of the first byte in the sequence between 's'
 748      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 749      * returns 'send' if none found.  It uses word-level operations instead of
 750      * byte to speed up the process */
 751
 752     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 753
 754     assert(send >= s);
 755     assert((byte & mask) == byte);
 756
 757     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 758                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 759                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 760     {
 761         PERL_UINTMAX_T word_complemented, mask_word;
 762
 763         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 764             if (((*s) & mask) == byte) {
 765                 return s;
 766             }
 767             s++;
 768         }
 769
 770         word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte);
 771         mask_word =            PERL_COUNT_MULTIPLIER * mask;
 772
 773         do {
 774             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 775
 776             /* If 'masked' contains 'byte' within it, anding with the
 777              * complement will leave those 8 bits 0 */
 778             masked &= word_complemented;
 779
 780             /* This causes the most significant bit to be set to 1 for any
 781              * bytes in the word that aren't completely 0 */
 782             masked |= masked << 1;
 783             masked |= masked << 2;
 784             masked |= masked << 4;
 785
 786             /* The msbits are the same as what marks a byte as variant, so we
 787              * can use this mask.  If all msbits are 1, the word doesn't
 788              * contain 'byte' */
 789             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 790                 s += PERL_WORDSIZE;
 791                 continue;
 792             }
 793
 794             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 795              * and any that are, are 0.  Complement and re-AND to swap that */
 796             masked = ~ masked;
 797             masked &= PERL_VARIANTS_WORD_MASK;
 798
 799             /* This reduces the problem to that solved by this function */
 800             s += _variant_byte_number(masked);
 801             return s;
 802
 803         } while (s + PERL_WORDSIZE <= send);
 804     }
 805
 806     while (s < send) {
 807         if (((*s) & mask) == byte) {
 808             return s;
 809         }
 810         s++;
 811     }
 812
 813     return s;
 814 }
 815
 816 STATIC U8 *
 817 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 818 {
 819     /* Returns the position of the first byte in the sequence between 's' and
 820      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 821      * 'span_byte' should have been ANDed with 'mask' in the call of this
 822      * function.  Returns 'send' if none found.  Works like find_span_end(),
 823      * except for the AND */
 824
 825     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 826
 827     assert(send >= s);
 828     assert((span_byte & mask) == span_byte);
 829
 830     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 831                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 832                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 833     {
 834         PERL_UINTMAX_T span_word, mask_word;
 835
 836         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 837             if (((*s) & mask) != span_byte) {
 838                 return s;
 839             }
 840             s++;
 841         }
 842
 843         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 844         mask_word = PERL_COUNT_MULTIPLIER * mask;
 845
 846         do {
 847             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 848
 849             if (masked == span_word) {
 850                 s += PERL_WORDSIZE;
 851                 continue;
 852             }
 853
 854             masked ^= span_word;
 855             masked |= masked << 1;
 856             masked |= masked << 2;
 857             masked |= masked << 4;
 858             return s + _variant_byte_number(masked);
 859
 860         } while (s + PERL_WORDSIZE <= send);
 861     }
 862
 863     while (s < send) {
 864         if (((*s) & mask) != span_byte) {
 865             return s;
 866         }
 867         s++;
 868     }
 869
 870     return s;
 871 }
 872
 873 /*
 874  * pregexec and friends
 875  */
 876
 877 #ifndef PERL_IN_XSUB_RE
 878 /*
 879  - pregexec - match a regexp against a string
 880  */
 881 I32
 882 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 883          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 884 /* stringarg: the point in the string at which to begin matching */
 885 /* strend:    pointer to null at end of string */
 886 /* strbeg:    real beginning of string */
 887 /* minend:    end of match must be >= minend bytes after stringarg. */
 888 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 889  *            itself is accessed via the pointers above */
 890 /* nosave:    For optimizations. */
 891 {
 892     PERL_ARGS_ASSERT_PREGEXEC;
 893
 894     return
 895         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 896                       nosave ? 0 : REXEC_COPY_STR);
 897 }
 898 #endif
 899
 900
 901
 902 /* re_intuit_start():
 903  *
 904  * Based on some optimiser hints, try to find the earliest position in the
 905  * string where the regex could match.
 906  *
 907  *   rx:     the regex to match against
 908  *   sv:     the SV being matched: only used for utf8 flag; the string
 909  *           itself is accessed via the pointers below. Note that on
 910  *           something like an overloaded SV, SvPOK(sv) may be false
 911  *           and the string pointers may point to something unrelated to
 912  *           the SV itself.
 913  *   strbeg: real beginning of string
 914  *   strpos: the point in the string at which to begin matching
 915  *   strend: pointer to the byte following the last char of the string
 916  *   flags   currently unused; set to 0
 917  *   data:   currently unused; set to NULL
 918  *
 919  * The basic idea of re_intuit_start() is to use some known information
 920  * about the pattern, namely:
 921  *
 922  *   a) the longest known anchored substring (i.e. one that's at a
 923  *      constant offset from the beginning of the pattern; but not
 924  *      necessarily at a fixed offset from the beginning of the
 925  *      string);
 926  *   b) the longest floating substring (i.e. one that's not at a constant
 927  *      offset from the beginning of the pattern);
 928  *   c) Whether the pattern is anchored to the string; either
 929  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 930  *      or anchored to pos(): /\G/;
 931  *   d) A start class: a real or synthetic character class which
 932  *      represents which characters are legal at the start of the pattern;
 933  *
 934  * to either quickly reject the match, or to find the earliest position
 935  * within the string at which the pattern might match, thus avoiding
 936  * running the full NFA engine at those earlier locations, only to
 937  * eventually fail and retry further along.
 938  *
 939  * Returns NULL if the pattern can't match, or returns the address within
 940  * the string which is the earliest place the match could occur.
 941  *
 942  * The longest of the anchored and floating substrings is called 'check'
 943  * and is checked first. The other is called 'other' and is checked
 944  * second. The 'other' substring may not be present.  For example,
 945  *
 946  *    /(abc|xyz)ABC\d{0,3}DEFG/
 947  *
 948  * will have
 949  *
 950  *   check substr (float)    = "DEFG", offset 6..9 chars
 951  *   other substr (anchored) = "ABC",  offset 3..3 chars
 952  *   stclass = [ax]
 953  *
 954  * Be aware that during the course of this function, sometimes 'anchored'
 955  * refers to a substring being anchored relative to the start of the
 956  * pattern, and sometimes to the pattern itself being anchored relative to
 957  * the string. For example:
 958  *
 959  *   /\dabc/:   "abc" is anchored to the pattern;
 960  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 961  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 962  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 963  *                    but the pattern is anchored to the string.
 964  */
 965
 966 char *
 967 Perl_re_intuit_start(pTHX_
 968                     REGEXP * const rx,
 969                     SV *sv,
 970                     const char * const strbeg,
 971                     char *strpos,
 972                     char *strend,
 973                     const U32 flags,
 974                     re_scream_pos_data *data)
 975 {
 976     struct regexp *const prog = ReANY(rx);
 977     SSize_t start_shift = prog->check_offset_min;
 978     /* Should be nonnegative! */
 979     SSize_t end_shift   = 0;
 980     /* current lowest pos in string where the regex can start matching */
 981     char *rx_origin = strpos;
 982     SV *check;
 983     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 984     U8   other_ix = 1 - prog->substrs->check_ix;
 985     bool ml_anch = 0;
 986     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 987     char *check_at = NULL;              /* check substr found at this pos */
 988     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 989     RXi_GET_DECL(prog,progi);
 990     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 991     regmatch_info *const reginfo = &reginfo_buf;
 992     GET_RE_DEBUG_FLAGS_DECL;
 993
 994     PERL_ARGS_ASSERT_RE_INTUIT_START;
 995     PERL_UNUSED_ARG(flags);
 996     PERL_UNUSED_ARG(data);
 997
 998     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 999                 "Intuit: trying to determine minimum start position...\n"));
1000
1001     /* for now, assume that all substr offsets are positive. If at some point
1002      * in the future someone wants to do clever things with lookbehind and
1003      * -ve offsets, they'll need to fix up any code in this function
1004      * which uses these offsets. See the thread beginning
1005      * <20140113145929.GF27210@iabyn.com>
1006      */
1007     assert(prog->substrs->data[0].min_offset >= 0);
1008     assert(prog->substrs->data[0].max_offset >= 0);
1009     assert(prog->substrs->data[1].min_offset >= 0);
1010     assert(prog->substrs->data[1].max_offset >= 0);
1011     assert(prog->substrs->data[2].min_offset >= 0);
1012     assert(prog->substrs->data[2].max_offset >= 0);
1013
1014     /* for now, assume that if both present, that the floating substring
1015      * doesn't start before the anchored substring.
1016      * If you break this assumption (e.g. doing better optimisations
1017      * with lookahead/behind), then you'll need to audit the code in this
1018      * function carefully first
1019      */
1020     assert(
1021             ! (  (prog->anchored_utf8 || prog->anchored_substr)
1022               && (prog->float_utf8    || prog->float_substr))
1023            || (prog->float_min_offset >= prog->anchored_offset));
1024
1025     /* byte rather than char calculation for efficiency. It fails
1026      * to quickly reject some cases that can't match, but will reject
1027      * them later after doing full char arithmetic */
1028     if (prog->minlen > strend - strpos) {
1029         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1030                               "  String too short...\n"));
1031         goto fail;
1032     }
1033
1034     RXp_MATCH_UTF8_set(prog, utf8_target);
1035     reginfo->is_utf8_target = cBOOL(utf8_target);
1036     reginfo->info_aux = NULL;
1037     reginfo->strbeg = strbeg;
1038     reginfo->strend = strend;
1039     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1040     reginfo->intuit = 1;
1041     /* not actually used within intuit, but zero for safety anyway */
1042     reginfo->poscache_maxiter = 0;
1043
1044     if (utf8_target) {
1045         if ((!prog->anchored_utf8 && prog->anchored_substr)
1046                 || (!prog->float_utf8 && prog->float_substr))
1047             to_utf8_substr(prog);
1048         check = prog->check_utf8;
1049     } else {
1050         if (!prog->check_substr && prog->check_utf8) {
1051             if (! to_byte_substr(prog)) {
1052                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1053             }
1054         }
1055         check = prog->check_substr;
1056     }
1057
1058     /* dump the various substring data */
1059     DEBUG_OPTIMISE_MORE_r({
1060         int i;
1061         for (i=0; i<=2; i++) {
1062             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1063                                   : prog->substrs->data[i].substr);
1064             if (!sv)
1065                 continue;
1066
1067             Perl_re_printf( aTHX_
1068                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1069                 " useful=%" IVdf " utf8=%d [%s]\n",
1070                 i,
1071                 (IV)prog->substrs->data[i].min_offset,
1072                 (IV)prog->substrs->data[i].max_offset,
1073                 (IV)prog->substrs->data[i].end_shift,
1074                 BmUSEFUL(sv),
1075                 utf8_target ? 1 : 0,
1076                 SvPEEK(sv));
1077         }
1078     });
1079
1080     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1081
1082         /* ml_anch: check after \n?
1083          *
1084          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1085          * with /.*.../, these flags will have been added by the
1086          * compiler:
1087          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1088          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1089          */
1090         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1091                    && !(prog->intflags & PREGf_IMPLICIT);
1092
1093         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1094             /* we are only allowed to match at BOS or \G */
1095
1096             /* trivially reject if there's a BOS anchor and we're not at BOS.
1097              *
1098              * Note that we don't try to do a similar quick reject for
1099              * \G, since generally the caller will have calculated strpos
1100              * based on pos() and gofs, so the string is already correctly
1101              * anchored by definition; and handling the exceptions would
1102              * be too fiddly (e.g. REXEC_IGNOREPOS).
1103              */
1104             if (   strpos != strbeg
1105                 && (prog->intflags & PREGf_ANCH_SBOL))
1106             {
1107                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1108                                 "  Not at start...\n"));
1109                 goto fail;
1110             }
1111
1112             /* in the presence of an anchor, the anchored (relative to the
1113              * start of the regex) substr must also be anchored relative
1114              * to strpos. So quickly reject if substr isn't found there.
1115              * This works for \G too, because the caller will already have
1116              * subtracted gofs from pos, and gofs is the offset from the
1117              * \G to the start of the regex. For example, in /.abc\Gdef/,
1118              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1119              * caller will have set strpos=pos()-4; we look for the substr
1120              * at position pos()-4+1, which lines up with the "a" */
1121
1122             if (prog->check_offset_min == prog->check_offset_max) {
1123                 /* Substring at constant offset from beg-of-str... */
1124                 SSize_t slen = SvCUR(check);
1125                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1126
1127                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1128                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1129                     (IV)prog->check_offset_min));
1130
1131                 if (SvTAIL(check)) {
1132                     /* In this case, the regex is anchored at the end too.
1133                      * Unless it's a multiline match, the lengths must match
1134                      * exactly, give or take a \n.  NB: slen >= 1 since
1135                      * the last char of check is \n */
1136                     if (!multiline
1137                         && (   strend - s > slen
1138                             || strend - s < slen - 1
1139                             || (strend - s == slen && strend[-1] != '\n')))
1140                     {
1141                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1142                                             "  String too long...\n"));
1143                         goto fail_finish;
1144                     }
1145                     /* Now should match s[0..slen-2] */
1146                     slen--;
1147                 }
1148                 if (slen && (strend - s < slen
1149                     || *SvPVX_const(check) != *s
1150                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1151                 {
1152                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1153                                     "  String not equal...\n"));
1154                     goto fail_finish;
1155                 }
1156
1157                 check_at = s;
1158                 goto success_at_start;
1159             }
1160         }
1161     }
1162
1163     end_shift = prog->check_end_shift;
1164
1165 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1166     if (end_shift < 0)
1167         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1168                    (IV)end_shift, RX_PRECOMP(rx));
1169 #endif
1170
1171   restart:
1172
1173     /* This is the (re)entry point of the main loop in this function.
1174      * The goal of this loop is to:
1175      * 1) find the "check" substring in the region rx_origin..strend
1176      *    (adjusted by start_shift / end_shift). If not found, reject
1177      *    immediately.
1178      * 2) If it exists, look for the "other" substr too if defined; for
1179      *    example, if the check substr maps to the anchored substr, then
1180      *    check the floating substr, and vice-versa. If not found, go
1181      *    back to (1) with rx_origin suitably incremented.
1182      * 3) If we find an rx_origin position that doesn't contradict
1183      *    either of the substrings, then check the possible additional
1184      *    constraints on rx_origin of /^.../m or a known start class.
1185      *    If these fail, then depending on which constraints fail, jump
1186      *    back to here, or to various other re-entry points further along
1187      *    that skip some of the first steps.
1188      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1189      *    substring. If the start position was determined to be at the
1190      *    beginning of the string  - so, not rejected, but not optimised,
1191      *    since we have to run regmatch from position 0 - decrement the
1192      *    BmUSEFUL() count. Otherwise increment it.
1193      */
1194
1195
1196     /* first, look for the 'check' substring */
1197
1198     {
1199         U8* start_point;
1200         U8* end_point;
1201
1202         DEBUG_OPTIMISE_MORE_r({
1203             Perl_re_printf( aTHX_
1204                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1205                 " Start shift: %" IVdf " End shift %" IVdf
1206                 " Real end Shift: %" IVdf "\n",
1207                 (IV)(rx_origin - strbeg),
1208                 (IV)prog->check_offset_min,
1209                 (IV)start_shift,
1210                 (IV)end_shift,
1211                 (IV)prog->check_end_shift);
1212         });
1213
1214         end_point = HOPBACK3(strend, end_shift, rx_origin);
1215         if (!end_point)
1216             goto fail_finish;
1217         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1218         if (!start_point)
1219             goto fail_finish;
1220
1221
1222         /* If the regex is absolutely anchored to either the start of the
1223          * string (SBOL) or to pos() (ANCH_GPOS), then
1224          * check_offset_max represents an upper bound on the string where
1225          * the substr could start. For the ANCH_GPOS case, we assume that
1226          * the caller of intuit will have already set strpos to
1227          * pos()-gofs, so in this case strpos + offset_max will still be
1228          * an upper bound on the substr.
1229          */
1230         if (!ml_anch
1231             && prog->intflags & PREGf_ANCH
1232             && prog->check_offset_max != SSize_t_MAX)
1233         {
1234             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1235             const char * const anchor =
1236                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1237             SSize_t targ_len = (char*)end_point - anchor;
1238
1239             if (check_len > targ_len) {
1240                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1241                               "Target string too short to match required substring...\n"));
1242                 goto fail_finish;
1243             }
1244
1245             /* do a bytes rather than chars comparison. It's conservative;
1246              * so it skips doing the HOP if the result can't possibly end
1247              * up earlier than the old value of end_point.
1248              */
1249             assert(anchor + check_len <= (char *)end_point);
1250             if (prog->check_offset_max + check_len < targ_len) {
1251                 end_point = HOP3lim((U8*)anchor,
1252                                 prog->check_offset_max,
1253                                 end_point - check_len
1254                             )
1255                             + check_len;
1256                 if (end_point < start_point)
1257                     goto fail_finish;
1258             }
1259         }
1260
1261         check_at = fbm_instr( start_point, end_point,
1262                       check, multiline ? FBMrf_MULTILINE : 0);
1263
1264         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1265             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1266             (IV)((char*)start_point - strbeg),
1267             (IV)((char*)end_point   - strbeg),
1268             (IV)(check_at ? check_at - strbeg : -1)
1269         ));
1270
1271         /* Update the count-of-usability, remove useless subpatterns,
1272             unshift s.  */
1273
1274         DEBUG_EXECUTE_r({
1275             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1276                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1277             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1278                               (check_at ? "Found" : "Did not find"),
1279                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1280                     ? "anchored" : "floating"),
1281                 quoted,
1282                 RE_SV_TAIL(check),
1283                 (check_at ? " at offset " : "...\n") );
1284         });
1285
1286         if (!check_at)
1287             goto fail_finish;
1288         /* set rx_origin to the minimum position where the regex could start
1289          * matching, given the constraint of the just-matched check substring.
1290          * But don't set it lower than previously.
1291          */
1292
1293         if (check_at - rx_origin > prog->check_offset_max)
1294             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1295         /* Finish the diagnostic message */
1296         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1297             "%ld (rx_origin now %" IVdf ")...\n",
1298             (long)(check_at - strbeg),
1299             (IV)(rx_origin - strbeg)
1300         ));
1301     }
1302
1303
1304     /* now look for the 'other' substring if defined */
1305
1306     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1307                     : prog->substrs->data[other_ix].substr)
1308     {
1309         /* Take into account the "other" substring. */
1310         char *last, *last1;
1311         char *s;
1312         SV* must;
1313         struct reg_substr_datum *other;
1314
1315       do_other_substr:
1316         other = &prog->substrs->data[other_ix];
1317
1318         /* if "other" is anchored:
1319          * we've previously found a floating substr starting at check_at.
1320          * This means that the regex origin must lie somewhere
1321          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1322          * and max:                 HOP3(check_at, -check_offset_min)
1323          * (except that min will be >= strpos)
1324          * So the fixed  substr must lie somewhere between
1325          *  HOP3(min, anchored_offset)
1326          *  HOP3(max, anchored_offset) + SvCUR(substr)
1327          */
1328
1329         /* if "other" is floating
1330          * Calculate last1, the absolute latest point where the
1331          * floating substr could start in the string, ignoring any
1332          * constraints from the earlier fixed match. It is calculated
1333          * as follows:
1334          *
1335          * strend - prog->minlen (in chars) is the absolute latest
1336          * position within the string where the origin of the regex
1337          * could appear. The latest start point for the floating
1338          * substr is float_min_offset(*) on from the start of the
1339          * regex.  last1 simply combines thee two offsets.
1340          *
1341          * (*) You might think the latest start point should be
1342          * float_max_offset from the regex origin, and technically
1343          * you'd be correct. However, consider
1344          *    /a\d{2,4}bcd\w/
1345          * Here, float min, max are 3,5 and minlen is 7.
1346          * This can match either
1347          *    /a\d\dbcd\w/
1348          *    /a\d\d\dbcd\w/
1349          *    /a\d\d\d\dbcd\w/
1350          * In the first case, the regex matches minlen chars; in the
1351          * second, minlen+1, in the third, minlen+2.
1352          * In the first case, the floating offset is 3 (which equals
1353          * float_min), in the second, 4, and in the third, 5 (which
1354          * equals float_max). In all cases, the floating string bcd
1355          * can never start more than 4 chars from the end of the
1356          * string, which equals minlen - float_min. As the substring
1357          * starts to match more than float_min from the start of the
1358          * regex, it makes the regex match more than minlen chars,
1359          * and the two cancel each other out. So we can always use
1360          * float_min - minlen, rather than float_max - minlen for the
1361          * latest position in the string.
1362          *
1363          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1364          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1365          */
1366
1367         assert(prog->minlen >= other->min_offset);
1368         last1 = HOP3c(strend,
1369                         other->min_offset - prog->minlen, strbeg);
1370
1371         if (other_ix) {/* i.e. if (other-is-float) */
1372             /* last is the latest point where the floating substr could
1373              * start, *given* any constraints from the earlier fixed
1374              * match. This constraint is that the floating string starts
1375              * <= float_max_offset chars from the regex origin (rx_origin).
1376              * If this value is less than last1, use it instead.
1377              */
1378             assert(rx_origin <= last1);
1379             last =
1380                 /* this condition handles the offset==infinity case, and
1381                  * is a short-cut otherwise. Although it's comparing a
1382                  * byte offset to a char length, it does so in a safe way,
1383                  * since 1 char always occupies 1 or more bytes,
1384                  * so if a string range is  (last1 - rx_origin) bytes,
1385                  * it will be less than or equal to  (last1 - rx_origin)
1386                  * chars; meaning it errs towards doing the accurate HOP3
1387                  * rather than just using last1 as a short-cut */
1388                 (last1 - rx_origin) < other->max_offset
1389                     ? last1
1390                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1391         }
1392         else {
1393             assert(strpos + start_shift <= check_at);
1394             last = HOP4c(check_at, other->min_offset - start_shift,
1395                         strbeg, strend);
1396         }
1397
1398         s = HOP3c(rx_origin, other->min_offset, strend);
1399         if (s < other_last)     /* These positions already checked */
1400             s = other_last;
1401
1402         must = utf8_target ? other->utf8_substr : other->substr;
1403         assert(SvPOK(must));
1404         {
1405             char *from = s;
1406             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1407
1408             if (to > strend)
1409                 to = strend;
1410             if (from > to) {
1411                 s = NULL;
1412                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1413                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1414                     (IV)(from - strbeg),
1415                     (IV)(to   - strbeg)
1416                 ));
1417             }
1418             else {
1419                 s = fbm_instr(
1420                     (unsigned char*)from,
1421                     (unsigned char*)to,
1422                     must,
1423                     multiline ? FBMrf_MULTILINE : 0
1424                 );
1425                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1426                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1427                     (IV)(from - strbeg),
1428                     (IV)(to   - strbeg),
1429                     (IV)(s ? s - strbeg : -1)
1430                 ));
1431             }
1432         }
1433
1434         DEBUG_EXECUTE_r({
1435             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1436                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1437             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1438                 s ? "Found" : "Contradicts",
1439                 other_ix ? "floating" : "anchored",
1440                 quoted, RE_SV_TAIL(must));
1441         });
1442
1443
1444         if (!s) {
1445             /* last1 is latest possible substr location. If we didn't
1446              * find it before there, we never will */
1447             if (last >= last1) {
1448                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1449                                         "; giving up...\n"));
1450                 goto fail_finish;
1451             }
1452
1453             /* try to find the check substr again at a later
1454              * position. Maybe next time we'll find the "other" substr
1455              * in range too */
1456             other_last = HOP3c(last, 1, strend) /* highest failure */;
1457             rx_origin =
1458                 other_ix /* i.e. if other-is-float */
1459                     ? HOP3c(rx_origin, 1, strend)
1460                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1461             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1462                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1463                 (other_ix ? "floating" : "anchored"),
1464                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1465                 (IV)(rx_origin - strbeg)
1466             ));
1467             goto restart;
1468         }
1469         else {
1470             if (other_ix) { /* if (other-is-float) */
1471                 /* other_last is set to s, not s+1, since its possible for
1472                  * a floating substr to fail first time, then succeed
1473                  * second time at the same floating position; e.g.:
1474                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1475                  * The first time round, anchored and float match at
1476                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1477                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1478                  */
1479                 other_last = s;
1480             }
1481             else {
1482                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1483                 other_last = HOP3c(s, 1, strend);
1484             }
1485             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1486                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1487                   (long)(s - strbeg),
1488                 (IV)(rx_origin - strbeg)
1489               ));
1490
1491         }
1492     }
1493     else {
1494         DEBUG_OPTIMISE_MORE_r(
1495             Perl_re_printf( aTHX_
1496                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1497                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1498                 " strend:%" IVdf "\n",
1499                 (IV)prog->check_offset_min,
1500                 (IV)prog->check_offset_max,
1501                 (IV)(check_at-strbeg),
1502                 (IV)(rx_origin-strbeg),
1503                 (IV)(rx_origin-check_at),
1504                 (IV)(strend-strbeg)
1505             )
1506         );
1507     }
1508
1509   postprocess_substr_matches:
1510
1511     /* handle the extra constraint of /^.../m if present */
1512
1513     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1514         char *s;
1515
1516         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1517                         "  looking for /^/m anchor"));
1518
1519         /* we have failed the constraint of a \n before rx_origin.
1520          * Find the next \n, if any, even if it's beyond the current
1521          * anchored and/or floating substrings. Whether we should be
1522          * scanning ahead for the next \n or the next substr is debatable.
1523          * On the one hand you'd expect rare substrings to appear less
1524          * often than \n's. On the other hand, searching for \n means
1525          * we're effectively flipping between check_substr and "\n" on each
1526          * iteration as the current "rarest" string candidate, which
1527          * means for example that we'll quickly reject the whole string if
1528          * hasn't got a \n, rather than trying every substr position
1529          * first
1530          */
1531
1532         s = HOP3c(strend, - prog->minlen, strpos);
1533         if (s <= rx_origin ||
1534             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1535         {
1536             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1537                             "  Did not find /%s^%s/m...\n",
1538                             PL_colors[0], PL_colors[1]));
1539             goto fail_finish;
1540         }
1541
1542         /* earliest possible origin is 1 char after the \n.
1543          * (since *rx_origin == '\n', it's safe to ++ here rather than
1544          * HOP(rx_origin, 1)) */
1545         rx_origin++;
1546
1547         if (prog->substrs->check_ix == 0  /* check is anchored */
1548             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1549         {
1550             /* Position contradicts check-string; either because
1551              * check was anchored (and thus has no wiggle room),
1552              * or check was float and rx_origin is above the float range */
1553             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1554                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1555                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1556             goto restart;
1557         }
1558
1559         /* if we get here, the check substr must have been float,
1560          * is in range, and we may or may not have had an anchored
1561          * "other" substr which still contradicts */
1562         assert(prog->substrs->check_ix); /* check is float */
1563
1564         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1565             /* whoops, the anchored "other" substr exists, so we still
1566              * contradict. On the other hand, the float "check" substr
1567              * didn't contradict, so just retry the anchored "other"
1568              * substr */
1569             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1570                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1571                 PL_colors[0], PL_colors[1],
1572                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1573                 (IV)(rx_origin - strbeg)
1574             ));
1575             goto do_other_substr;
1576         }
1577
1578         /* success: we don't contradict the found floating substring
1579          * (and there's no anchored substr). */
1580         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1581             "  Found /%s^%s/m with rx_origin %ld...\n",
1582             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1583     }
1584     else {
1585         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1586             "  (multiline anchor test skipped)\n"));
1587     }
1588
1589   success_at_start:
1590
1591
1592     /* if we have a starting character class, then test that extra constraint.
1593      * (trie stclasses are too expensive to use here, we are better off to
1594      * leave it to regmatch itself) */
1595
1596     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1597         const U8* const str = (U8*)STRING(progi->regstclass);
1598
1599         /* XXX this value could be pre-computed */
1600         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1601                     ?  (reginfo->is_utf8_pat
1602                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1603                         : STR_LEN(progi->regstclass))
1604                     : 1);
1605         char * endpos;
1606         char *s;
1607         /* latest pos that a matching float substr constrains rx start to */
1608         char *rx_max_float = NULL;
1609
1610         /* if the current rx_origin is anchored, either by satisfying an
1611          * anchored substring constraint, or a /^.../m constraint, then we
1612          * can reject the current origin if the start class isn't found
1613          * at the current position. If we have a float-only match, then
1614          * rx_origin is constrained to a range; so look for the start class
1615          * in that range. if neither, then look for the start class in the
1616          * whole rest of the string */
1617
1618         /* XXX DAPM it's not clear what the minlen test is for, and why
1619          * it's not used in the floating case. Nothing in the test suite
1620          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1621          * Here are some old comments, which may or may not be correct:
1622          *
1623          *   minlen == 0 is possible if regstclass is \b or \B,
1624          *   and the fixed substr is ''$.
1625          *   Since minlen is already taken into account, rx_origin+1 is
1626          *   before strend; accidentally, minlen >= 1 guaranties no false
1627          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1628          *   0) below assumes that regstclass does not come from lookahead...
1629          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1630          *   This leaves EXACTF-ish only, which are dealt with in
1631          *   find_byclass().
1632          */
1633
1634         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1635             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1636         else if (prog->float_substr || prog->float_utf8) {
1637             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1638             endpos = HOP3clim(rx_max_float, cl_l, strend);
1639         }
1640         else
1641             endpos= strend;
1642
1643         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1644             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1645             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1646               (IV)start_shift, (IV)(check_at - strbeg),
1647               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1648
1649         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1650                             reginfo);
1651         if (!s) {
1652             if (endpos == strend) {
1653                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1654                                 "  Could not match STCLASS...\n") );
1655                 goto fail;
1656             }
1657             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1658                                "  This position contradicts STCLASS...\n") );
1659             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1660                         && !(prog->intflags & PREGf_IMPLICIT))
1661                 goto fail;
1662
1663             /* Contradict one of substrings */
1664             if (prog->anchored_substr || prog->anchored_utf8) {
1665                 if (prog->substrs->check_ix == 1) { /* check is float */
1666                     /* Have both, check_string is floating */
1667                     assert(rx_origin + start_shift <= check_at);
1668                     if (rx_origin + start_shift != check_at) {
1669                         /* not at latest position float substr could match:
1670                          * Recheck anchored substring, but not floating.
1671                          * The condition above is in bytes rather than
1672                          * chars for efficiency. It's conservative, in
1673                          * that it errs on the side of doing 'goto
1674                          * do_other_substr'. In this case, at worst,
1675                          * an extra anchored search may get done, but in
1676                          * practice the extra fbm_instr() is likely to
1677                          * get skipped anyway. */
1678                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1679                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1680                             (long)(other_last - strbeg),
1681                             (IV)(rx_origin - strbeg)
1682                         ));
1683                         goto do_other_substr;
1684                     }
1685                 }
1686             }
1687             else {
1688                 /* float-only */
1689
1690                 if (ml_anch) {
1691                     /* In the presence of ml_anch, we might be able to
1692                      * find another \n without breaking the current float
1693                      * constraint. */
1694
1695                     /* strictly speaking this should be HOP3c(..., 1, ...),
1696                      * but since we goto a block of code that's going to
1697                      * search for the next \n if any, its safe here */
1698                     rx_origin++;
1699                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1700                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1701                               PL_colors[0], PL_colors[1],
1702                               (long)(rx_origin - strbeg)) );
1703                     goto postprocess_substr_matches;
1704                 }
1705
1706                 /* strictly speaking this can never be true; but might
1707                  * be if we ever allow intuit without substrings */
1708                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1709                     goto fail;
1710
1711                 rx_origin = rx_max_float;
1712             }
1713
1714             /* at this point, any matching substrings have been
1715              * contradicted. Start again... */
1716
1717             rx_origin = HOP3c(rx_origin, 1, strend);
1718
1719             /* uses bytes rather than char calculations for efficiency.
1720              * It's conservative: it errs on the side of doing 'goto restart',
1721              * where there is code that does a proper char-based test */
1722             if (rx_origin + start_shift + end_shift > strend) {
1723                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1724                                        "  Could not match STCLASS...\n") );
1725                 goto fail;
1726             }
1727             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1728                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1729                 (prog->substrs->check_ix ? "floating" : "anchored"),
1730                 (long)(rx_origin + start_shift - strbeg),
1731                 (IV)(rx_origin - strbeg)
1732             ));
1733             goto restart;
1734         }
1735
1736         /* Success !!! */
1737
1738         if (rx_origin != s) {
1739             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1740                         "  By STCLASS: moving %ld --> %ld\n",
1741                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1742                    );
1743         }
1744         else {
1745             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1746                                   "  Does not contradict STCLASS...\n");
1747                    );
1748         }
1749     }
1750
1751     /* Decide whether using the substrings helped */
1752
1753     if (rx_origin != strpos) {
1754         /* Fixed substring is found far enough so that the match
1755            cannot start at strpos. */
1756
1757         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1758         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1759     }
1760     else {
1761         /* The found rx_origin position does not prohibit matching at
1762          * strpos, so calling intuit didn't gain us anything. Decrement
1763          * the BmUSEFUL() count on the check substring, and if we reach
1764          * zero, free it.  */
1765         if (!(prog->intflags & PREGf_NAUGHTY)
1766             && (utf8_target ? (
1767                 prog->check_utf8                /* Could be deleted already */
1768                 && --BmUSEFUL(prog->check_utf8) < 0
1769                 && (prog->check_utf8 == prog->float_utf8)
1770             ) : (
1771                 prog->check_substr              /* Could be deleted already */
1772                 && --BmUSEFUL(prog->check_substr) < 0
1773                 && (prog->check_substr == prog->float_substr)
1774             )))
1775         {
1776             /* If flags & SOMETHING - do not do it many times on the same match */
1777             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1778             /* XXX Does the destruction order has to change with utf8_target? */
1779             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1780             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1781             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1782             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1783             check = NULL;                       /* abort */
1784             /* XXXX This is a remnant of the old implementation.  It
1785                     looks wasteful, since now INTUIT can use many
1786                     other heuristics. */
1787             prog->extflags &= ~RXf_USE_INTUIT;
1788         }
1789     }
1790
1791     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1792             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1793              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1794
1795     return rx_origin;
1796
1797   fail_finish:                          /* Substring not found */
1798     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1799         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1800   fail:
1801     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1802                           PL_colors[4], PL_colors[5]));
1803     return NULL;
1804 }
1805
1806
1807 #define DECL_TRIE_TYPE(scan) \
1808     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1809                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1810                  trie_utf8l, trie_flu8 }                                            \
1811                     trie_type = ((scan->flags == EXACT)                             \
1812                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1813                                  : (scan->flags == EXACTL)                          \
1814                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1815                                     : (scan->flags == EXACTFAA)                     \
1816                                       ? (utf8_target                                \
1817                                          ? trie_utf8_exactfa_fold                   \
1818                                          : trie_latin_utf8_exactfa_fold)            \
1819                                       : (scan->flags == EXACTFLU8                   \
1820                                          ? trie_flu8                                \
1821                                          : (utf8_target                             \
1822                                            ? trie_utf8_fold                         \
1823                                            :   trie_latin_utf8_fold)))
1824
1825 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1826 STMT_START {                                                                        \
1827     STRLEN skiplen;                                                                 \
1828     U8 flags = FOLD_FLAGS_FULL;                                                     \
1829     switch (trie_type) {                                                            \
1830     case trie_flu8:                                                                 \
1831         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1832         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1833             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1834         }                                                                           \
1835         goto do_trie_utf8_fold;                                                     \
1836     case trie_utf8_exactfa_fold:                                                    \
1837         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1838         /* FALLTHROUGH */                                                           \
1839     case trie_utf8_fold:                                                            \
1840       do_trie_utf8_fold:                                                            \
1841         if ( foldlen>0 ) {                                                          \
1842             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1843             foldlen -= len;                                                         \
1844             uscan += len;                                                           \
1845             len=0;                                                                  \
1846         } else {                                                                    \
1847             len = UTF8SKIP(uc);                                                     \
1848             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
1849                                                                             flags); \
1850             skiplen = UVCHR_SKIP( uvc );                                            \
1851             foldlen -= skiplen;                                                     \
1852             uscan = foldbuf + skiplen;                                              \
1853         }                                                                           \
1854         break;                                                                      \
1855     case trie_latin_utf8_exactfa_fold:                                              \
1856         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1857         /* FALLTHROUGH */                                                           \
1858     case trie_latin_utf8_fold:                                                      \
1859         if ( foldlen>0 ) {                                                          \
1860             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1861             foldlen -= len;                                                         \
1862             uscan += len;                                                           \
1863             len=0;                                                                  \
1864         } else {                                                                    \
1865             len = 1;                                                                \
1866             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1867             skiplen = UVCHR_SKIP( uvc );                                            \
1868             foldlen -= skiplen;                                                     \
1869             uscan = foldbuf + skiplen;                                              \
1870         }                                                                           \
1871         break;                                                                      \
1872     case trie_utf8l:                                                                \
1873         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1874         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1875             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1876         }                                                                           \
1877         /* FALLTHROUGH */                                                           \
1878     case trie_utf8:                                                                 \
1879         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1880         break;                                                                      \
1881     case trie_plain:                                                                \
1882         uvc = (UV)*uc;                                                              \
1883         len = 1;                                                                    \
1884     }                                                                               \
1885     if (uvc < 256) {                                                                \
1886         charid = trie->charmap[ uvc ];                                              \
1887     }                                                                               \
1888     else {                                                                          \
1889         charid = 0;                                                                 \
1890         if (widecharmap) {                                                          \
1891             SV** const svpp = hv_fetch(widecharmap,                                 \
1892                         (char*)&uvc, sizeof(UV), 0);                                \
1893             if (svpp)                                                               \
1894                 charid = (U16)SvIV(*svpp);                                          \
1895         }                                                                           \
1896     }                                                                               \
1897 } STMT_END
1898
1899 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1900     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1901                 startpos, doutf8, depth)
1902
1903 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1904     STMT_START {                                            \
1905         while (s < strend) {                                \
1906             CODE                                            \
1907             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1908         }                                                   \
1909     } STMT_END
1910
1911 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1912     STMT_START {                                            \
1913         while (s < strend) {                                \
1914             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1915         }                                                   \
1916     } STMT_END
1917
1918 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1919     if (COND) {                                                \
1920         FBC_CHECK_AND_TRY                                      \
1921         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1922         previous_occurrence_end = s;                           \
1923     }                                                          \
1924     else {                                                     \
1925         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1926     }
1927
1928 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1929     if (utf8_target) {                                         \
1930         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1931     }                                                          \
1932     else {                                                     \
1933         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1934     }
1935
1936 /* We keep track of where the next character should start after an occurrence
1937  * of the one we're looking for.  Knowing that, we can see right away if the
1938  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1939  * don't accept the 2nd and succeeding adjacent occurrences */
1940 #define FBC_CHECK_AND_TRY                                      \
1941         if (   (   doevery                                     \
1942                 || s != previous_occurrence_end)               \
1943             && (reginfo->intuit || regtry(reginfo, &s)))       \
1944         {                                                      \
1945             goto got_it;                                       \
1946         }
1947
1948
1949 /* This differs from the above macros in that it calls a function which returns
1950  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1951  * there is no such occurrence. */
1952 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1953     while (s < strend) {                                    \
1954         s = (f);                                            \
1955         if (s >= strend) {                                  \
1956             break;                                          \
1957         }                                                   \
1958                                                             \
1959         FBC_CHECK_AND_TRY                                   \
1960         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1961         previous_occurrence_end = s;                        \
1962     }
1963
1964 /* The three macros below are slightly different versions of the same logic.
1965  *
1966  * The first is for /a and /aa when the target string is UTF-8.  This can only
1967  * match ascii, but it must advance based on UTF-8.   The other two handle the
1968  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1969  * for the boundary (or non-boundary) between a word and non-word character.
1970  * The utf8 and non-utf8 cases have the same logic, but the details must be
1971  * different.  Find the "wordness" of the character just prior to this one, and
1972  * compare it with the wordness of this one.  If they differ, we have a
1973  * boundary.  At the beginning of the string, pretend that the previous
1974  * character was a new-line.
1975  *
1976  * All these macros uncleanly have side-effects with each other and outside
1977  * variables.  So far it's been too much trouble to clean-up
1978  *
1979  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1980  *               a word character or not.
1981  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1982  *               word/non-word
1983  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1984  *
1985  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1986  * are looking for a boundary or for a non-boundary.  If we are looking for a
1987  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1988  * see if this tentative match actually works, and if so, to quit the loop
1989  * here.  And vice-versa if we are looking for a non-boundary.
1990  *
1991  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1992  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1993  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1994  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1995  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1996  * complement.  But in that branch we complement tmp, meaning that at the
1997  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1998  * which means at the top of the loop in the next iteration, it is
1999  * TEST_NON_UTF8(s-1) */
2000 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
2001     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
2002     tmp = TEST_NON_UTF8(tmp);                                                  \
2003     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2004         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
2005             tmp = !tmp;                                                        \
2006             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
2007         }                                                                      \
2008         else {                                                                 \
2009             IF_FAIL;                                                           \
2010         }                                                                      \
2011     );                                                                         \
2012
2013 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2014  * TEST_UTF8 is a macro that for the same input code points returns identically
2015  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
2016 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2017     if (s == reginfo->strbeg) {                                                \
2018         tmp = '\n';                                                            \
2019     }                                                                          \
2020     else { /* Back-up to the start of the previous character */                \
2021         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2022         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2023                                                        0, UTF8_ALLOW_DEFAULT); \
2024     }                                                                          \
2025     tmp = TEST_UV(tmp);                                                        \
2026     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
2027     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2028         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2029             tmp = !tmp;                                                        \
2030             IF_SUCCESS;                                                        \
2031         }                                                                      \
2032         else {                                                                 \
2033             IF_FAIL;                                                           \
2034         }                                                                      \
2035     );
2036
2037 /* Like the above two macros.  UTF8_CODE is the complete code for handling
2038  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2039  * macros below */
2040 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
2041     if (utf8_target) {                                                         \
2042         UTF8_CODE                                                              \
2043     }                                                                          \
2044     else {  /* Not utf8 */                                                     \
2045         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
2046         tmp = TEST_NON_UTF8(tmp);                                              \
2047         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
2048             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
2049                 IF_SUCCESS;                                                    \
2050                 tmp = !tmp;                                                    \
2051             }                                                                  \
2052             else {                                                             \
2053                 IF_FAIL;                                                       \
2054             }                                                                  \
2055         );                                                                     \
2056     }                                                                          \
2057     /* Here, things have been set up by the previous code so that tmp is the   \
2058      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
2059      * utf8ness of the target).  We also have to check if this matches against \
2060      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
2061      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
2062      * string */                                                               \
2063     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2064         IF_SUCCESS;                                                            \
2065     }                                                                          \
2066     else {                                                                     \
2067         IF_FAIL;                                                               \
2068     }
2069
2070 /* This is the macro to use when we want to see if something that looks like it
2071  * could match, actually does, and if so exits the loop */
2072 #define REXEC_FBC_TRYIT                            \
2073     if ((reginfo->intuit || regtry(reginfo, &s)))  \
2074         goto got_it
2075
2076 /* The only difference between the BOUND and NBOUND cases is that
2077  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2078  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2079  * with the other one being empty (PLACEHOLDER is defined as empty).
2080  *
2081  * The TEST_FOO parameters are for operating on different forms of input, but
2082  * all should be ones that return identically for the same underlying code
2083  * points */
2084 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
2085     FBC_BOUND_COMMON(                                                          \
2086           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
2087           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2088
2089 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2090     FBC_BOUND_COMMON(                                                          \
2091             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2092             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2093
2094 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2095     FBC_BOUND_COMMON(                                                          \
2096           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2097           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2098
2099 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2100     FBC_BOUND_COMMON(                                                          \
2101             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2102             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2103
2104 #ifdef DEBUGGING
2105 static IV
2106 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2107   IV cp_out = Perl__invlist_search(invlist, cp_in);
2108   assert(cp_out >= 0);
2109   return cp_out;
2110 }
2111 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2112         invmap[S_get_break_val_cp_checked(invlist, cp)]
2113 #else
2114 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2115         invmap[_invlist_search(invlist, cp)]
2116 #endif
2117
2118 /* Takes a pointer to an inversion list, a pointer to its corresponding
2119  * inversion map, and a code point, and returns the code point's value
2120  * according to the two arrays.  It assumes that all code points have a value.
2121  * This is used as the base macro for macros for particular properties */
2122 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2123         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2124
2125 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2126  * of a code point, returning the value for the first code point in the string.
2127  * And it takes the particular macro name that finds the desired value given a
2128  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2129 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2130              (__ASSERT_(pos < strend)                                          \
2131                  /* Note assumes is valid UTF-8 */                             \
2132              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2133
2134 /* Returns the GCB value for the input code point */
2135 #define getGCB_VAL_CP(cp)                                                      \
2136           _generic_GET_BREAK_VAL_CP(                                           \
2137                                     PL_GCB_invlist,                            \
2138                                     _Perl_GCB_invmap,                          \
2139                                     (cp))
2140
2141 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2142  * bounded by pos and strend */
2143 #define getGCB_VAL_UTF8(pos, strend)                                           \
2144     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2145
2146 /* Returns the LB value for the input code point */
2147 #define getLB_VAL_CP(cp)                                                       \
2148           _generic_GET_BREAK_VAL_CP(                                           \
2149                                     PL_LB_invlist,                             \
2150                                     _Perl_LB_invmap,                           \
2151                                     (cp))
2152
2153 /* Returns the LB value for the first code point in the UTF-8 encoded string
2154  * bounded by pos and strend */
2155 #define getLB_VAL_UTF8(pos, strend)                                            \
2156     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2157
2158
2159 /* Returns the SB value for the input code point */
2160 #define getSB_VAL_CP(cp)                                                       \
2161           _generic_GET_BREAK_VAL_CP(                                           \
2162                                     PL_SB_invlist,                             \
2163                                     _Perl_SB_invmap,                     \
2164                                     (cp))
2165
2166 /* Returns the SB value for the first code point in the UTF-8 encoded string
2167  * bounded by pos and strend */
2168 #define getSB_VAL_UTF8(pos, strend)                                            \
2169     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2170
2171 /* Returns the WB value for the input code point */
2172 #define getWB_VAL_CP(cp)                                                       \
2173           _generic_GET_BREAK_VAL_CP(                                           \
2174                                     PL_WB_invlist,                             \
2175                                     _Perl_WB_invmap,                         \
2176                                     (cp))
2177
2178 /* Returns the WB value for the first code point in the UTF-8 encoded string
2179  * bounded by pos and strend */
2180 #define getWB_VAL_UTF8(pos, strend)                                            \
2181     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2182
2183 /* We know what class REx starts with.  Try to find this position... */
2184 /* if reginfo->intuit, its a dryrun */
2185 /* annoyingly all the vars in this routine have different names from their counterparts
2186    in regmatch. /grrr */
2187 STATIC char *
2188 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2189     const char *strend, regmatch_info *reginfo)
2190 {
2191     dVAR;
2192
2193     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2194     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2195
2196     char *pat_string;   /* The pattern's exactish string */
2197     char *pat_end;          /* ptr to end char of pat_string */
2198     re_fold_t folder;   /* Function for computing non-utf8 folds */
2199     const U8 *fold_array;   /* array for folding ords < 256 */
2200     STRLEN ln;
2201     STRLEN lnc;
2202     U8 c1;
2203     U8 c2;
2204     char *e;
2205
2206     /* In some cases we accept only the first occurence of 'x' in a sequence of
2207      * them.  This variable points to just beyond the end of the previous
2208      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2209      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2210      * hop back.) */
2211     char * previous_occurrence_end = 0;
2212
2213     I32 tmp;            /* Scratch variable */
2214     const bool utf8_target = reginfo->is_utf8_target;
2215     UV utf8_fold_flags = 0;
2216     const bool is_utf8_pat = reginfo->is_utf8_pat;
2217     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2218                                    with a result inverts that result, as 0^1 =
2219                                    1 and 1^1 = 0 */
2220     _char_class_number classnum;
2221
2222     RXi_GET_DECL(prog,progi);
2223
2224     PERL_ARGS_ASSERT_FIND_BYCLASS;
2225
2226     /* We know what class it must start with. */
2227     switch (OP(c)) {
2228     case ANYOFL:
2229         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2230
2231         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2232             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2233         }
2234
2235         /* FALLTHROUGH */
2236     case ANYOFD:
2237     case ANYOF:
2238         if (utf8_target) {
2239             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2240                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2241         }
2242         else if (ANYOF_FLAGS(c)) {
2243             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2244         }
2245         else {
2246             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2247         }
2248         break;
2249
2250     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2251         /* UTF-8ness doesn't matter, so use 0 */
2252         REXEC_FBC_FIND_NEXT_SCAN(0,
2253          (char *) find_next_masked((U8 *) s, (U8 *) strend, ARG(c), FLAGS(c)));
2254         break;
2255
2256     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2257         assert(! is_utf8_pat);
2258         /* FALLTHROUGH */
2259     case EXACTFAA:
2260         if (is_utf8_pat || utf8_target) {
2261             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2262             goto do_exactf_utf8;
2263         }
2264         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2265         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2266         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2267
2268     case EXACTF:   /* This node only generated for non-utf8 patterns */
2269         assert(! is_utf8_pat);
2270         if (utf8_target) {
2271             utf8_fold_flags = 0;
2272             goto do_exactf_utf8;
2273         }
2274         fold_array = PL_fold;
2275         folder = foldEQ;
2276         goto do_exactf_non_utf8;
2277
2278     case EXACTFL:
2279         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2280         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2281             utf8_fold_flags = FOLDEQ_LOCALE;
2282             goto do_exactf_utf8;
2283         }
2284         fold_array = PL_fold_locale;
2285         folder = foldEQ_locale;
2286         goto do_exactf_non_utf8;
2287
2288     case EXACTFU_SS:
2289         if (is_utf8_pat) {
2290             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2291         }
2292         goto do_exactf_utf8;
2293
2294     case EXACTFLU8:
2295             if (! utf8_target) {    /* All code points in this node require
2296                                        UTF-8 to express.  */
2297                 break;
2298             }
2299             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2300                                              | FOLDEQ_S2_FOLDS_SANE;
2301             goto do_exactf_utf8;
2302
2303     case EXACTFU:
2304         if (is_utf8_pat || utf8_target) {
2305             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2306             goto do_exactf_utf8;
2307         }
2308
2309         /* Any 'ss' in the pattern should have been replaced by regcomp,
2310          * so we don't have to worry here about this single special case
2311          * in the Latin1 range */
2312         fold_array = PL_fold_latin1;
2313         folder = foldEQ_latin1;
2314
2315         /* FALLTHROUGH */
2316
2317       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2318                            are no glitches with fold-length differences
2319                            between the target string and pattern */
2320
2321         /* The idea in the non-utf8 EXACTF* cases is to first find the
2322          * first character of the EXACTF* node and then, if necessary,
2323          * case-insensitively compare the full text of the node.  c1 is the
2324          * first character.  c2 is its fold.  This logic will not work for
2325          * Unicode semantics and the german sharp ss, which hence should
2326          * not be compiled into a node that gets here. */
2327         pat_string = STRING(c);
2328         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2329
2330         /* We know that we have to match at least 'ln' bytes (which is the
2331          * same as characters, since not utf8).  If we have to match 3
2332          * characters, and there are only 2 availabe, we know without
2333          * trying that it will fail; so don't start a match past the
2334          * required minimum number from the far end */
2335         e = HOP3c(strend, -((SSize_t)ln), s);
2336         if (e < s)
2337             break;
2338
2339         c1 = *pat_string;
2340         c2 = fold_array[c1];
2341         if (c1 == c2) { /* If char and fold are the same */
2342             while (s <= e) {
2343                 s = (char *) memchr(s, c1, e + 1 - s);
2344                 if (s == NULL) {
2345                     break;
2346                 }
2347
2348                 /* Check that the rest of the node matches */
2349                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2350                     && (reginfo->intuit || regtry(reginfo, &s)) )
2351                 {
2352                     goto got_it;
2353                 }
2354                 s++;
2355             }
2356         }
2357         else {
2358             U8 bits_differing = c1 ^ c2;
2359
2360             /* If the folds differ in one bit position only, we can mask to
2361              * match either of them, and can use this faster find method.  Both
2362              * ASCII and EBCDIC tend to have their case folds differ in only
2363              * one position, so this is very likely */
2364             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2365                 bits_differing = ~ bits_differing;
2366                 while (s <= e) {
2367                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2368                                         (c1 & bits_differing), bits_differing);
2369                     if (s > e) {
2370                         break;
2371                     }
2372
2373                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2374                         && (reginfo->intuit || regtry(reginfo, &s)) )
2375                     {
2376                         goto got_it;
2377                     }
2378                     s++;
2379                 }
2380             }
2381             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2382                        should actually happen only in EXACTFL nodes */
2383                 while (s <= e) {
2384                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2385                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2386                         && (reginfo->intuit || regtry(reginfo, &s)) )
2387                     {
2388                         goto got_it;
2389                     }
2390                     s++;
2391                 }
2392             }
2393         }
2394         break;
2395
2396       do_exactf_utf8:
2397       {
2398         unsigned expansion;
2399
2400         /* If one of the operands is in utf8, we can't use the simpler folding
2401          * above, due to the fact that many different characters can have the
2402          * same fold, or portion of a fold, or different- length fold */
2403         pat_string = STRING(c);
2404         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2405         pat_end = pat_string + ln;
2406         lnc = is_utf8_pat       /* length to match in characters */
2407                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2408                 : ln;
2409
2410         /* We have 'lnc' characters to match in the pattern, but because of
2411          * multi-character folding, each character in the target can match
2412          * up to 3 characters (Unicode guarantees it will never exceed
2413          * this) if it is utf8-encoded; and up to 2 if not (based on the
2414          * fact that the Latin 1 folds are already determined, and the
2415          * only multi-char fold in that range is the sharp-s folding to
2416          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2417          * string character.  Adjust lnc accordingly, rounding up, so that
2418          * if we need to match at least 4+1/3 chars, that really is 5. */
2419         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2420         lnc = (lnc + expansion - 1) / expansion;
2421
2422         /* As in the non-UTF8 case, if we have to match 3 characters, and
2423          * only 2 are left, it's guaranteed to fail, so don't start a
2424          * match that would require us to go beyond the end of the string
2425          */
2426         e = HOP3c(strend, -((SSize_t)lnc), s);
2427
2428         /* XXX Note that we could recalculate e to stop the loop earlier,
2429          * as the worst case expansion above will rarely be met, and as we
2430          * go along we would usually find that e moves further to the left.
2431          * This would happen only after we reached the point in the loop
2432          * where if there were no expansion we should fail.  Unclear if
2433          * worth the expense */
2434
2435         while (s <= e) {
2436             char *my_strend= (char *)strend;
2437             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2438                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2439                 && (reginfo->intuit || regtry(reginfo, &s)) )
2440             {
2441                 goto got_it;
2442             }
2443             s += (utf8_target) ? UTF8SKIP(s) : 1;
2444         }
2445         break;
2446     }
2447
2448     case BOUNDL:
2449         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2450         if (FLAGS(c) != TRADITIONAL_BOUND) {
2451             if (! IN_UTF8_CTYPE_LOCALE) {
2452                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2453                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2454             }
2455             goto do_boundu;
2456         }
2457
2458         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2459         break;
2460
2461     case NBOUNDL:
2462         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2463         if (FLAGS(c) != TRADITIONAL_BOUND) {
2464             if (! IN_UTF8_CTYPE_LOCALE) {
2465                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2466                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2467             }
2468             goto do_nboundu;
2469         }
2470
2471         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2472         break;
2473
2474     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2475                    meaning */
2476         assert(FLAGS(c) == TRADITIONAL_BOUND);
2477
2478         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2479         break;
2480
2481     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2482                    meaning */
2483         assert(FLAGS(c) == TRADITIONAL_BOUND);
2484
2485         FBC_BOUND_A(isWORDCHAR_A);
2486         break;
2487
2488     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2489                    meaning */
2490         assert(FLAGS(c) == TRADITIONAL_BOUND);
2491
2492         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2493         break;
2494
2495     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2496                    meaning */
2497         assert(FLAGS(c) == TRADITIONAL_BOUND);
2498
2499         FBC_NBOUND_A(isWORDCHAR_A);
2500         break;
2501
2502     case NBOUNDU:
2503         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2504             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2505             break;
2506         }
2507
2508       do_nboundu:
2509
2510         to_complement = 1;
2511         /* FALLTHROUGH */
2512
2513     case BOUNDU:
2514       do_boundu:
2515         switch((bound_type) FLAGS(c)) {
2516             case TRADITIONAL_BOUND:
2517                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2518                 break;
2519             case GCB_BOUND:
2520                 if (s == reginfo->strbeg) {
2521                     if (reginfo->intuit || regtry(reginfo, &s))
2522                     {
2523                         goto got_it;
2524                     }
2525
2526                     /* Didn't match.  Try at the next position (if there is one) */
2527                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2528                     if (UNLIKELY(s >= reginfo->strend)) {
2529                         break;
2530                     }
2531                 }
2532
2533                 if (utf8_target) {
2534                     GCB_enum before = getGCB_VAL_UTF8(
2535                                                reghop3((U8*)s, -1,
2536                                                        (U8*)(reginfo->strbeg)),
2537                                                (U8*) reginfo->strend);
2538                     while (s < strend) {
2539                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2540                                                         (U8*) reginfo->strend);
2541                         if (   (to_complement ^ isGCB(before,
2542                                                       after,
2543                                                       (U8*) reginfo->strbeg,
2544                                                       (U8*) s,
2545                                                       utf8_target))
2546                             && (reginfo->intuit || regtry(reginfo, &s)))
2547                         {
2548                             goto got_it;
2549                         }
2550                         before = after;
2551                         s += UTF8SKIP(s);
2552                     }
2553                 }
2554                 else {  /* Not utf8.  Everything is a GCB except between CR and
2555                            LF */
2556                     while (s < strend) {
2557                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2558                                               || UCHARAT(s) != '\n'))
2559                             && (reginfo->intuit || regtry(reginfo, &s)))
2560                         {
2561                             goto got_it;
2562                         }
2563                         s++;
2564                     }
2565                 }
2566
2567                 /* And, since this is a bound, it can match after the final
2568                  * character in the string */
2569                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2570                     goto got_it;
2571                 }
2572                 break;
2573
2574             case LB_BOUND:
2575                 if (s == reginfo->strbeg) {
2576                     if (reginfo->intuit || regtry(reginfo, &s)) {
2577                         goto got_it;
2578                     }
2579                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2580                     if (UNLIKELY(s >= reginfo->strend)) {
2581                         break;
2582                     }
2583                 }
2584
2585                 if (utf8_target) {
2586                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2587                                                                -1,
2588                                                                (U8*)(reginfo->strbeg)),
2589                                                        (U8*) reginfo->strend);
2590                     while (s < strend) {
2591                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2592                         if (to_complement ^ isLB(before,
2593                                                  after,
2594                                                  (U8*) reginfo->strbeg,
2595                                                  (U8*) s,
2596                                                  (U8*) reginfo->strend,
2597                                                  utf8_target)
2598                             && (reginfo->intuit || regtry(reginfo, &s)))
2599                         {
2600                             goto got_it;
2601                         }
2602                         before = after;
2603                         s += UTF8SKIP(s);
2604                     }
2605                 }
2606                 else {  /* Not utf8. */
2607                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2608                     while (s < strend) {
2609                         LB_enum after = getLB_VAL_CP((U8) *s);
2610                         if (to_complement ^ isLB(before,
2611                                                  after,
2612                                                  (U8*) reginfo->strbeg,
2613                                                  (U8*) s,
2614                                                  (U8*) reginfo->strend,
2615                                                  utf8_target)
2616                             && (reginfo->intuit || regtry(reginfo, &s)))
2617                         {
2618                             goto got_it;
2619                         }
2620                         before = after;
2621                         s++;
2622                     }
2623                 }
2624
2625                 if (reginfo->intuit || regtry(reginfo, &s)) {
2626                     goto got_it;
2627                 }
2628
2629                 break;
2630
2631             case SB_BOUND:
2632                 if (s == reginfo->strbeg) {
2633                     if (reginfo->intuit || regtry(reginfo, &s)) {
2634                         goto got_it;
2635                     }
2636                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2637                     if (UNLIKELY(s >= reginfo->strend)) {
2638                         break;
2639                     }
2640                 }
2641
2642                 if (utf8_target) {
2643                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2644                                                         -1,
2645                                                         (U8*)(reginfo->strbeg)),
2646                                                       (U8*) reginfo->strend);
2647                     while (s < strend) {
2648                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2649                                                          (U8*) reginfo->strend);
2650                         if ((to_complement ^ isSB(before,
2651                                                   after,
2652                                                   (U8*) reginfo->strbeg,
2653                                                   (U8*) s,
2654                                                   (U8*) reginfo->strend,
2655                                                   utf8_target))
2656                             && (reginfo->intuit || regtry(reginfo, &s)))
2657                         {
2658                             goto got_it;
2659                         }
2660                         before = after;
2661                         s += UTF8SKIP(s);
2662                     }
2663                 }
2664                 else {  /* Not utf8. */
2665                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2666                     while (s < strend) {
2667                         SB_enum after = getSB_VAL_CP((U8) *s);
2668                         if ((to_complement ^ isSB(before,
2669                                                   after,
2670                                                   (U8*) reginfo->strbeg,
2671                                                   (U8*) s,
2672                                                   (U8*) reginfo->strend,
2673                                                   utf8_target))
2674                             && (reginfo->intuit || regtry(reginfo, &s)))
2675                         {
2676                             goto got_it;
2677                         }
2678                         before = after;
2679                         s++;
2680                     }
2681                 }
2682
2683                 /* Here are at the final position in the target string.  The SB
2684                  * value is always true here, so matches, depending on other
2685                  * constraints */
2686                 if (reginfo->intuit || regtry(reginfo, &s)) {
2687                     goto got_it;
2688                 }
2689
2690                 break;
2691
2692             case WB_BOUND:
2693                 if (s == reginfo->strbeg) {
2694                     if (reginfo->intuit || regtry(reginfo, &s)) {
2695                         goto got_it;
2696                     }
2697                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2698                     if (UNLIKELY(s >= reginfo->strend)) {
2699                         break;
2700                     }
2701                 }
2702
2703                 if (utf8_target) {
2704                     /* We are at a boundary between char_sub_0 and char_sub_1.
2705                      * We also keep track of the value for char_sub_-1 as we
2706                      * loop through the line.   Context may be needed to make a
2707                      * determination, and if so, this can save having to
2708                      * recalculate it */
2709                     WB_enum previous = WB_UNKNOWN;
2710                     WB_enum before = getWB_VAL_UTF8(
2711                                               reghop3((U8*)s,
2712                                                       -1,
2713                                                       (U8*)(reginfo->strbeg)),
2714                                               (U8*) reginfo->strend);
2715                     while (s < strend) {
2716                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2717                                                         (U8*) reginfo->strend);
2718                         if ((to_complement ^ isWB(previous,
2719                                                   before,
2720                                                   after,
2721                                                   (U8*) reginfo->strbeg,
2722                                                   (U8*) s,
2723                                                   (U8*) reginfo->strend,
2724                                                   utf8_target))
2725                             && (reginfo->intuit || regtry(reginfo, &s)))
2726                         {
2727                             goto got_it;
2728                         }
2729                         previous = before;
2730                         before = after;
2731                         s += UTF8SKIP(s);
2732                     }
2733                 }
2734                 else {  /* Not utf8. */
2735                     WB_enum previous = WB_UNKNOWN;
2736                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2737                     while (s < strend) {
2738                         WB_enum after = getWB_VAL_CP((U8) *s);
2739                         if ((to_complement ^ isWB(previous,
2740                                                   before,
2741                                                   after,
2742                                                   (U8*) reginfo->strbeg,
2743                                                   (U8*) s,
2744                                                   (U8*) reginfo->strend,
2745                                                   utf8_target))
2746                             && (reginfo->intuit || regtry(reginfo, &s)))
2747                         {
2748                             goto got_it;
2749                         }
2750                         previous = before;
2751                         before = after;
2752                         s++;
2753                     }
2754                 }
2755
2756                 if (reginfo->intuit || regtry(reginfo, &s)) {
2757                     goto got_it;
2758                 }
2759         }
2760         break;
2761
2762     case LNBREAK:
2763         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2764                         is_LNBREAK_latin1_safe(s, strend)
2765         );
2766         break;
2767
2768     case ASCII:
2769         REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2770         break;
2771
2772     case NASCII:
2773         if (utf8_target) {
2774             REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2775                                                             utf8_target));
2776         }
2777         else {
2778             REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2779                                                             utf8_target));
2780         }
2781
2782         break;
2783
2784     /* The argument to all the POSIX node types is the class number to pass to
2785      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2786
2787     case NPOSIXL:
2788         to_complement = 1;
2789         /* FALLTHROUGH */
2790
2791     case POSIXL:
2792         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2793         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2794                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2795         break;
2796
2797     case NPOSIXD:
2798         to_complement = 1;
2799         /* FALLTHROUGH */
2800
2801     case POSIXD:
2802         if (utf8_target) {
2803             goto posix_utf8;
2804         }
2805         goto posixa;
2806
2807     case NPOSIXA:
2808         if (utf8_target) {
2809             /* The complement of something that matches only ASCII matches all
2810              * non-ASCII, plus everything in ASCII that isn't in the class. */
2811             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2812                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2813             break;
2814         }
2815
2816         to_complement = 1;
2817         goto posixa;
2818
2819     case POSIXA:
2820         /* Don't need to worry about utf8, as it can match only a single
2821          * byte invariant character.  But we do anyway for performance reasons,
2822          * as otherwise we would have to examine all the continuation
2823          * characters */
2824         if (utf8_target) {
2825             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2826             break;
2827         }
2828
2829       posixa:
2830         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2831                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2832         break;
2833
2834     case NPOSIXU:
2835         to_complement = 1;
2836         /* FALLTHROUGH */
2837
2838     case POSIXU:
2839         if (! utf8_target) {
2840             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2841                                  to_complement ^ cBOOL(_generic_isCC(*s,
2842                                                                     FLAGS(c))));
2843         }
2844         else {
2845
2846           posix_utf8:
2847             classnum = (_char_class_number) FLAGS(c);
2848             if (classnum < _FIRST_NON_SWASH_CC) {
2849                 while (s < strend) {
2850
2851                     /* We avoid loading in the swash as long as possible, but
2852                      * should we have to, we jump to a separate loop.  This
2853                      * extra 'if' statement is what keeps this code from being
2854                      * just a call to REXEC_FBC_CLASS_SCAN() */
2855                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2856                         goto found_above_latin1;
2857                     }
2858
2859                     REXEC_FBC_CLASS_SCAN_GUTS(1, (UTF8_IS_INVARIANT(*s)
2860                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2861                                                                 classnum)))
2862                         || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
2863                             && to_complement ^ cBOOL(
2864                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2865                                                                       *(s + 1)),
2866                                               classnum))));
2867                 }
2868             }
2869             else switch (classnum) {    /* These classes are implemented as
2870                                            macros */
2871                 case _CC_ENUM_SPACE:
2872                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2873                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2874                     break;
2875
2876                 case _CC_ENUM_BLANK:
2877                     REXEC_FBC_CLASS_SCAN(1,
2878                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2879                     break;
2880
2881                 case _CC_ENUM_XDIGIT:
2882                     REXEC_FBC_CLASS_SCAN(1,
2883                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2884                     break;
2885
2886                 case _CC_ENUM_VERTSPACE:
2887                     REXEC_FBC_CLASS_SCAN(1,
2888                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2889                     break;
2890
2891                 case _CC_ENUM_CNTRL:
2892                     REXEC_FBC_CLASS_SCAN(1,
2893                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2894                     break;
2895
2896                 default:
2897                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2898                     NOT_REACHED; /* NOTREACHED */
2899             }
2900         }
2901         break;
2902
2903       found_above_latin1:   /* Here we have to load a swash to get the result
2904                                for the current code point */
2905         if (! PL_utf8_swash_ptrs[classnum]) {
2906             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2907             PL_utf8_swash_ptrs[classnum] =
2908                     _core_swash_init("utf8",
2909                                      "",
2910                                      &PL_sv_undef, 1, 0,
2911                                      PL_XPosix_ptrs[classnum], &flags);
2912         }
2913
2914         /* This is a copy of the loop above for swash classes, though using the
2915          * FBC macro instead of being expanded out.  Since we've loaded the
2916          * swash, we don't have to check for that each time through the loop */
2917         REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2918                 to_complement ^ cBOOL(_generic_utf8_safe(
2919                                       classnum,
2920                                       s,
2921                                       strend,
2922                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2923                                                   (U8 *) s, TRUE))));
2924         break;
2925
2926     case AHOCORASICKC:
2927     case AHOCORASICK:
2928         {
2929             DECL_TRIE_TYPE(c);
2930             /* what trie are we using right now */
2931             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2932             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2933             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2934
2935             const char *last_start = strend - trie->minlen;
2936 #ifdef DEBUGGING
2937             const char *real_start = s;
2938 #endif
2939             STRLEN maxlen = trie->maxlen;
2940             SV *sv_points;
2941             U8 **points; /* map of where we were in the input string
2942                             when reading a given char. For ASCII this
2943                             is unnecessary overhead as the relationship
2944                             is always 1:1, but for Unicode, especially
2945                             case folded Unicode this is not true. */
2946             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2947             U8 *bitmap=NULL;
2948
2949
2950             GET_RE_DEBUG_FLAGS_DECL;
2951
2952             /* We can't just allocate points here. We need to wrap it in
2953              * an SV so it gets freed properly if there is a croak while
2954              * running the match */
2955             ENTER;
2956             SAVETMPS;
2957             sv_points=newSV(maxlen * sizeof(U8 *));
2958             SvCUR_set(sv_points,
2959                 maxlen * sizeof(U8 *));
2960             SvPOK_on(sv_points);
2961             sv_2mortal(sv_points);
2962             points=(U8**)SvPV_nolen(sv_points );
2963             if ( trie_type != trie_utf8_fold
2964                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2965             {
2966                 if (trie->bitmap)
2967                     bitmap=(U8*)trie->bitmap;
2968                 else
2969                     bitmap=(U8*)ANYOF_BITMAP(c);
2970             }
2971             /* this is the Aho-Corasick algorithm modified a touch
2972                to include special handling for long "unknown char" sequences.
2973                The basic idea being that we use AC as long as we are dealing
2974                with a possible matching char, when we encounter an unknown char
2975                (and we have not encountered an accepting state) we scan forward
2976                until we find a legal starting char.
2977                AC matching is basically that of trie matching, except that when
2978                we encounter a failing transition, we fall back to the current
2979                states "fail state", and try the current char again, a process
2980                we repeat until we reach the root state, state 1, or a legal
2981                transition. If we fail on the root state then we can either
2982                terminate if we have reached an accepting state previously, or
2983                restart the entire process from the beginning if we have not.
2984
2985              */
2986             while (s <= last_start) {
2987                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2988                 U8 *uc = (U8*)s;
2989                 U16 charid = 0;
2990                 U32 base = 1;
2991                 U32 state = 1;
2992                 UV uvc = 0;
2993                 STRLEN len = 0;
2994                 STRLEN foldlen = 0;
2995                 U8 *uscan = (U8*)NULL;
2996                 U8 *leftmost = NULL;
2997 #ifdef DEBUGGING
2998                 U32 accepted_word= 0;
2999 #endif
3000                 U32 pointpos = 0;
3001
3002                 while ( state && uc <= (U8*)strend ) {
3003                     int failed=0;
3004                     U32 word = aho->states[ state ].wordnum;
3005
3006                     if( state==1 ) {
3007                         if ( bitmap ) {
3008                             DEBUG_TRIE_EXECUTE_r(
3009                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
3010                                     dump_exec_pos( (char *)uc, c, strend, real_start,
3011                                         (char *)uc, utf8_target, 0 );
3012                                     Perl_re_printf( aTHX_
3013                                         " Scanning for legal start char...\n");
3014                                 }
3015                             );
3016                             if (utf8_target) {
3017                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
3018                                     uc += UTF8SKIP(uc);
3019                                 }
3020                             } else {
3021                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
3022                                     uc++;
3023                                 }
3024                             }
3025                             s= (char *)uc;
3026                         }
3027                         if (uc >(U8*)last_start) break;
3028                     }
3029
3030                     if ( word ) {
3031                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
3032                         if (!leftmost || lpos < leftmost) {
3033                             DEBUG_r(accepted_word=word);
3034                             leftmost= lpos;
3035                         }
3036                         if (base==0) break;
3037
3038                     }
3039                     points[pointpos++ % maxlen]= uc;
3040                     if (foldlen || uc < (U8*)strend) {
3041                         REXEC_TRIE_READ_CHAR(trie_type, trie,
3042                                          widecharmap, uc,
3043                                          uscan, len, uvc, charid, foldlen,
3044                                          foldbuf, uniflags);
3045                         DEBUG_TRIE_EXECUTE_r({
3046                             dump_exec_pos( (char *)uc, c, strend,
3047                                         real_start, s, utf8_target, 0);
3048                             Perl_re_printf( aTHX_
3049                                 " Charid:%3u CP:%4" UVxf " ",
3050                                  charid, uvc);
3051                         });
3052                     }
3053                     else {
3054                         len = 0;
3055                         charid = 0;
3056                     }
3057
3058
3059                     do {
3060 #ifdef DEBUGGING
3061                         word = aho->states[ state ].wordnum;
3062 #endif
3063                         base = aho->states[ state ].trans.base;
3064
3065                         DEBUG_TRIE_EXECUTE_r({
3066                             if (failed)
3067                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3068                                     s,   utf8_target, 0 );
3069                             Perl_re_printf( aTHX_
3070                                 "%sState: %4" UVxf ", word=%" UVxf,
3071                                 failed ? " Fail transition to " : "",
3072                                 (UV)state, (UV)word);
3073                         });
3074                         if ( base ) {
3075                             U32 tmp;
3076                             I32 offset;
3077                             if (charid &&
3078                                  ( ((offset = base + charid
3079                                     - 1 - trie->uniquecharcount)) >= 0)
3080                                  && ((U32)offset < trie->lasttrans)
3081                                  && trie->trans[offset].check == state
3082                                  && (tmp=trie->trans[offset].next))
3083                             {
3084                                 DEBUG_TRIE_EXECUTE_r(
3085                                     Perl_re_printf( aTHX_ " - legal\n"));
3086                                 state = tmp;
3087                                 break;
3088                             }
3089                             else {
3090                                 DEBUG_TRIE_EXECUTE_r(
3091                                     Perl_re_printf( aTHX_ " - fail\n"));
3092                                 failed = 1;
3093                                 state = aho->fail[state];
3094                             }
3095                         }
3096                         else {
3097                             /* we must be accepting here */
3098                             DEBUG_TRIE_EXECUTE_r(
3099                                     Perl_re_printf( aTHX_ " - accepting\n"));
3100                             failed = 1;
3101                             break;
3102                         }
3103                     } while(state);
3104                     uc += len;
3105                     if (failed) {
3106                         if (leftmost)
3107                             break;
3108                         if (!state) state = 1;
3109                     }
3110                 }
3111                 if ( aho->states[ state ].wordnum ) {
3112                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3113                     if (!leftmost || lpos < leftmost) {
3114                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3115                         leftmost = lpos;
3116                     }
3117                 }
3118                 if (leftmost) {
3119                     s = (char*)leftmost;
3120                     DEBUG_TRIE_EXECUTE_r({
3121                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3122                             (UV)accepted_word, (IV)(s - real_start)
3123                         );
3124                     });
3125                     if (reginfo->intuit || regtry(reginfo, &s)) {
3126                         FREETMPS;
3127                         LEAVE;
3128                         goto got_it;
3129                     }
3130                     s = HOPc(s,1);
3131                     DEBUG_TRIE_EXECUTE_r({
3132                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3133                     });
3134                 } else {
3135                     DEBUG_TRIE_EXECUTE_r(
3136                         Perl_re_printf( aTHX_ "No match.\n"));
3137                     break;
3138                 }
3139             }
3140             FREETMPS;
3141             LEAVE;
3142         }
3143         break;
3144     default:
3145         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3146     }
3147     return 0;
3148   got_it:
3149     return s;
3150 }
3151
3152 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3153  * flags have same meanings as with regexec_flags() */
3154
3155 static void
3156 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3157                             char *strbeg,
3158                             char *strend,
3159                             SV *sv,
3160                             U32 flags,
3161                             bool utf8_target)
3162 {
3163     struct regexp *const prog = ReANY(rx);
3164
3165     if (flags & REXEC_COPY_STR) {
3166 #ifdef PERL_ANY_COW
3167         if (SvCANCOW(sv)) {
3168             DEBUG_C(Perl_re_printf( aTHX_
3169                               "Copy on write: regexp capture, type %d\n",
3170                                     (int) SvTYPE(sv)));
3171             /* Create a new COW SV to share the match string and store
3172              * in saved_copy, unless the current COW SV in saved_copy
3173              * is valid and suitable for our purpose */
3174             if ((   prog->saved_copy
3175                  && SvIsCOW(prog->saved_copy)
3176                  && SvPOKp(prog->saved_copy)
3177                  && SvIsCOW(sv)
3178                  && SvPOKp(sv)
3179                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3180             {
3181                 /* just reuse saved_copy SV */
3182                 if (RXp_MATCH_COPIED(prog)) {
3183                     Safefree(prog->subbeg);
3184                     RXp_MATCH_COPIED_off(prog);
3185                 }
3186             }
3187             else {
3188                 /* create new COW SV to share string */
3189                 RXp_MATCH_COPY_FREE(prog);
3190                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3191             }
3192             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3193             assert (SvPOKp(prog->saved_copy));
3194             prog->sublen  = strend - strbeg;
3195             prog->suboffset = 0;
3196             prog->subcoffset = 0;
3197         } else
3198 #endif
3199         {
3200             SSize_t min = 0;
3201             SSize_t max = strend - strbeg;
3202             SSize_t sublen;
3203
3204             if (    (flags & REXEC_COPY_SKIP_POST)
3205                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3206                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3207             ) { /* don't copy $' part of string */
3208                 U32 n = 0;
3209                 max = -1;
3210                 /* calculate the right-most part of the string covered
3211                  * by a capture. Due to lookahead, this may be to
3212                  * the right of $&, so we have to scan all captures */
3213                 while (n <= prog->lastparen) {
3214                     if (prog->offs[n].end > max)
3215                         max = prog->offs[n].end;
3216                     n++;
3217                 }
3218                 if (max == -1)
3219                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3220                             ? prog->offs[0].start
3221                             : 0;
3222                 assert(max >= 0 && max <= strend - strbeg);
3223             }
3224
3225             if (    (flags & REXEC_COPY_SKIP_PRE)
3226                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3227                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3228             ) { /* don't copy $` part of string */
3229                 U32 n = 0;
3230                 min = max;
3231                 /* calculate the left-most part of the string covered
3232                  * by a capture. Due to lookbehind, this may be to
3233                  * the left of $&, so we have to scan all captures */
3234                 while (min && n <= prog->lastparen) {
3235                     if (   prog->offs[n].start != -1
3236                         && prog->offs[n].start < min)
3237                     {
3238                         min = prog->offs[n].start;
3239                     }
3240                     n++;
3241                 }
3242                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3243                     && min >  prog->offs[0].end
3244                 )
3245                     min = prog->offs[0].end;
3246
3247             }
3248
3249             assert(min >= 0 && min <= max && min <= strend - strbeg);
3250             sublen = max - min;
3251
3252             if (RXp_MATCH_COPIED(prog)) {
3253                 if (sublen > prog->sublen)
3254                     prog->subbeg =
3255                             (char*)saferealloc(prog->subbeg, sublen+1);
3256             }
3257             else
3258                 prog->subbeg = (char*)safemalloc(sublen+1);
3259             Copy(strbeg + min, prog->subbeg, sublen, char);
3260             prog->subbeg[sublen] = '\0';
3261             prog->suboffset = min;
3262             prog->sublen = sublen;
3263             RXp_MATCH_COPIED_on(prog);
3264         }
3265         prog->subcoffset = prog->suboffset;
3266         if (prog->suboffset && utf8_target) {
3267             /* Convert byte offset to chars.
3268              * XXX ideally should only compute this if @-/@+
3269              * has been seen, a la PL_sawampersand ??? */
3270
3271             /* If there's a direct correspondence between the
3272              * string which we're matching and the original SV,
3273              * then we can use the utf8 len cache associated with
3274              * the SV. In particular, it means that under //g,
3275              * sv_pos_b2u() will use the previously cached
3276              * position to speed up working out the new length of
3277              * subcoffset, rather than counting from the start of
3278              * the string each time. This stops
3279              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3280              * from going quadratic */
3281             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3282                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3283                                                 SV_GMAGIC|SV_CONST_RETURN);
3284             else
3285                 prog->subcoffset = utf8_length((U8*)strbeg,
3286                                     (U8*)(strbeg+prog->suboffset));
3287         }
3288     }
3289     else {
3290         RXp_MATCH_COPY_FREE(prog);
3291         prog->subbeg = strbeg;
3292         prog->suboffset = 0;
3293         prog->subcoffset = 0;
3294         prog->sublen = strend - strbeg;
3295     }
3296 }
3297
3298
3299
3300
3301 /*
3302  - regexec_flags - match a regexp against a string
3303  */
3304 I32
3305 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3306               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3307 /* stringarg: the point in the string at which to begin matching */
3308 /* strend:    pointer to null at end of string */
3309 /* strbeg:    real beginning of string */
3310 /* minend:    end of match must be >= minend bytes after stringarg. */
3311 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3312  *            itself is accessed via the pointers above */
3313 /* data:      May be used for some additional optimizations.
3314               Currently unused. */
3315 /* flags:     For optimizations. See REXEC_* in regexp.h */
3316
3317 {
3318     struct regexp *const prog = ReANY(rx);
3319     char *s;
3320     regnode *c;
3321     char *startpos;
3322     SSize_t minlen;             /* must match at least this many chars */
3323     SSize_t dontbother = 0;     /* how many characters not to try at end */
3324     const bool utf8_target = cBOOL(DO_UTF8(sv));
3325     I32 multiline;
3326     RXi_GET_DECL(prog,progi);
3327     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3328     regmatch_info *const reginfo = &reginfo_buf;
3329     regexp_paren_pair *swap = NULL;
3330     I32 oldsave;
3331     GET_RE_DEBUG_FLAGS_DECL;
3332
3333     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3334     PERL_UNUSED_ARG(data);
3335
3336     /* Be paranoid... */
3337     if (prog == NULL) {
3338         Perl_croak(aTHX_ "NULL regexp parameter");
3339     }
3340
3341     DEBUG_EXECUTE_r(
3342         debug_start_match(rx, utf8_target, stringarg, strend,
3343         "Matching");
3344     );
3345
3346     startpos = stringarg;
3347
3348     /* set these early as they may be used by the HOP macros below */
3349     reginfo->strbeg = strbeg;
3350     reginfo->strend = strend;
3351     reginfo->is_utf8_target = cBOOL(utf8_target);
3352
3353     if (prog->intflags & PREGf_GPOS_SEEN) {
3354         MAGIC *mg;
3355
3356         /* set reginfo->ganch, the position where \G can match */
3357
3358         reginfo->ganch =
3359             (flags & REXEC_IGNOREPOS)
3360             ? stringarg /* use start pos rather than pos() */
3361             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3362               /* Defined pos(): */
3363             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3364             : strbeg; /* pos() not defined; use start of string */
3365
3366         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3367             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3368
3369         /* in the presence of \G, we may need to start looking earlier in
3370          * the string than the suggested start point of stringarg:
3371          * if prog->gofs is set, then that's a known, fixed minimum
3372          * offset, such as
3373          * /..\G/:   gofs = 2
3374          * /ab|c\G/: gofs = 1
3375          * or if the minimum offset isn't known, then we have to go back
3376          * to the start of the string, e.g. /w+\G/
3377          */
3378
3379         if (prog->intflags & PREGf_ANCH_GPOS) {
3380             if (prog->gofs) {
3381                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3382                 if (!startpos ||
3383                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3384                 {
3385                     DEBUG_r(Perl_re_printf( aTHX_
3386                             "fail: ganch-gofs before earliest possible start\n"));
3387                     return 0;
3388                 }
3389             }
3390             else
3391                 startpos = reginfo->ganch;
3392         }
3393         else if (prog->gofs) {
3394             startpos = HOPBACKc(startpos, prog->gofs);
3395             if (!startpos)
3396                 startpos = strbeg;
3397         }
3398         else if (prog->intflags & PREGf_GPOS_FLOAT)
3399             startpos = strbeg;
3400     }
3401
3402     minlen = prog->minlen;
3403     if ((startpos + minlen) > strend || startpos < strbeg) {
3404         DEBUG_r(Perl_re_printf( aTHX_
3405                     "Regex match can't succeed, so not even tried\n"));
3406         return 0;
3407     }
3408
3409     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3410      * which will call destuctors to reset PL_regmatch_state, free higher
3411      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3412      * regmatch_info_aux_eval */
3413
3414     oldsave = PL_savestack_ix;
3415
3416     s = startpos;
3417
3418     if ((prog->extflags & RXf_USE_INTUIT)
3419         && !(flags & REXEC_CHECKED))
3420     {
3421         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3422                                     flags, NULL);
3423         if (!s)
3424             return 0;
3425
3426         if (prog->extflags & RXf_CHECK_ALL) {
3427             /* we can match based purely on the result of INTUIT.
3428              * Set up captures etc just for $& and $-[0]
3429              * (an intuit-only match wont have $1,$2,..) */
3430             assert(!prog->nparens);
3431
3432             /* s/// doesn't like it if $& is earlier than where we asked it to
3433              * start searching (which can happen on something like /.\G/) */
3434             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3435                     && (s < stringarg))
3436             {
3437                 /* this should only be possible under \G */
3438                 assert(prog->intflags & PREGf_GPOS_SEEN);
3439                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3440                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3441                 goto phooey;
3442             }
3443
3444             /* match via INTUIT shouldn't have any captures.
3445              * Let @-, @+, $^N know */
3446             prog->lastparen = prog->lastcloseparen = 0;
3447             RXp_MATCH_UTF8_set(prog, utf8_target);
3448             prog->offs[0].start = s - strbeg;
3449             prog->offs[0].end = utf8_target
3450                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3451                 : s - strbeg + prog->minlenret;
3452             if ( !(flags & REXEC_NOT_FIRST) )
3453                 S_reg_set_capture_string(aTHX_ rx,
3454                                         strbeg, strend,
3455                                         sv, flags, utf8_target);
3456
3457             return 1;
3458         }
3459     }
3460
3461     multiline = prog->extflags & RXf_PMf_MULTILINE;
3462
3463     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3464         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3465                               "String too short [regexec_flags]...\n"));
3466         goto phooey;
3467     }
3468
3469     /* Check validity of program. */
3470     if (UCHARAT(progi->program) != REG_MAGIC) {
3471         Perl_croak(aTHX_ "corrupted regexp program");
3472     }
3473
3474     RXp_MATCH_TAINTED_off(prog);
3475     RXp_MATCH_UTF8_set(prog, utf8_target);
3476
3477     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3478     reginfo->intuit = 0;
3479     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3480     reginfo->warned = FALSE;
3481     reginfo->sv = sv;
3482     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3483     /* see how far we have to get to not match where we matched before */
3484     reginfo->till = stringarg + minend;
3485
3486     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3487         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3488            S_cleanup_regmatch_info_aux has executed (registered by
3489            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3490            magic belonging to this SV.
3491            Not newSVsv, either, as it does not COW.
3492         */
3493         reginfo->sv = newSV(0);
3494         SvSetSV_nosteal(reginfo->sv, sv);
3495         SAVEFREESV(reginfo->sv);
3496     }
3497
3498     /* reserve next 2 or 3 slots in PL_regmatch_state:
3499      * slot N+0: may currently be in use: skip it
3500      * slot N+1: use for regmatch_info_aux struct
3501      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3502      * slot N+3: ready for use by regmatch()
3503      */
3504
3505     {
3506         regmatch_state *old_regmatch_state;
3507         regmatch_slab  *old_regmatch_slab;
3508         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3509
3510         /* on first ever match, allocate first slab */
3511         if (!PL_regmatch_slab) {
3512             Newx(PL_regmatch_slab, 1, regmatch_slab);
3513             PL_regmatch_slab->prev = NULL;
3514             PL_regmatch_slab->next = NULL;
3515             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3516         }
3517
3518         old_regmatch_state = PL_regmatch_state;
3519         old_regmatch_slab  = PL_regmatch_slab;
3520
3521         for (i=0; i <= max; i++) {
3522             if (i == 1)
3523                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3524             else if (i ==2)
3525                 reginfo->info_aux_eval =
3526                 reginfo->info_aux->info_aux_eval =
3527                             &(PL_regmatch_state->u.info_aux_eval);
3528
3529             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3530                 PL_regmatch_state = S_push_slab(aTHX);
3531         }
3532
3533         /* note initial PL_regmatch_state position; at end of match we'll
3534          * pop back to there and free any higher slabs */
3535
3536         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3537         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3538         reginfo->info_aux->poscache = NULL;
3539
3540         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3541
3542         if ((prog->extflags & RXf_EVAL_SEEN))
3543             S_setup_eval_state(aTHX_ reginfo);
3544         else
3545             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3546     }
3547
3548     /* If there is a "must appear" string, look for it. */
3549
3550     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3551         /* We have to be careful. If the previous successful match
3552            was from this regex we don't want a subsequent partially
3553            successful match to clobber the old results.
3554            So when we detect this possibility we add a swap buffer
3555            to the re, and switch the buffer each match. If we fail,
3556            we switch it back; otherwise we leave it swapped.
3557         */
3558         swap = prog->offs;
3559         /* do we need a save destructor here for eval dies? */
3560         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3561         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3562             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3563             0,
3564             PTR2UV(prog),
3565             PTR2UV(swap),
3566             PTR2UV(prog->offs)
3567         ));
3568     }
3569
3570     if (prog->recurse_locinput)
3571         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3572
3573     /* Simplest case: anchored match need be tried only once, or with
3574      * MBOL, only at the beginning of each line.
3575      *
3576      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3577      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3578      * match at the start of the string then it won't match anywhere else
3579      * either; while with /.*.../, if it doesn't match at the beginning,
3580      * the earliest it could match is at the start of the next line */
3581
3582     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3583         char *end;
3584
3585         if (regtry(reginfo, &s))
3586             goto got_it;
3587
3588         if (!(prog->intflags & PREGf_ANCH_MBOL))
3589             goto phooey;
3590
3591         /* didn't match at start, try at other newline positions */
3592
3593         if (minlen)
3594             dontbother = minlen - 1;
3595         end = HOP3c(strend, -dontbother, strbeg) - 1;
3596
3597         /* skip to next newline */
3598
3599         while (s <= end) { /* note it could be possible to match at the end of the string */
3600             /* NB: newlines are the same in unicode as they are in latin */
3601             if (*s++ != '\n')
3602                 continue;
3603             if (prog->check_substr || prog->check_utf8) {
3604             /* note that with PREGf_IMPLICIT, intuit can only fail
3605              * or return the start position, so it's of limited utility.
3606              * Nevertheless, I made the decision that the potential for
3607              * quick fail was still worth it - DAPM */
3608                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3609                 if (!s)
3610                     goto phooey;
3611             }
3612             if (regtry(reginfo, &s))
3613                 goto got_it;
3614         }
3615         goto phooey;
3616     } /* end anchored search */
3617
3618     if (prog->intflags & PREGf_ANCH_GPOS)
3619     {
3620         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3621         assert(prog->intflags & PREGf_GPOS_SEEN);
3622         /* For anchored \G, the only position it can match from is
3623          * (ganch-gofs); we already set startpos to this above; if intuit
3624          * moved us on from there, we can't possibly succeed */
3625         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3626         if (s == startpos && regtry(reginfo, &s))
3627             goto got_it;
3628         goto phooey;
3629     }
3630
3631     /* Messy cases:  unanchored match. */
3632     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3633         /* we have /x+whatever/ */
3634         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3635         char ch;
3636 #ifdef DEBUGGING
3637         int did_match = 0;
3638 #endif
3639         if (utf8_target) {
3640             if (! prog->anchored_utf8) {
3641                 to_utf8_substr(prog);
3642             }
3643             ch = SvPVX_const(prog->anchored_utf8)[0];
3644             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3645                 if (*s == ch) {
3646                     DEBUG_EXECUTE_r( did_match = 1 );
3647                     if (regtry(reginfo, &s)) goto got_it;
3648                     s += UTF8SKIP(s);
3649                     while (s < strend && *s == ch)
3650                         s += UTF8SKIP(s);
3651                 }
3652             );
3653
3654         }
3655         else {
3656             if (! prog->anchored_substr) {
3657                 if (! to_byte_substr(prog)) {
3658                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3659                 }
3660             }
3661             ch = SvPVX_const(prog->anchored_substr)[0];
3662             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3663                 if (*s == ch) {
3664                     DEBUG_EXECUTE_r( did_match = 1 );
3665                     if (regtry(reginfo, &s)) goto got_it;
3666                     s++;
3667                     while (s < strend && *s == ch)
3668                         s++;
3669                 }
3670             );
3671         }
3672         DEBUG_EXECUTE_r(if (!did_match)
3673                 Perl_re_printf( aTHX_
3674                                   "Did not find anchored character...\n")
3675                );
3676     }
3677     else if (prog->anchored_substr != NULL
3678               || prog->anchored_utf8 != NULL
3679               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3680                   && prog->float_max_offset < strend - s)) {
3681         SV *must;
3682         SSize_t back_max;
3683         SSize_t back_min;
3684         char *last;
3685         char *last1;            /* Last position checked before */
3686 #ifdef DEBUGGING
3687         int did_match = 0;
3688 #endif
3689         if (prog->anchored_substr || prog->anchored_utf8) {
3690             if (utf8_target) {
3691                 if (! prog->anchored_utf8) {
3692                     to_utf8_substr(prog);
3693                 }
3694                 must = prog->anchored_utf8;
3695             }
3696             else {
3697                 if (! prog->anchored_substr) {
3698                     if (! to_byte_substr(prog)) {
3699                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3700                     }
3701                 }
3702                 must = prog->anchored_substr;
3703             }
3704             back_max = back_min = prog->anchored_offset;
3705         } else {
3706             if (utf8_target) {
3707                 if (! prog->float_utf8) {
3708                     to_utf8_substr(prog);
3709                 }
3710                 must = prog->float_utf8;
3711             }
3712             else {
3713                 if (! prog->float_substr) {
3714                     if (! to_byte_substr(prog)) {
3715                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3716                     }
3717                 }
3718                 must = prog->float_substr;
3719             }
3720             back_max = prog->float_max_offset;
3721             back_min = prog->float_min_offset;
3722         }
3723
3724         if (back_min<0) {
3725             last = strend;
3726         } else {
3727             last = HOP3c(strend,        /* Cannot start after this */
3728                   -(SSize_t)(CHR_SVLEN(must)
3729                          - (SvTAIL(must) != 0) + back_min), strbeg);
3730         }
3731         if (s > reginfo->strbeg)
3732             last1 = HOPc(s, -1);
3733         else
3734             last1 = s - 1;      /* bogus */
3735
3736         /* XXXX check_substr already used to find "s", can optimize if
3737            check_substr==must. */
3738         dontbother = 0;
3739         strend = HOPc(strend, -dontbother);
3740         while ( (s <= last) &&
3741                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3742                                   (unsigned char*)strend, must,
3743                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3744             DEBUG_EXECUTE_r( did_match = 1 );
3745             if (HOPc(s, -back_max) > last1) {
3746                 last1 = HOPc(s, -back_min);
3747                 s = HOPc(s, -back_max);
3748             }
3749             else {
3750                 char * const t = (last1 >= reginfo->strbeg)
3751                                     ? HOPc(last1, 1) : last1 + 1;
3752
3753                 last1 = HOPc(s, -back_min);
3754                 s = t;
3755             }
3756             if (utf8_target) {
3757                 while (s <= last1) {
3758                     if (regtry(reginfo, &s))
3759                         goto got_it;
3760                     if (s >= last1) {
3761                         s++; /* to break out of outer loop */
3762                         break;
3763                     }
3764                     s += UTF8SKIP(s);
3765                 }
3766             }
3767             else {
3768                 while (s <= last1) {
3769                     if (regtry(reginfo, &s))
3770                         goto got_it;
3771                     s++;
3772                 }
3773             }
3774         }
3775         DEBUG_EXECUTE_r(if (!did_match) {
3776             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3777                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3778             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3779                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3780                                ? "anchored" : "floating"),
3781                 quoted, RE_SV_TAIL(must));
3782         });
3783         goto phooey;
3784     }
3785     else if ( (c = progi->regstclass) ) {
3786         if (minlen) {
3787             const OPCODE op = OP(progi->regstclass);
3788             /* don't bother with what can't match */
3789             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3790                 strend = HOPc(strend, -(minlen - 1));
3791         }
3792         DEBUG_EXECUTE_r({
3793             SV * const prop = sv_newmortal();
3794             regprop(prog, prop, c, reginfo, NULL);
3795             {
3796                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3797                     s,strend-s,PL_dump_re_max_len);
3798                 Perl_re_printf( aTHX_
3799                     "Matching stclass %.*s against %s (%d bytes)\n",
3800                     (int)SvCUR(prop), SvPVX_const(prop),
3801                      quoted, (int)(strend - s));
3802             }
3803         });
3804         if (find_byclass(prog, c, s, strend, reginfo))
3805             goto got_it;
3806         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3807     }
3808     else {
3809         dontbother = 0;
3810         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3811             /* Trim the end. */
3812             char *last= NULL;
3813             SV* float_real;
3814             STRLEN len;
3815             const char *little;
3816
3817             if (utf8_target) {
3818                 if (! prog->float_utf8) {
3819                     to_utf8_substr(prog);
3820                 }
3821                 float_real = prog->float_utf8;
3822             }
3823             else {
3824                 if (! prog->float_substr) {
3825                     if (! to_byte_substr(prog)) {
3826                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3827                     }
3828                 }
3829                 float_real = prog->float_substr;
3830             }
3831
3832             little = SvPV_const(float_real, len);
3833             if (SvTAIL(float_real)) {
3834                     /* This means that float_real contains an artificial \n on
3835                      * the end due to the presence of something like this:
3836                      * /foo$/ where we can match both "foo" and "foo\n" at the
3837                      * end of the string.  So we have to compare the end of the
3838                      * string first against the float_real without the \n and
3839                      * then against the full float_real with the string.  We
3840                      * have to watch out for cases where the string might be
3841                      * smaller than the float_real or the float_real without
3842                      * the \n. */
3843                     char *checkpos= strend - len;
3844                     DEBUG_OPTIMISE_r(
3845                         Perl_re_printf( aTHX_
3846                             "%sChecking for float_real.%s\n",
3847                             PL_colors[4], PL_colors[5]));
3848                     if (checkpos + 1 < strbeg) {
3849                         /* can't match, even if we remove the trailing \n
3850                          * string is too short to match */
3851                         DEBUG_EXECUTE_r(
3852                             Perl_re_printf( aTHX_
3853                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3854                                 PL_colors[4], PL_colors[5]));
3855                         goto phooey;
3856                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3857                         /* can match, the end of the string matches without the
3858                          * "\n" */
3859                         last = checkpos + 1;
3860                     } else if (checkpos < strbeg) {
3861                         /* cant match, string is too short when the "\n" is
3862                          * included */
3863                         DEBUG_EXECUTE_r(
3864                             Perl_re_printf( aTHX_
3865                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3866                                 PL_colors[4], PL_colors[5]));
3867                         goto phooey;
3868                     } else if (!multiline) {
3869                         /* non multiline match, so compare with the "\n" at the
3870                          * end of the string */
3871                         if (memEQ(checkpos, little, len)) {
3872                             last= checkpos;
3873                         } else {
3874                             DEBUG_EXECUTE_r(
3875                                 Perl_re_printf( aTHX_
3876                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3877                                     PL_colors[4], PL_colors[5]));
3878                             goto phooey;
3879                         }
3880                     } else {
3881                         /* multiline match, so we have to search for a place
3882                          * where the full string is located */
3883                         goto find_last;
3884                     }
3885             } else {
3886                   find_last:
3887                     if (len)
3888                         last = rninstr(s, strend, little, little + len);
3889                     else
3890                         last = strend;  /* matching "$" */
3891             }
3892             if (!last) {
3893                 /* at one point this block contained a comment which was
3894                  * probably incorrect, which said that this was a "should not
3895                  * happen" case.  Even if it was true when it was written I am
3896                  * pretty sure it is not anymore, so I have removed the comment
3897                  * and replaced it with this one. Yves */
3898                 DEBUG_EXECUTE_r(
3899                     Perl_re_printf( aTHX_
3900                         "%sString does not contain required substring, cannot match.%s\n",
3901                         PL_colors[4], PL_colors[5]
3902                     ));
3903                 goto phooey;
3904             }
3905             dontbother = strend - last + prog->float_min_offset;
3906         }
3907         if (minlen && (dontbother < minlen))
3908             dontbother = minlen - 1;
3909         strend -= dontbother;              /* this one's always in bytes! */
3910         /* We don't know much -- general case. */
3911         if (utf8_target) {
3912             for (;;) {
3913                 if (regtry(reginfo, &s))
3914                     goto got_it;
3915                 if (s >= strend)
3916                     break;
3917                 s += UTF8SKIP(s);
3918             };
3919         }
3920         else {
3921             do {
3922                 if (regtry(reginfo, &s))
3923                     goto got_it;
3924             } while (s++ < strend);
3925         }
3926     }
3927
3928     /* Failure. */
3929     goto phooey;
3930
3931   got_it:
3932     /* s/// doesn't like it if $& is earlier than where we asked it to
3933      * start searching (which can happen on something like /.\G/) */
3934     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3935             && (prog->offs[0].start < stringarg - strbeg))
3936     {
3937         /* this should only be possible under \G */
3938         assert(prog->intflags & PREGf_GPOS_SEEN);
3939         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3940             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3941         goto phooey;
3942     }
3943
3944     DEBUG_BUFFERS_r(
3945         if (swap)
3946             Perl_re_exec_indentf( aTHX_
3947                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3948                 0,
3949                 PTR2UV(prog),
3950                 PTR2UV(swap)
3951             );
3952     );
3953     Safefree(swap);
3954
3955     /* clean up; this will trigger destructors that will free all slabs
3956      * above the current one, and cleanup the regmatch_info_aux
3957      * and regmatch_info_aux_eval sructs */
3958
3959     LEAVE_SCOPE(oldsave);
3960
3961     if (RXp_PAREN_NAMES(prog))
3962         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3963
3964     /* make sure $`, $&, $', and $digit will work later */
3965     if ( !(flags & REXEC_NOT_FIRST) )
3966         S_reg_set_capture_string(aTHX_ rx,
3967                                     strbeg, reginfo->strend,
3968                                     sv, flags, utf8_target);
3969
3970     return 1;
3971
3972   phooey:
3973     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3974                           PL_colors[4], PL_colors[5]));
3975
3976     /* clean up; this will trigger destructors that will free all slabs
3977      * above the current one, and cleanup the regmatch_info_aux
3978      * and regmatch_info_aux_eval sructs */
3979
3980     LEAVE_SCOPE(oldsave);
3981
3982     if (swap) {
3983         /* we failed :-( roll it back */
3984         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3985             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3986             0,
3987             PTR2UV(prog),
3988             PTR2UV(prog->offs),
3989             PTR2UV(swap)
3990         ));
3991         Safefree(prog->offs);
3992         prog->offs = swap;
3993     }
3994     return 0;
3995 }
3996
3997
3998 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3999  * Do inc before dec, in case old and new rex are the same */
4000 #define SET_reg_curpm(Re2)                          \
4001     if (reginfo->info_aux_eval) {                   \
4002         (void)ReREFCNT_inc(Re2);                    \
4003         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
4004         PM_SETRE((PL_reg_curpm), (Re2));            \
4005     }
4006
4007
4008 /*
4009  - regtry - try match at specific point
4010  */
4011 STATIC bool                     /* 0 failure, 1 success */
4012 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4013 {
4014     CHECKPOINT lastcp;
4015     REGEXP *const rx = reginfo->prog;
4016     regexp *const prog = ReANY(rx);
4017     SSize_t result;
4018 #ifdef DEBUGGING
4019     U32 depth = 0; /* used by REGCP_SET */
4020 #endif
4021     RXi_GET_DECL(prog,progi);
4022     GET_RE_DEBUG_FLAGS_DECL;
4023
4024     PERL_ARGS_ASSERT_REGTRY;
4025
4026     reginfo->cutpoint=NULL;
4027
4028     prog->offs[0].start = *startposp - reginfo->strbeg;
4029     prog->lastparen = 0;
4030     prog->lastcloseparen = 0;
4031
4032     /* XXXX What this code is doing here?!!!  There should be no need
4033        to do this again and again, prog->lastparen should take care of
4034        this!  --ilya*/
4035
4036     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4037      * Actually, the code in regcppop() (which Ilya may be meaning by
4038      * prog->lastparen), is not needed at all by the test suite
4039      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4040      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4041      * Meanwhile, this code *is* needed for the
4042      * above-mentioned test suite tests to succeed.  The common theme
4043      * on those tests seems to be returning null fields from matches.
4044      * --jhi updated by dapm */
4045
4046     /* After encountering a variant of the issue mentioned above I think
4047      * the point Ilya was making is that if we properly unwind whenever
4048      * we set lastparen to a smaller value then we should not need to do
4049      * this every time, only when needed. So if we have tests that fail if
4050      * we remove this, then it suggests somewhere else we are improperly
4051      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4052      * places it is called, and related regcp() routines. - Yves */
4053 #if 1
4054     if (prog->nparens) {
4055         regexp_paren_pair *pp = prog->offs;
4056         I32 i;
4057         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4058             ++pp;
4059             pp->start = -1;
4060             pp->end = -1;
4061         }
4062     }
4063 #endif
4064     REGCP_SET(lastcp);
4065     result = regmatch(reginfo, *startposp, progi->program + 1);
4066     if (result != -1) {
4067         prog->offs[0].end = result;
4068         return 1;
4069     }
4070     if (reginfo->cutpoint)
4071         *startposp= reginfo->cutpoint;
4072     REGCP_UNWIND(lastcp);
4073     return 0;
4074 }
4075
4076
4077 #define sayYES goto yes
4078 #define sayNO goto no
4079 #define sayNO_SILENT goto no_silent
4080
4081 /* we dont use STMT_START/END here because it leads to
4082    "unreachable code" warnings, which are bogus, but distracting. */
4083 #define CACHEsayNO \
4084     if (ST.cache_mask) \
4085        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4086     sayNO
4087
4088 /* this is used to determine how far from the left messages like
4089    'failed...' are printed in regexec.c. It should be set such that
4090    messages are inline with the regop output that created them.
4091 */
4092 #define REPORT_CODE_OFF 29
4093 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4094 #ifdef DEBUGGING
4095 int
4096 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4097 {
4098     va_list ap;
4099     int result;
4100     PerlIO *f= Perl_debug_log;
4101     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4102     va_start(ap, depth);
4103     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4104     result = PerlIO_vprintf(f, fmt, ap);
4105     va_end(ap);
4106     return result;
4107 }
4108 #endif /* DEBUGGING */
4109
4110
4111 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4112 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4113 #define CHRTEST_NOT_A_CP_1 -999
4114 #define CHRTEST_NOT_A_CP_2 -998
4115
4116 /* grab a new slab and return the first slot in it */
4117
4118 STATIC regmatch_state *
4119 S_push_slab(pTHX)
4120 {
4121     regmatch_slab *s = PL_regmatch_slab->next;
4122     if (!s) {
4123         Newx(s, 1, regmatch_slab);
4124         s->prev = PL_regmatch_slab;
4125         s->next = NULL;
4126         PL_regmatch_slab->next = s;
4127     }
4128     PL_regmatch_slab = s;
4129     return SLAB_FIRST(s);
4130 }
4131
4132
4133 /* push a new state then goto it */
4134
4135 #define PUSH_STATE_GOTO(state, node, input) \
4136     pushinput = input; \
4137     scan = node; \
4138     st->resume_state = state; \
4139     goto push_state;
4140
4141 /* push a new state with success backtracking, then goto it */
4142
4143 #define PUSH_YES_STATE_GOTO(state, node, input) \
4144     pushinput = input; \
4145     scan = node; \
4146     st->resume_state = state; \
4147     goto push_yes_state;
4148
4149
4150
4151
4152 /*
4153
4154 regmatch() - main matching routine
4155
4156 This is basically one big switch statement in a loop. We execute an op,
4157 set 'next' to point the next op, and continue. If we come to a point which
4158 we may need to backtrack to on failure such as (A|B|C), we push a
4159 backtrack state onto the backtrack stack. On failure, we pop the top
4160 state, and re-enter the loop at the state indicated. If there are no more
4161 states to pop, we return failure.
4162
4163 Sometimes we also need to backtrack on success; for example /A+/, where
4164 after successfully matching one A, we need to go back and try to
4165 match another one; similarly for lookahead assertions: if the assertion
4166 completes successfully, we backtrack to the state just before the assertion
4167 and then carry on.  In these cases, the pushed state is marked as
4168 'backtrack on success too'. This marking is in fact done by a chain of
4169 pointers, each pointing to the previous 'yes' state. On success, we pop to
4170 the nearest yes state, discarding any intermediate failure-only states.
4171 Sometimes a yes state is pushed just to force some cleanup code to be
4172 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4173 it to free the inner regex.
4174
4175 Note that failure backtracking rewinds the cursor position, while
4176 success backtracking leaves it alone.
4177
4178 A pattern is complete when the END op is executed, while a subpattern
4179 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4180 ops trigger the "pop to last yes state if any, otherwise return true"
4181 behaviour.
4182
4183 A common convention in this function is to use A and B to refer to the two
4184 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4185 the subpattern to be matched possibly multiple times, while B is the entire
4186 rest of the pattern. Variable and state names reflect this convention.
4187
4188 The states in the main switch are the union of ops and failure/success of
4189 substates associated with with that op.  For example, IFMATCH is the op
4190 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4191 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4192 successfully matched A and IFMATCH_A_fail is a state saying that we have
4193 just failed to match A. Resume states always come in pairs. The backtrack
4194 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4195 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4196 on success or failure.
4197
4198 The struct that holds a backtracking state is actually a big union, with
4199 one variant for each major type of op. The variable st points to the
4200 top-most backtrack struct. To make the code clearer, within each
4201 block of code we #define ST to alias the relevant union.
4202
4203 Here's a concrete example of a (vastly oversimplified) IFMATCH
4204 implementation:
4205
4206     switch (state) {
4207     ....
4208
4209 #define ST st->u.ifmatch
4210
4211     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4212         ST.foo = ...; // some state we wish to save
4213         ...
4214         // push a yes backtrack state with a resume value of
4215         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4216         // first node of A:
4217         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4218         // NOTREACHED
4219
4220     case IFMATCH_A: // we have successfully executed A; now continue with B
4221         next = B;
4222         bar = ST.foo; // do something with the preserved value
4223         break;
4224
4225     case IFMATCH_A_fail: // A failed, so the assertion failed
4226         ...;   // do some housekeeping, then ...
4227         sayNO; // propagate the failure
4228
4229 #undef ST
4230
4231     ...
4232     }
4233
4234 For any old-timers reading this who are familiar with the old recursive
4235 approach, the code above is equivalent to:
4236
4237     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4238     {
4239         int foo = ...
4240         ...
4241         if (regmatch(A)) {
4242             next = B;
4243             bar = foo;
4244             break;
4245         }
4246         ...;   // do some housekeeping, then ...
4247         sayNO; // propagate the failure
4248     }
4249
4250 The topmost backtrack state, pointed to by st, is usually free. If you
4251 want to claim it, populate any ST.foo fields in it with values you wish to
4252 save, then do one of
4253
4254         PUSH_STATE_GOTO(resume_state, node, newinput);
4255         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4256
4257 which sets that backtrack state's resume value to 'resume_state', pushes a
4258 new free entry to the top of the backtrack stack, then goes to 'node'.
4259 On backtracking, the free slot is popped, and the saved state becomes the
4260 new free state. An ST.foo field in this new top state can be temporarily
4261 accessed to retrieve values, but once the main loop is re-entered, it
4262 becomes available for reuse.
4263
4264 Note that the depth of the backtrack stack constantly increases during the
4265 left-to-right execution of the pattern, rather than going up and down with
4266 the pattern nesting. For example the stack is at its maximum at Z at the
4267 end of the pattern, rather than at X in the following:
4268
4269     /(((X)+)+)+....(Y)+....Z/
4270
4271 The only exceptions to this are lookahead/behind assertions and the cut,
4272 (?>A), which pop all the backtrack states associated with A before
4273 continuing.
4274
4275 Backtrack state structs are allocated in slabs of about 4K in size.
4276 PL_regmatch_state and st always point to the currently active state,
4277 and PL_regmatch_slab points to the slab currently containing
4278 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4279 allocated, and is never freed until interpreter destruction. When the slab
4280 is full, a new one is allocated and chained to the end. At exit from
4281 regmatch(), slabs allocated since entry are freed.
4282
4283 */
4284
4285
4286 #define DEBUG_STATE_pp(pp)                                  \
4287     DEBUG_STATE_r({                                         \
4288         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4289         Perl_re_printf( aTHX_                                           \
4290             "%*s" pp " %s%s%s%s%s\n",                       \
4291             INDENT_CHARS(depth), "",                        \
4292             PL_reg_name[st->resume_state],                  \
4293             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4294             ((st==yes_state) ? "Y" : ""),                   \
4295             ((st==mark_state) ? "M" : ""),                  \
4296             ((st==yes_state||st==mark_state) ? "]" : "")    \
4297         );                                                  \
4298     });
4299
4300
4301 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4302
4303 #ifdef DEBUGGING
4304
4305 STATIC void
4306 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4307     const char *start, const char *end, const char *blurb)
4308 {
4309     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4310
4311     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4312
4313     if (!PL_colorset)
4314             reginitcolors();
4315     {
4316         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4317             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4318
4319         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4320             start, end - start, PL_dump_re_max_len);
4321
4322         Perl_re_printf( aTHX_
4323             "%s%s REx%s %s against %s\n",
4324                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4325
4326         if (utf8_target||utf8_pat)
4327             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4328                 utf8_pat ? "pattern" : "",
4329                 utf8_pat && utf8_target ? " and " : "",
4330                 utf8_target ? "string" : ""
4331             );
4332     }
4333 }
4334
4335 STATIC void
4336 S_dump_exec_pos(pTHX_ const char *locinput,
4337                       const regnode *scan,
4338                       const char *loc_regeol,
4339                       const char *loc_bostr,
4340                       const char *loc_reg_starttry,
4341                       const bool utf8_target,
4342                       const U32 depth
4343                 )
4344 {
4345     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4346     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4347     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4348     /* The part of the string before starttry has one color
4349        (pref0_len chars), between starttry and current
4350        position another one (pref_len - pref0_len chars),
4351        after the current position the third one.
4352        We assume that pref0_len <= pref_len, otherwise we
4353        decrease pref0_len.  */
4354     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4355         ? (5 + taill) - l : locinput - loc_bostr;
4356     int pref0_len;
4357
4358     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4359
4360     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4361         pref_len++;
4362     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4363     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4364         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4365               ? (5 + taill) - pref_len : loc_regeol - locinput);
4366     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4367         l--;
4368     if (pref0_len < 0)
4369         pref0_len = 0;
4370     if (pref0_len > pref_len)
4371         pref0_len = pref_len;
4372     {
4373         const int is_uni = utf8_target ? 1 : 0;
4374
4375         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4376             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4377
4378         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4379                     (locinput - pref_len + pref0_len),
4380                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4381
4382         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4383                     locinput, loc_regeol - locinput, 10, 0, 1);
4384
4385         const STRLEN tlen=len0+len1+len2;
4386         Perl_re_printf( aTHX_
4387                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4388                     (IV)(locinput - loc_bostr),
4389                     len0, s0,
4390                     len1, s1,
4391                     (docolor ? "" : "> <"),
4392                     len2, s2,
4393                     (int)(tlen > 19 ? 0 :  19 - tlen),
4394                     "",
4395                     depth);
4396     }
4397 }
4398
4399 #endif
4400
4401 /* reg_check_named_buff_matched()
4402  * Checks to see if a named buffer has matched. The data array of
4403  * buffer numbers corresponding to the buffer is expected to reside
4404  * in the regexp->data->data array in the slot stored in the ARG() of
4405  * node involved. Note that this routine doesn't actually care about the
4406  * name, that information is not preserved from compilation to execution.
4407  * Returns the index of the leftmost defined buffer with the given name
4408  * or 0 if non of the buffers matched.
4409  */
4410 STATIC I32
4411 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4412 {
4413     I32 n;
4414     RXi_GET_DECL(rex,rexi);
4415     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4416     I32 *nums=(I32*)SvPVX(sv_dat);
4417
4418     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4419
4420     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4421         if ((I32)rex->lastparen >= nums[n] &&
4422             rex->offs[nums[n]].end != -1)
4423         {
4424             return nums[n];
4425         }
4426     }
4427     return 0;
4428 }
4429
4430
4431 static bool
4432 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4433         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4434 {
4435     /* This function determines if there are one or two characters that match
4436      * the first character of the passed-in EXACTish node <text_node>, and if
4437      * so, returns them in the passed-in pointers.
4438      *
4439      * If it determines that no possible character in the target string can
4440      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4441      * the first character in <text_node> requires UTF-8 to represent, and the
4442      * target string isn't in UTF-8.)
4443      *
4444      * If there are more than two characters that could match the beginning of
4445      * <text_node>, or if more context is required to determine a match or not,
4446      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4447      *
4448      * The motiviation behind this function is to allow the caller to set up
4449      * tight loops for matching.  If <text_node> is of type EXACT, there is
4450      * only one possible character that can match its first character, and so
4451      * the situation is quite simple.  But things get much more complicated if
4452      * folding is involved.  It may be that the first character of an EXACTFish
4453      * node doesn't participate in any possible fold, e.g., punctuation, so it
4454      * can be matched only by itself.  The vast majority of characters that are
4455      * in folds match just two things, their lower and upper-case equivalents.
4456      * But not all are like that; some have multiple possible matches, or match
4457      * sequences of more than one character.  This function sorts all that out.
4458      *
4459      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4460      * loop of trying to match A*, we know we can't exit where the thing
4461      * following it isn't a B.  And something can't be a B unless it is the
4462      * beginning of B.  By putting a quick test for that beginning in a tight
4463      * loop, we can rule out things that can't possibly be B without having to
4464      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4465      * character, we can make a tight loop matching A*, using the outputs of
4466      * this function.
4467      *
4468      * If the target string to match isn't in UTF-8, and there aren't
4469      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4470      * the one or two possible octets (which are characters in this situation)
4471      * that can match.  In all cases, if there is only one character that can
4472      * match, *<c1p> and *<c2p> will be identical.
4473      *
4474      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4475      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4476      * can match the beginning of <text_node>.  They should be declared with at
4477      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4478      * undefined what these contain.)  If one or both of the buffers are
4479      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4480      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4481      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4482      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4483      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4484
4485     const bool utf8_target = reginfo->is_utf8_target;
4486
4487     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4488     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4489     bool use_chrtest_void = FALSE;
4490     const bool is_utf8_pat = reginfo->is_utf8_pat;
4491
4492     /* Used when we have both utf8 input and utf8 output, to avoid converting
4493      * to/from code points */
4494     bool utf8_has_been_setup = FALSE;
4495
4496     dVAR;
4497
4498     U8 *pat = (U8*)STRING(text_node);
4499     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4500
4501     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4502
4503         /* In an exact node, only one thing can be matched, that first
4504          * character.  If both the pat and the target are UTF-8, we can just
4505          * copy the input to the output, avoiding finding the code point of
4506          * that character */
4507         if (!is_utf8_pat) {
4508             c2 = c1 = *pat;
4509         }
4510         else if (utf8_target) {
4511             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4512             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4513             utf8_has_been_setup = TRUE;
4514         }
4515         else {
4516             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4517         }
4518     }
4519     else { /* an EXACTFish node */
4520         U8 *pat_end = pat + STR_LEN(text_node);
4521
4522         /* An EXACTFL node has at least some characters unfolded, because what
4523          * they match is not known until now.  So, now is the time to fold
4524          * the first few of them, as many as are needed to determine 'c1' and
4525          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4526          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4527          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4528          * need to fold as many characters as a single character can fold to,
4529          * so that later we can check if the first ones are such a multi-char
4530          * fold.  But, in such a pattern only locale-problematic characters
4531          * aren't folded, so we can skip this completely if the first character
4532          * in the node isn't one of the tricky ones */
4533         if (OP(text_node) == EXACTFL) {
4534
4535             if (! is_utf8_pat) {
4536                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4537                 {
4538                     folded[0] = folded[1] = 's';
4539                     pat = folded;
4540                     pat_end = folded + 2;
4541                 }
4542             }
4543             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4544                 U8 *s = pat;
4545                 U8 *d = folded;
4546                 int i;
4547
4548                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4549                     if (isASCII(*s)) {
4550                         *(d++) = (U8) toFOLD_LC(*s);
4551                         s++;
4552                     }
4553                     else {
4554                         STRLEN len;
4555                         _toFOLD_utf8_flags(s,
4556                                            pat_end,
4557                                            d,
4558                                            &len,
4559                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4560                         d += len;
4561                         s += UTF8SKIP(s);
4562                     }
4563                 }
4564
4565                 pat = folded;
4566                 pat_end = d;
4567             }
4568         }
4569
4570         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4571              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4572         {
4573             /* Multi-character folds require more context to sort out.  Also
4574              * PL_utf8_foldclosures used below doesn't handle them, so have to
4575              * be handled outside this routine */
4576             use_chrtest_void = TRUE;
4577         }
4578         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4579             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4580             if (c1 > 255) {
4581                 /* Load the folds hash, if not already done */
4582                 SV** listp;
4583                 if (! PL_utf8_foldclosures) {
4584                     _load_PL_utf8_foldclosures();
4585                 }
4586
4587                 /* The fold closures data structure is a hash with the keys
4588                  * being the UTF-8 of every character that is folded to, like
4589                  * 'k', and the values each an array of all code points that
4590                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4591                  * Multi-character folds are not included */
4592                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4593                                         (char *) pat,
4594                                         UTF8SKIP(pat),
4595                                         FALSE))))
4596                 {
4597                     /* Not found in the hash, therefore there are no folds
4598                     * containing it, so there is only a single character that
4599                     * could match */
4600                     c2 = c1;
4601                 }
4602                 else {  /* Does participate in folds */
4603                     AV* list = (AV*) *listp;
4604                     if (av_tindex_skip_len_mg(list) != 1) {
4605
4606                         /* If there aren't exactly two folds to this, it is
4607                          * outside the scope of this function */
4608                         use_chrtest_void = TRUE;
4609                     }
4610                     else {  /* There are two.  Get them */
4611                         SV** c_p = av_fetch(list, 0, FALSE);
4612                         if (c_p == NULL) {
4613                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4614                         }
4615                         c1 = SvUV(*c_p);
4616
4617                         c_p = av_fetch(list, 1, FALSE);
4618                         if (c_p == NULL) {
4619                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4620                         }
4621                         c2 = SvUV(*c_p);
4622
4623                         /* Folds that cross the 255/256 boundary are forbidden
4624                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFAA and
4625                          * one is ASCIII.  Since the pattern character is above
4626                          * 255, and its only other match is below 256, the only
4627                          * legal match will be to itself.  We have thrown away
4628                          * the original, so have to compute which is the one
4629                          * above 255. */
4630                         if ((c1 < 256) != (c2 < 256)) {
4631                             if ((OP(text_node) == EXACTFL
4632                                  && ! IN_UTF8_CTYPE_LOCALE)
4633                                 || ((OP(text_node) == EXACTFAA
4634                                     || OP(text_node) == EXACTFAA_NO_TRIE)
4635                                     && (isASCII(c1) || isASCII(c2))))
4636                             {
4637                                 if (c1 < 256) {
4638                                     c1 = c2;
4639                                 }
4640                                 else {
4641                                     c2 = c1;
4642                                 }
4643                             }
4644                         }
4645                     }
4646                 }
4647             }
4648             else /* Here, c1 is <= 255 */
4649                 if (utf8_target
4650                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4651                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4652                     && ((OP(text_node) != EXACTFAA
4653                         && OP(text_node) != EXACTFAA_NO_TRIE)
4654                         || ! isASCII(c1)))
4655             {
4656                 /* Here, there could be something above Latin1 in the target
4657                  * which folds to this character in the pattern.  All such
4658                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4659                  * than two characters involved in their folds, so are outside
4660                  * the scope of this function */
4661                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4662                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4663                 }
4664                 else {
4665                     use_chrtest_void = TRUE;
4666                 }
4667             }
4668             else { /* Here nothing above Latin1 can fold to the pattern
4669                       character */
4670                 switch (OP(text_node)) {
4671
4672                     case EXACTFL:   /* /l rules */
4673                         c2 = PL_fold_locale[c1];
4674                         break;
4675
4676                     case EXACTF:   /* This node only generated for non-utf8
4677                                     patterns */
4678                         assert(! is_utf8_pat);
4679                         if (! utf8_target) {    /* /d rules */
4680                             c2 = PL_fold[c1];
4681                             break;
4682                         }
4683                         /* FALLTHROUGH */
4684                         /* /u rules for all these.  This happens to work for
4685                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4686                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4687                                                 non-utf8 patterns */
4688                         assert(! is_utf8_pat);
4689                         /* FALLTHROUGH */
4690                     case EXACTFAA:
4691                     case EXACTFU_SS:
4692                     case EXACTFU:
4693                         c2 = PL_fold_latin1[c1];
4694                         break;
4695
4696                     default:
4697                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4698                         NOT_REACHED; /* NOTREACHED */
4699                 }
4700             }
4701         }
4702     }
4703
4704     /* Here have figured things out.  Set up the returns */
4705     if (use_chrtest_void) {
4706         *c2p = *c1p = CHRTEST_VOID;
4707     }
4708     else if (utf8_target) {
4709         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4710             uvchr_to_utf8(c1_utf8, c1);
4711             uvchr_to_utf8(c2_utf8, c2);
4712         }
4713
4714         /* Invariants are stored in both the utf8 and byte outputs; Use
4715          * negative numbers otherwise for the byte ones.  Make sure that the
4716          * byte ones are the same iff the utf8 ones are the same */
4717         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4718         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4719                 ? *c2_utf8
4720                 : (c1 == c2)
4721                   ? CHRTEST_NOT_A_CP_1
4722                   : CHRTEST_NOT_A_CP_2;
4723     }
4724     else if (c1 > 255) {
4725        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4726                            can represent */
4727            return FALSE;
4728        }
4729
4730        *c1p = *c2p = c2;    /* c2 is the only representable value */
4731     }
4732     else {  /* c1 is representable; see about c2 */
4733        *c1p = c1;
4734        *c2p = (c2 < 256) ? c2 : c1;
4735     }
4736
4737     return TRUE;
4738 }
4739
4740 STATIC bool
4741 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4742 {
4743     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4744      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4745
4746     PERL_ARGS_ASSERT_ISGCB;
4747
4748     switch (GCB_table[before][after]) {
4749         case GCB_BREAKABLE:
4750             return TRUE;
4751
4752         case GCB_NOBREAK:
4753             return FALSE;
4754
4755         case GCB_RI_then_RI:
4756             {
4757                 int RI_count = 1;
4758                 U8 * temp_pos = (U8 *) curpos;
4759
4760                 /* Do not break within emoji flag sequences. That is, do not
4761                  * break between regional indicator (RI) symbols if there is an
4762                  * odd number of RI characters before the break point.
4763                  *  GB12   sot (RI RI)* RI × RI
4764                  *  GB13 [^RI] (RI RI)* RI × RI */
4765
4766                 while (backup_one_GCB(strbeg,
4767                                     &temp_pos,
4768                                     utf8_target) == GCB_Regional_Indicator)
4769                 {
4770                     RI_count++;
4771                 }
4772
4773                 return RI_count % 2 != 1;
4774             }
4775
4776         case GCB_EX_then_EM:
4777
4778             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4779             {
4780                 U8 * temp_pos = (U8 *) curpos;
4781                 GCB_enum prev;
4782
4783                 do {
4784                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4785                 }
4786                 while (prev == GCB_Extend);
4787
4788                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4789             }
4790
4791         default:
4792             break;
4793     }
4794
4795 #ifdef DEBUGGING
4796     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4797                                   before, after, GCB_table[before][after]);
4798     assert(0);
4799 #endif
4800     return TRUE;
4801 }
4802
4803 STATIC GCB_enum
4804 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4805 {
4806     GCB_enum gcb;
4807
4808     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4809
4810     if (*curpos < strbeg) {
4811         return GCB_EDGE;
4812     }
4813
4814     if (utf8_target) {
4815         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4816         U8 * prev_prev_char_pos;
4817
4818         if (! prev_char_pos) {
4819             return GCB_EDGE;
4820         }
4821
4822         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4823             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4824             *curpos = prev_char_pos;
4825             prev_char_pos = prev_prev_char_pos;
4826         }
4827         else {
4828             *curpos = (U8 *) strbeg;
4829             return GCB_EDGE;
4830         }
4831     }
4832     else {
4833         if (*curpos - 2 < strbeg) {
4834             *curpos = (U8 *) strbeg;
4835             return GCB_EDGE;
4836         }
4837         (*curpos)--;
4838         gcb = getGCB_VAL_CP(*(*curpos - 1));
4839     }
4840
4841     return gcb;
4842 }
4843
4844 /* Combining marks attach to most classes that precede them, but this defines
4845  * the exceptions (from TR14) */
4846 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4847                                      || prev == LB_Mandatory_Break      \
4848                                      || prev == LB_Carriage_Return      \
4849                                      || prev == LB_Line_Feed            \
4850                                      || prev == LB_Next_Line            \
4851                                      || prev == LB_Space                \
4852                                      || prev == LB_ZWSpace))
4853
4854 STATIC bool
4855 S_isLB(pTHX_ LB_enum before,
4856              LB_enum after,
4857              const U8 * const strbeg,
4858              const U8 * const curpos,
4859              const U8 * const strend,
4860              const bool utf8_target)
4861 {
4862     U8 * temp_pos = (U8 *) curpos;
4863     LB_enum prev = before;
4864
4865     /* Is the boundary between 'before' and 'after' line-breakable?
4866      * Most of this is just a table lookup of a generated table from Unicode
4867      * rules.  But some rules require context to decide, and so have to be
4868      * implemented in code */
4869
4870     PERL_ARGS_ASSERT_ISLB;
4871
4872     /* Rule numbers in the comments below are as of Unicode 9.0 */
4873
4874   redo:
4875     before = prev;
4876     switch (LB_table[before][after]) {
4877         case LB_BREAKABLE:
4878             return TRUE;
4879
4880         case LB_NOBREAK:
4881         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4882             return FALSE;
4883
4884         case LB_SP_foo + LB_BREAKABLE:
4885         case LB_SP_foo + LB_NOBREAK:
4886         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4887
4888             /* When we have something following a SP, we have to look at the
4889              * context in order to know what to do.
4890              *
4891              * SP SP should not reach here because LB7: Do not break before
4892              * spaces.  (For two spaces in a row there is nothing that
4893              * overrides that) */
4894             assert(after != LB_Space);
4895
4896             /* Here we have a space followed by a non-space.  Mostly this is a
4897              * case of LB18: "Break after spaces".  But there are complications
4898              * as the handling of spaces is somewhat tricky.  They are in a
4899              * number of rules, which have to be applied in priority order, but
4900              * something earlier in the string can cause a rule to be skipped
4901              * and a lower priority rule invoked.  A prime example is LB7 which
4902              * says don't break before a space.  But rule LB8 (lower priority)
4903              * says that the first break opportunity after a ZW is after any
4904              * span of spaces immediately after it.  If a ZW comes before a SP
4905              * in the input, rule LB8 applies, and not LB7.  Other such rules
4906              * involve combining marks which are rules 9 and 10, but they may
4907              * override higher priority rules if they come earlier in the
4908              * string.  Since we're doing random access into the middle of the
4909              * string, we have to look for rules that should get applied based
4910              * on both string position and priority.  Combining marks do not
4911              * attach to either ZW nor SP, so we don't have to consider them
4912              * until later.
4913              *
4914              * To check for LB8, we have to find the first non-space character
4915              * before this span of spaces */
4916             do {
4917                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4918             }
4919             while (prev == LB_Space);
4920
4921             /* LB8 Break before any character following a zero-width space,
4922              * even if one or more spaces intervene.
4923              *      ZW SP* ÷
4924              * So if we have a ZW just before this span, and to get here this
4925              * is the final space in the span. */
4926             if (prev == LB_ZWSpace) {
4927                 return TRUE;
4928             }
4929
4930             /* Here, not ZW SP+.  There are several rules that have higher
4931              * priority than LB18 and can be resolved now, as they don't depend
4932              * on anything earlier in the string (except ZW, which we have
4933              * already handled).  One of these rules is LB11 Do not break
4934              * before Word joiner, but we have specially encoded that in the
4935              * lookup table so it is caught by the single test below which
4936              * catches the other ones. */
4937             if (LB_table[LB_Space][after] - LB_SP_foo
4938                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4939             {
4940                 return FALSE;
4941             }
4942
4943             /* If we get here, we have to XXX consider combining marks. */
4944             if (prev == LB_Combining_Mark) {
4945
4946                 /* What happens with these depends on the character they
4947                  * follow.  */
4948                 do {
4949                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4950                 }
4951                 while (prev == LB_Combining_Mark);
4952
4953                 /* Most times these attach to and inherit the characteristics
4954                  * of that character, but not always, and when not, they are to
4955                  * be treated as AL by rule LB10. */
4956                 if (! LB_CM_ATTACHES_TO(prev)) {
4957                     prev = LB_Alphabetic;
4958                 }
4959             }
4960
4961             /* Here, we have the character preceding the span of spaces all set
4962              * up.  We follow LB18: "Break after spaces" unless the table shows
4963              * that is overriden */
4964             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4965
4966         case LB_CM_ZWJ_foo:
4967
4968             /* We don't know how to treat the CM except by looking at the first
4969              * non-CM character preceding it.  ZWJ is treated as CM */
4970             do {
4971                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4972             }
4973             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4974
4975             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4976              * attatches to it, then it inherits the behavior of 'prev'.  If it
4977              * doesn't attach, it is to be treated as an AL */
4978             if (! LB_CM_ATTACHES_TO(prev)) {
4979                 prev = LB_Alphabetic;
4980             }
4981
4982             goto redo;
4983
4984         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4985         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4986
4987             /* LB21a Don't break after Hebrew + Hyphen.
4988              * HL (HY | BA) × */
4989
4990             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4991                                                           == LB_Hebrew_Letter)
4992             {
4993                 return FALSE;
4994             }
4995
4996             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4997
4998         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4999         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5000
5001             /* LB25a (PR | PO) × ( OP | HY )? NU */
5002             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5003                 return FALSE;
5004             }
5005
5006             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5007                                                                 == LB_BREAKABLE;
5008
5009         case LB_SY_or_IS_then_various + LB_BREAKABLE:
5010         case LB_SY_or_IS_then_various + LB_NOBREAK:
5011         {
5012             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5013
5014             LB_enum temp = prev;
5015             do {
5016                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5017             }
5018             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5019             if (temp == LB_Numeric) {
5020                 return FALSE;
5021             }
5022
5023             return LB_table[prev][after] - LB_SY_or_IS_then_various
5024                                                                == LB_BREAKABLE;
5025         }
5026
5027         case LB_various_then_PO_or_PR + LB_BREAKABLE:
5028         case LB_various_then_PO_or_PR + LB_NOBREAK:
5029         {
5030             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5031
5032             LB_enum temp = prev;
5033             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5034             {
5035                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5036             }
5037             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5038                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5039             }
5040             if (temp == LB_Numeric) {
5041                 return FALSE;
5042             }
5043             return LB_various_then_PO_or_PR;
5044         }
5045
5046         case LB_RI_then_RI + LB_NOBREAK:
5047         case LB_RI_then_RI + LB_BREAKABLE:
5048             {
5049                 int RI_count = 1;
5050
5051                 /* LB30a Break between two regional indicator symbols if and
5052                  * only if there are an even number of regional indicators
5053                  * preceding the position of the break.
5054                  *
5055                  *    sot (RI RI)* RI × RI
5056                  *  [^RI] (RI RI)* RI × RI */
5057
5058                 while (backup_one_LB(strbeg,
5059                                      &temp_pos,
5060                                      utf8_target) == LB_Regional_Indicator)
5061                 {
5062                     RI_count++;
5063                 }
5064
5065                 return RI_count % 2 == 0;
5066             }
5067
5068         default:
5069             break;
5070     }
5071
5072 #ifdef DEBUGGING
5073     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5074                                   before, after, LB_table[before][after]);
5075     assert(0);
5076 #endif
5077     return TRUE;
5078 }
5079
5080 STATIC LB_enum
5081 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5082 {
5083     LB_enum lb;
5084
5085     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5086
5087     if (*curpos >= strend) {
5088         return LB_EDGE;
5089     }
5090
5091     if (utf8_target) {
5092         *curpos += UTF8SKIP(*curpos);
5093         if (*curpos >= strend) {
5094             return LB_EDGE;
5095         }
5096         lb = getLB_VAL_UTF8(*curpos, strend);
5097     }
5098     else {
5099         (*curpos)++;
5100         if (*curpos >= strend) {
5101             return LB_EDGE;
5102         }
5103         lb = getLB_VAL_CP(**curpos);
5104     }
5105
5106     return lb;
5107 }
5108
5109 STATIC LB_enum
5110 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5111 {
5112     LB_enum lb;
5113
5114     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5115
5116     if (*curpos < strbeg) {
5117         return LB_EDGE;
5118     }
5119
5120     if (utf8_target) {
5121         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5122         U8 * prev_prev_char_pos;
5123
5124         if (! prev_char_pos) {
5125             return LB_EDGE;
5126         }
5127
5128         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5129             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5130             *curpos = prev_char_pos;
5131             prev_char_pos = prev_prev_char_pos;
5132         }
5133         else {
5134             *curpos = (U8 *) strbeg;
5135             return LB_EDGE;
5136         }
5137     }
5138     else {
5139         if (*curpos - 2 < strbeg) {
5140             *curpos = (U8 *) strbeg;
5141             return LB_EDGE;
5142         }
5143         (*curpos)--;
5144         lb = getLB_VAL_CP(*(*curpos - 1));
5145     }
5146
5147     return lb;
5148 }
5149
5150 STATIC bool
5151 S_isSB(pTHX_ SB_enum before,
5152              SB_enum after,
5153              const U8 * const strbeg,
5154              const U8 * const curpos,
5155              const U8 * const strend,
5156              const bool utf8_target)
5157 {
5158     /* returns a boolean indicating if there is a Sentence Boundary Break
5159      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
5160
5161     U8 * lpos = (U8 *) curpos;
5162     bool has_para_sep = FALSE;
5163     bool has_sp = FALSE;
5164
5165     PERL_ARGS_ASSERT_ISSB;
5166
5167     /* Break at the start and end of text.
5168         SB1.  sot  ÷
5169         SB2.  ÷  eot
5170       But unstated in Unicode is don't break if the text is empty */
5171     if (before == SB_EDGE || after == SB_EDGE) {
5172         return before != after;
5173     }
5174
5175     /* SB 3: Do not break within CRLF. */
5176     if (before == SB_CR && after == SB_LF) {
5177         return FALSE;
5178     }
5179
5180     /* Break after paragraph separators.  CR and LF are considered
5181      * so because Unicode views text as like word processing text where there
5182      * are no newlines except between paragraphs, and the word processor takes
5183      * care of wrapping without there being hard line-breaks in the text *./
5184        SB4.  Sep | CR | LF  ÷ */
5185     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5186         return TRUE;
5187     }
5188
5189     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5190      * (See Section 6.2, Replacing Ignore Rules.)
5191         SB5.  X (Extend | Format)*  →  X */
5192     if (after == SB_Extend || after == SB_Format) {
5193
5194         /* Implied is that the these characters attach to everything
5195          * immediately prior to them except for those separator-type
5196          * characters.  And the rules earlier have already handled the case
5197          * when one of those immediately precedes the extend char */
5198         return FALSE;
5199     }
5200
5201     if (before == SB_Extend || before == SB_Format) {
5202         U8 * temp_pos = lpos;
5203         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5204         if (   backup != SB_EDGE
5205             && backup != SB_Sep
5206             && backup != SB_CR
5207             && backup != SB_LF)
5208         {
5209             before = backup;
5210             lpos = temp_pos;
5211         }
5212
5213         /* Here, both 'before' and 'backup' are these types; implied is that we
5214          * don't break between them */
5215         if (backup == SB_Extend || backup == SB_Format) {
5216             return FALSE;
5217         }
5218     }
5219
5220     /* Do not break after ambiguous terminators like period, if they are
5221      * immediately followed by a number or lowercase letter, if they are
5222      * between uppercase letters, if the first following letter (optionally
5223      * after certain punctuation) is lowercase, or if they are followed by
5224      * "continuation" punctuation such as comma, colon, or semicolon. For
5225      * example, a period may be an abbreviation or numeric period, and thus may
5226      * not mark the end of a sentence.
5227
5228      * SB6. ATerm  ×  Numeric */
5229     if (before == SB_ATerm && after == SB_Numeric) {
5230         return FALSE;
5231     }
5232
5233     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5234     if (before == SB_ATerm && after == SB_Upper) {
5235         U8 * temp_pos = lpos;
5236         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5237         if (backup == SB_Upper || backup == SB_Lower) {
5238             return FALSE;
5239         }
5240     }
5241
5242     /* The remaining rules that aren't the final one, all require an STerm or
5243      * an ATerm after having backed up over some Close* Sp*, and in one case an
5244      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5245      * So do that backup now, setting flags if either Sp or a paragraph
5246      * separator are found */
5247
5248     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5249         has_para_sep = TRUE;
5250         before = backup_one_SB(strbeg, &lpos, utf8_target);
5251     }
5252
5253     if (before == SB_Sp) {
5254         has_sp = TRUE;
5255         do {
5256             before = backup_one_SB(strbeg, &lpos, utf8_target);
5257         }
5258         while (before == SB_Sp);
5259     }
5260
5261     while (before == SB_Close) {
5262         before = backup_one_SB(strbeg, &lpos, utf8_target);
5263     }
5264
5265     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5266      * most cases an STerm */
5267     if (before == SB_STerm || before == SB_ATerm) {
5268
5269         /* So, here the lhs matches
5270          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5271          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5272          * The rules that apply here are:
5273          *
5274          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5275                                            | LF | STerm | ATerm) )* Lower
5276            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5277            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5278            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5279            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5280          */
5281
5282         /* And all but SB11 forbid having seen a paragraph separator */
5283         if (! has_para_sep) {
5284             if (before == SB_ATerm) {          /* SB8 */
5285                 U8 * rpos = (U8 *) curpos;
5286                 SB_enum later = after;
5287
5288                 while (    later != SB_OLetter
5289                         && later != SB_Upper
5290                         && later != SB_Lower
5291                         && later != SB_Sep
5292                         && later != SB_CR
5293                         && later != SB_LF
5294                         && later != SB_STerm
5295                         && later != SB_ATerm
5296                         && later != SB_EDGE)
5297                 {
5298                     later = advance_one_SB(&rpos, strend, utf8_target);
5299                 }
5300                 if (later == SB_Lower) {
5301                     return FALSE;
5302                 }
5303             }
5304
5305             if (   after == SB_SContinue    /* SB8a */
5306                 || after == SB_STerm
5307                 || after == SB_ATerm)
5308             {
5309                 return FALSE;
5310             }
5311
5312             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5313                 if (   after == SB_Close
5314                     || after == SB_Sp
5315                     || after == SB_Sep
5316                     || after == SB_CR
5317                     || after == SB_LF)
5318                 {
5319                     return FALSE;
5320                 }
5321             }
5322
5323             /* SB10.  This and SB9 could probably be combined some way, but khw
5324              * has decided to follow the Unicode rule book precisely for
5325              * simplified maintenance */
5326             if (   after == SB_Sp
5327                 || after == SB_Sep
5328                 || after == SB_CR
5329                 || after == SB_LF)
5330             {
5331                 return FALSE;
5332             }
5333         }
5334
5335         /* SB11.  */
5336         return TRUE;
5337     }
5338
5339     /* Otherwise, do not break.
5340     SB12.  Any  ×  Any */
5341
5342     return FALSE;
5343 }
5344
5345 STATIC SB_enum
5346 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5347 {
5348     SB_enum sb;
5349
5350     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5351
5352     if (*curpos >= strend) {
5353         return SB_EDGE;
5354     }
5355
5356     if (utf8_target) {
5357         do {
5358             *curpos += UTF8SKIP(*curpos);
5359             if (*curpos >= strend) {
5360                 return SB_EDGE;
5361             }
5362             sb = getSB_VAL_UTF8(*curpos, strend);
5363         } while (sb == SB_Extend || sb == SB_Format);
5364     }
5365     else {
5366         do {
5367             (*curpos)++;
5368             if (*curpos >= strend) {
5369                 return SB_EDGE;
5370             }
5371             sb = getSB_VAL_CP(**curpos);
5372         } while (sb == SB_Extend || sb == SB_Format);
5373     }
5374
5375     return sb;
5376 }
5377
5378 STATIC SB_enum
5379 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5380 {
5381     SB_enum sb;
5382
5383     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5384
5385     if (*curpos < strbeg) {
5386         return SB_EDGE;
5387     }
5388
5389     if (utf8_target) {
5390         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5391         if (! prev_char_pos) {
5392             return SB_EDGE;
5393         }
5394
5395         /* Back up over Extend and Format.  curpos is always just to the right
5396          * of the characater whose value we are getting */
5397         do {
5398             U8 * prev_prev_char_pos;
5399             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5400                                                                       strbeg)))
5401             {
5402                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5403                 *curpos = prev_char_pos;
5404                 prev_char_pos = prev_prev_char_pos;
5405             }
5406             else {
5407                 *curpos = (U8 *) strbeg;
5408                 return SB_EDGE;
5409             }
5410         } while (sb == SB_Extend || sb == SB_Format);
5411     }
5412     else {
5413         do {
5414             if (*curpos - 2 < strbeg) {
5415                 *curpos = (U8 *) strbeg;
5416                 return SB_EDGE;
5417             }
5418             (*curpos)--;
5419             sb = getSB_VAL_CP(*(*curpos - 1));
5420         } while (sb == SB_Extend || sb == SB_Format);
5421     }
5422
5423     return sb;
5424 }
5425
5426 STATIC bool
5427 S_isWB(pTHX_ WB_enum previous,
5428              WB_enum before,
5429              WB_enum after,
5430              const U8 * const strbeg,
5431              const U8 * const curpos,
5432              const U8 * const strend,
5433              const bool utf8_target)
5434 {
5435     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5436      *  a Unicode word break, using their published algorithm, but tailored for
5437      *  Perl by treating spans of white space as one unit.  Context may be
5438      *  needed to make this determination.  If the value for the character
5439      *  before 'before' is known, it is passed as 'previous'; otherwise that
5440      *  should be set to WB_UNKNOWN.  The other input parameters give the
5441      *  boundaries and current position in the matching of the string.  That
5442      *  is, 'curpos' marks the position where the character whose wb value is
5443      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5444
5445     U8 * before_pos = (U8 *) curpos;
5446     U8 * after_pos = (U8 *) curpos;
5447     WB_enum prev = before;
5448     WB_enum next;
5449
5450     PERL_ARGS_ASSERT_ISWB;
5451
5452     /* Rule numbers in the comments below are as of Unicode 9.0 */
5453
5454   redo:
5455     before = prev;
5456     switch (WB_table[before][after]) {
5457         case WB_BREAKABLE:
5458             return TRUE;
5459
5460         case WB_NOBREAK:
5461             return FALSE;
5462
5463         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5464             next = advance_one_WB(&after_pos, strend, utf8_target,
5465                                  FALSE /* Don't skip Extend nor Format */ );
5466             /* A space immediately preceeding an Extend or Format is attached
5467              * to by them, and hence gets separated from previous spaces.
5468              * Otherwise don't break between horizontal white space */
5469             return next == WB_Extend || next == WB_Format;
5470
5471         /* WB4 Ignore Format and Extend characters, except when they appear at
5472          * the beginning of a region of text.  This code currently isn't
5473          * general purpose, but it works as the rules are currently and likely
5474          * to be laid out.  The reason it works is that when 'they appear at
5475          * the beginning of a region of text', the rule is to break before
5476          * them, just like any other character.  Therefore, the default rule
5477          * applies and we don't have to look in more depth.  Should this ever
5478          * change, we would have to have 2 'case' statements, like in the rules
5479          * below, and backup a single character (not spacing over the extend
5480          * ones) and then see if that is one of the region-end characters and
5481          * go from there */
5482         case WB_Ex_or_FO_or_ZWJ_then_foo:
5483             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5484             goto redo;
5485
5486         case WB_DQ_then_HL + WB_BREAKABLE:
5487         case WB_DQ_then_HL + WB_NOBREAK:
5488
5489             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5490
5491             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5492                                                             == WB_Hebrew_Letter)
5493             {
5494                 return FALSE;
5495             }
5496
5497              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5498
5499         case WB_HL_then_DQ + WB_BREAKABLE:
5500         case WB_HL_then_DQ + WB_NOBREAK:
5501
5502             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5503
5504             if (advance_one_WB(&after_pos, strend, utf8_target,
5505                                        TRUE /* Do skip Extend and Format */ )
5506                                                             == WB_Hebrew_Letter)
5507             {
5508                 return FALSE;
5509             }
5510
5511             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5512
5513         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5514         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5515
5516             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5517              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5518
5519             next = advance_one_WB(&after_pos, strend, utf8_target,
5520                                        TRUE /* Do skip Extend and Format */ );
5521
5522             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5523             {
5524                 return FALSE;
5525             }
5526
5527             return WB_table[before][after]
5528                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5529
5530         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5531         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5532
5533             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5534              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5535
5536             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5537             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5538             {
5539                 return FALSE;
5540             }
5541
5542             return WB_table[before][after]
5543                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5544
5545         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5546         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5547
5548             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5549              * */
5550
5551             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5552                                                             == WB_Numeric)
5553             {
5554                 return FALSE;
5555             }
5556
5557             return WB_table[before][after]
5558                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5559
5560         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5561         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5562
5563             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5564
5565             if (advance_one_WB(&after_pos, strend, utf8_target,
5566                                        TRUE /* Do skip Extend and Format */ )
5567                                                             == WB_Numeric)
5568             {
5569                 return FALSE;
5570             }
5571
5572             return WB_table[before][after]
5573                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5574
5575         case WB_RI_then_RI + WB_NOBREAK:
5576         case WB_RI_then_RI + WB_BREAKABLE:
5577             {
5578                 int RI_count = 1;
5579
5580                 /* Do not break within emoji flag sequences. That is, do not
5581                  * break between regional indicator (RI) symbols if there is an
5582                  * odd number of RI characters before the potential break
5583                  * point.
5584                  *
5585                  * WB15   sot (RI RI)* RI × RI
5586                  * WB16 [^RI] (RI RI)* RI × RI */
5587
5588                 while (backup_one_WB(&previous,
5589                                      strbeg,
5590                                      &before_pos,
5591                                      utf8_target) == WB_Regional_Indicator)
5592                 {
5593                     RI_count++;
5594                 }
5595
5596                 return RI_count % 2 != 1;
5597             }
5598
5599         default:
5600             break;
5601     }
5602
5603 #ifdef DEBUGGING
5604     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5605                                   before, after, WB_table[before][after]);
5606     assert(0);
5607 #endif
5608     return TRUE;
5609 }
5610
5611 STATIC WB_enum
5612 S_advance_one_WB(pTHX_ U8 ** curpos,
5613                        const U8 * const strend,
5614                        const bool utf8_target,
5615                        const bool skip_Extend_Format)
5616 {
5617     WB_enum wb;
5618
5619     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5620
5621     if (*curpos >= strend) {
5622         return WB_EDGE;
5623     }
5624
5625     if (utf8_target) {
5626
5627         /* Advance over Extend and Format */
5628         do {
5629             *curpos += UTF8SKIP(*curpos);
5630             if (*curpos >= strend) {
5631                 return WB_EDGE;
5632             }
5633             wb = getWB_VAL_UTF8(*curpos, strend);
5634         } while (    skip_Extend_Format
5635                  && (wb == WB_Extend || wb == WB_Format));
5636     }
5637     else {
5638         do {
5639             (*curpos)++;
5640             if (*curpos >= strend) {
5641                 return WB_EDGE;
5642             }
5643             wb = getWB_VAL_CP(**curpos);
5644         } while (    skip_Extend_Format
5645                  && (wb == WB_Extend || wb == WB_Format));
5646     }
5647
5648     return wb;
5649 }
5650
5651 STATIC WB_enum
5652 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5653 {
5654     WB_enum wb;
5655
5656     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5657
5658     /* If we know what the previous character's break value is, don't have
5659         * to look it up */
5660     if (*previous != WB_UNKNOWN) {
5661         wb = *previous;
5662
5663         /* But we need to move backwards by one */
5664         if (utf8_target) {
5665             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5666             if (! *curpos) {
5667                 *previous = WB_EDGE;
5668                 *curpos = (U8 *) strbeg;
5669             }
5670             else {
5671                 *previous = WB_UNKNOWN;
5672             }
5673         }
5674         else {
5675             (*curpos)--;
5676             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5677         }
5678
5679         /* And we always back up over these three types */
5680         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5681             return wb;
5682         }
5683     }
5684
5685     if (*curpos < strbeg) {
5686         return WB_EDGE;
5687     }
5688
5689     if (utf8_target) {
5690         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5691         if (! prev_char_pos) {
5692             return WB_EDGE;
5693         }
5694
5695         /* Back up over Extend and Format.  curpos is always just to the right
5696          * of the characater whose value we are getting */
5697         do {
5698             U8 * prev_prev_char_pos;
5699             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5700                                                    -1,
5701                                                    strbeg)))
5702             {
5703                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5704                 *curpos = prev_char_pos;
5705                 prev_char_pos = prev_prev_char_pos;
5706             }
5707             else {
5708                 *curpos = (U8 *) strbeg;
5709                 return WB_EDGE;
5710             }
5711         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5712     }
5713     else {
5714         do {
5715             if (*curpos - 2 < strbeg) {
5716                 *curpos = (U8 *) strbeg;
5717                 return WB_EDGE;
5718             }
5719             (*curpos)--;
5720             wb = getWB_VAL_CP(*(*curpos - 1));
5721         } while (wb == WB_Extend || wb == WB_Format);
5722     }
5723
5724     return wb;
5725 }
5726
5727 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5728 (                                                           \
5729     (   ( st )                                         ) && \
5730     (   ( st )->u.eval.close_paren                     ) && \
5731     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5732 )
5733
5734 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5735 (                                                           \
5736     (   ( st )                                         ) && \
5737     (   ( st )->u.eval.close_paren                     ) && \
5738     (   ( expr )                                       ) && \
5739     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5740 )
5741
5742
5743 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5744     (st)->u.eval.close_paren = ( (expr) + 1 )
5745
5746 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5747     (st)->u.eval.close_paren = 0
5748
5749 /* returns -1 on failure, $+[0] on success */
5750 STATIC SSize_t
5751 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5752 {
5753     dVAR;
5754     const bool utf8_target = reginfo->is_utf8_target;
5755     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5756     REGEXP *rex_sv = reginfo->prog;
5757     regexp *rex = ReANY(rex_sv);
5758     RXi_GET_DECL(rex,rexi);
5759     /* the current state. This is a cached copy of PL_regmatch_state */
5760     regmatch_state *st;
5761     /* cache heavy used fields of st in registers */
5762     regnode *scan;
5763     regnode *next;
5764     U32 n = 0;  /* general value; init to avoid compiler warning */
5765     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5766     SSize_t endref = 0; /* offset of end of backref when ln is start */
5767     char *locinput = startpos;
5768     char *pushinput; /* where to continue after a PUSH */
5769     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5770
5771     bool result = 0;        /* return value of S_regmatch */
5772     U32 depth = 0;            /* depth of backtrack stack */
5773     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5774     const U32 max_nochange_depth =
5775         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5776         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5777     regmatch_state *yes_state = NULL; /* state to pop to on success of
5778                                                             subpattern */
5779     /* mark_state piggy backs on the yes_state logic so that when we unwind
5780        the stack on success we can update the mark_state as we go */
5781     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5782     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5783     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5784     U32 state_num;
5785     bool no_final = 0;      /* prevent failure from backtracking? */
5786     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5787     char *startpoint = locinput;
5788     SV *popmark = NULL;     /* are we looking for a mark? */
5789     SV *sv_commit = NULL;   /* last mark name seen in failure */
5790     SV *sv_yes_mark = NULL; /* last mark name we have seen
5791                                during a successful match */
5792     U32 lastopen = 0;       /* last open we saw */
5793     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5794     SV* const oreplsv = GvSVn(PL_replgv);
5795     /* these three flags are set by various ops to signal information to
5796      * the very next op. They have a useful lifetime of exactly one loop
5797      * iteration, and are not preserved or restored by state pushes/pops
5798      */
5799     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5800     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5801     int logical = 0;        /* the following EVAL is:
5802                                 0: (?{...})
5803                                 1: (?(?{...})X|Y)
5804                                 2: (??{...})
5805                                or the following IFMATCH/UNLESSM is:
5806                                 false: plain (?=foo)
5807                                 true:  used as a condition: (?(?=foo))
5808                             */
5809     PAD* last_pad = NULL;
5810     dMULTICALL;
5811     U8 gimme = G_SCALAR;
5812     CV *caller_cv = NULL;       /* who called us */
5813     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5814     U32 maxopenparen = 0;       /* max '(' index seen so far */
5815     int to_complement;  /* Invert the result? */
5816     _char_class_number classnum;
5817     bool is_utf8_pat = reginfo->is_utf8_pat;
5818     bool match = FALSE;
5819     I32 orig_savestack_ix = PL_savestack_ix;
5820     U8 * script_run_begin = NULL;
5821
5822 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5823 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5824 #  define SOLARIS_BAD_OPTIMIZER
5825     const U32 *pl_charclass_dup = PL_charclass;
5826 #  define PL_charclass pl_charclass_dup
5827 #endif
5828
5829 #ifdef DEBUGGING
5830     GET_RE_DEBUG_FLAGS_DECL;
5831 #endif
5832
5833     /* protect against undef(*^R) */
5834     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5835
5836     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5837     multicall_oldcatch = 0;
5838     PERL_UNUSED_VAR(multicall_cop);
5839
5840     PERL_ARGS_ASSERT_REGMATCH;
5841
5842     st = PL_regmatch_state;
5843
5844     /* Note that nextchr is a byte even in UTF */
5845     SET_nextchr;
5846     scan = prog;
5847
5848     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5849             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5850             Perl_re_printf( aTHX_ "regmatch start\n" );
5851     }));
5852
5853     while (scan != NULL) {
5854         next = scan + NEXT_OFF(scan);
5855         if (next == scan)
5856             next = NULL;
5857         state_num = OP(scan);
5858
5859       reenter_switch:
5860         DEBUG_EXECUTE_r(
5861             if (state_num <= REGNODE_MAX) {
5862                 SV * const prop = sv_newmortal();
5863                 regnode *rnext = regnext(scan);
5864
5865                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5866                 regprop(rex, prop, scan, reginfo, NULL);
5867                 Perl_re_printf( aTHX_
5868                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5869                     INDENT_CHARS(depth), "",
5870                     (IV)(scan - rexi->program),
5871                     SvPVX_const(prop),
5872                     (PL_regkind[OP(scan)] == END || !rnext) ?
5873                         0 : (IV)(rnext - rexi->program));
5874             }
5875         );
5876
5877         to_complement = 0;
5878
5879         SET_nextchr;
5880         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5881
5882         switch (state_num) {
5883         case SBOL: /*  /^../ and /\A../  */
5884             if (locinput == reginfo->strbeg)
5885                 break;
5886             sayNO;
5887
5888         case MBOL: /*  /^../m  */
5889             if (locinput == reginfo->strbeg ||
5890                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5891             {
5892                 break;
5893             }
5894             sayNO;
5895
5896         case GPOS: /*  \G  */
5897             if (locinput == reginfo->ganch)
5898                 break;
5899             sayNO;
5900
5901         case KEEPS: /*   \K  */
5902             /* update the startpoint */
5903             st->u.keeper.val = rex->offs[0].start;
5904             rex->offs[0].start = locinput - reginfo->strbeg;
5905             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5906             NOT_REACHED; /* NOTREACHED */
5907
5908         case KEEPS_next_fail:
5909             /* rollback the start point change */
5910             rex->offs[0].start = st->u.keeper.val;
5911             sayNO_SILENT;
5912             NOT_REACHED; /* NOTREACHED */
5913
5914         case MEOL: /* /..$/m  */
5915             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5916                 sayNO;
5917             break;
5918
5919         case SEOL: /* /..$/  */
5920             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5921                 sayNO;
5922             if (reginfo->strend - locinput > 1)
5923                 sayNO;
5924             break;
5925
5926         case EOS: /*  \z  */
5927             if (!NEXTCHR_IS_EOS)
5928                 sayNO;
5929             break;
5930
5931         case SANY: /*  /./s  */
5932             if (NEXTCHR_IS_EOS)
5933                 sayNO;
5934             goto increment_locinput;
5935
5936         case REG_ANY: /*  /./  */
5937             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5938                 sayNO;
5939             goto increment_locinput;
5940
5941
5942 #undef  ST
5943 #define ST st->u.trie
5944         case TRIEC: /* (ab|cd) with known charclass */
5945             /* In this case the charclass data is available inline so
5946                we can fail fast without a lot of extra overhead.
5947              */
5948             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5949                 DEBUG_EXECUTE_r(
5950                     Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
5951                               depth, PL_colors[4], PL_colors[5])
5952                 );
5953                 sayNO_SILENT;
5954                 NOT_REACHED; /* NOTREACHED */
5955             }
5956             /* FALLTHROUGH */
5957         case TRIE:  /* (ab|cd)  */
5958             /* the basic plan of execution of the trie is:
5959              * At the beginning, run though all the states, and
5960              * find the longest-matching word. Also remember the position
5961              * of the shortest matching word. For example, this pattern:
5962              *    1  2 3 4    5
5963              *    ab|a|x|abcd|abc
5964              * when matched against the string "abcde", will generate
5965              * accept states for all words except 3, with the longest
5966              * matching word being 4, and the shortest being 2 (with
5967              * the position being after char 1 of the string).
5968              *
5969              * Then for each matching word, in word order (i.e. 1,2,4,5),
5970              * we run the remainder of the pattern; on each try setting
5971              * the current position to the character following the word,
5972              * returning to try the next word on failure.
5973              *
5974              * We avoid having to build a list of words at runtime by
5975              * using a compile-time structure, wordinfo[].prev, which
5976              * gives, for each word, the previous accepting word (if any).
5977              * In the case above it would contain the mappings 1->2, 2->0,
5978              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5979              * the longest word (4 above), a list of all words, by
5980              * following the list of prev pointers; this gives us the
5981              * unordered list 4,5,1,2. Then given the current word we have
5982              * just tried, we can go through the list and find the
5983              * next-biggest word to try (so if we just failed on word 2,
5984              * the next in the list is 4).
5985              *
5986              * Since at runtime we don't record the matching position in
5987              * the string for each word, we have to work that out for
5988              * each word we're about to process. The wordinfo table holds
5989              * the character length of each word; given that we recorded
5990              * at the start: the position of the shortest word and its
5991              * length in chars, we just need to move the pointer the
5992              * difference between the two char lengths. Depending on
5993              * Unicode status and folding, that's cheap or expensive.
5994              *
5995              * This algorithm is optimised for the case where are only a
5996              * small number of accept states, i.e. 0,1, or maybe 2.
5997              * With lots of accepts states, and having to try all of them,
5998              * it becomes quadratic on number of accept states to find all
5999              * the next words.
6000              */
6001
6002             {
6003                 /* what type of TRIE am I? (utf8 makes this contextual) */
6004                 DECL_TRIE_TYPE(scan);
6005
6006                 /* what trie are we using right now */
6007                 reg_trie_data * const trie
6008                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6009                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6010                 U32 state = trie->startstate;
6011
6012                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6013                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6014                     if (utf8_target
6015                         && nextchr >= 0 /* guard against negative EOS value in nextchr */
6016                         && UTF8_IS_ABOVE_LATIN1(nextchr)
6017                         && scan->flags == EXACTL)
6018                     {
6019                         /* We only output for EXACTL, as we let the folder
6020                          * output this message for EXACTFLU8 to avoid
6021                          * duplication */
6022                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6023                                                                reginfo->strend);
6024                     }
6025                 }
6026                 if (   trie->bitmap
6027                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
6028                 {
6029                     if (trie->states[ state ].wordnum) {
6030                          DEBUG_EXECUTE_r(
6031                             Perl_re_exec_indentf( aTHX_  "%smatched empty string...%s\n",
6032                                           depth, PL_colors[4], PL_colors[5])
6033                         );
6034                         if (!trie->jump)
6035                             break;
6036                     } else {
6037                         DEBUG_EXECUTE_r(
6038                             Perl_re_exec_indentf( aTHX_  "%sfailed to match trie start class...%s\n",
6039                                           depth, PL_colors[4], PL_colors[5])
6040                         );
6041                         sayNO_SILENT;
6042                    }
6043                 }
6044
6045             {
6046                 U8 *uc = ( U8* )locinput;
6047
6048                 STRLEN len = 0;
6049                 STRLEN foldlen = 0;
6050                 U8 *uscan = (U8*)NULL;
6051                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6052                 U32 charcount = 0; /* how many input chars we have matched */
6053                 U32 accepted = 0; /* have we seen any accepting states? */
6054
6055                 ST.jump = trie->jump;
6056                 ST.me = scan;
6057                 ST.firstpos = NULL;
6058                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6059                 ST.nextword = 0;
6060
6061                 /* fully traverse the TRIE; note the position of the
6062                    shortest accept state and the wordnum of the longest
6063                    accept state */
6064
6065                 while ( state && uc <= (U8*)(reginfo->strend) ) {
6066                     U32 base = trie->states[ state ].trans.base;
6067                     UV uvc = 0;
6068                     U16 charid = 0;
6069                     U16 wordnum;
6070                     wordnum = trie->states[ state ].wordnum;
6071
6072                     if (wordnum) { /* it's an accept state */
6073                         if (!accepted) {
6074                             accepted = 1;
6075                             /* record first match position */
6076                             if (ST.longfold) {
6077                                 ST.firstpos = (U8*)locinput;
6078                                 ST.firstchars = 0;
6079                             }
6080                             else {
6081                                 ST.firstpos = uc;
6082                                 ST.firstchars = charcount;
6083                             }
6084                         }
6085                         if (!ST.nextword || wordnum < ST.nextword)
6086                             ST.nextword = wordnum;
6087                         ST.topword = wordnum;
6088                     }
6089
6090                     DEBUG_TRIE_EXECUTE_r({
6091                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6092                                 /* HERE */
6093                                 PerlIO_printf( Perl_debug_log,
6094                                     "%*s%sState: %4" UVxf " Accepted: %c ",
6095                                     INDENT_CHARS(depth), "", PL_colors[4],
6096                                     (UV)state, (accepted ? 'Y' : 'N'));
6097                     });
6098
6099                     /* read a char and goto next state */
6100                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6101                         I32 offset;
6102                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6103                                              uscan, len, uvc, charid, foldlen,
6104                                              foldbuf, uniflags);
6105                         charcount++;
6106                         if (foldlen>0)
6107                             ST.longfold = TRUE;
6108                         if (charid &&
6109                              ( ((offset =
6110                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6111
6112                              && ((U32)offset < trie->lasttrans)
6113                              && trie->trans[offset].check == state)
6114                         {
6115                             state = trie->trans[offset].next;
6116                         }
6117                         else {
6118                             state = 0;
6119                         }
6120                         uc += len;
6121
6122                     }
6123                     else {
6124                         state = 0;
6125                     }
6126                     DEBUG_TRIE_EXECUTE_r(
6127                         Perl_re_printf( aTHX_
6128                             "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6129                             charid, uvc, (UV)state, PL_colors[5] );
6130                     );
6131                 }
6132                 if (!accepted)
6133                    sayNO;
6134
6135                 /* calculate total number of accept states */
6136                 {
6137                     U16 w = ST.topword;
6138                     accepted = 0;
6139                     while (w) {
6140                         w = trie->wordinfo[w].prev;
6141                         accepted++;
6142                     }
6143                     ST.accepted = accepted;
6144                 }
6145
6146                 DEBUG_EXECUTE_r(
6147                     Perl_re_exec_indentf( aTHX_  "%sgot %" IVdf " possible matches%s\n",
6148                         depth,
6149                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6150                 );
6151                 goto trie_first_try; /* jump into the fail handler */
6152             }}
6153             NOT_REACHED; /* NOTREACHED */
6154
6155         case TRIE_next_fail: /* we failed - try next alternative */
6156         {
6157             U8 *uc;
6158             if ( ST.jump ) {
6159                 /* undo any captures done in the tail part of a branch,
6160                  * e.g.
6161                  *    /(?:X(.)(.)|Y(.)).../
6162                  * where the trie just matches X then calls out to do the
6163                  * rest of the branch */
6164                 REGCP_UNWIND(ST.cp);
6165                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6166             }
6167             if (!--ST.accepted) {
6168                 DEBUG_EXECUTE_r({
6169                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6170                         depth,
6171                         PL_colors[4],
6172                         PL_colors[5] );
6173                 });
6174                 sayNO_SILENT;
6175             }
6176             {
6177                 /* Find next-highest word to process.  Note that this code
6178                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6179                 U16 min = 0;
6180                 U16 word;
6181                 U16 const nextword = ST.nextword;
6182                 reg_trie_wordinfo * const wordinfo
6183                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6184                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6185                     if (word > nextword && (!min || word < min))
6186                         min = word;
6187                 }
6188                 ST.nextword = min;
6189             }
6190
6191           trie_first_try:
6192             if (do_cutgroup) {
6193                 do_cutgroup = 0;
6194                 no_final = 0;
6195             }
6196
6197             if ( ST.jump ) {
6198                 ST.lastparen = rex->lastparen;
6199                 ST.lastcloseparen = rex->lastcloseparen;
6200                 REGCP_SET(ST.cp);
6201             }
6202
6203             /* find start char of end of current word */
6204             {
6205                 U32 chars; /* how many chars to skip */
6206                 reg_trie_data * const trie
6207                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6208
6209                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6210                             >=  ST.firstchars);
6211                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6212                             - ST.firstchars;
6213                 uc = ST.firstpos;
6214
6215                 if (ST.longfold) {
6216                     /* the hard option - fold each char in turn and find
6217                      * its folded length (which may be different */
6218                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6219                     STRLEN foldlen;
6220                     STRLEN len;
6221                     UV uvc;
6222                     U8 *uscan;
6223
6224                     while (chars) {
6225                         if (utf8_target) {
6226                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6227                                                     uniflags);
6228                             uc += len;
6229                         }
6230                         else {
6231                             uvc = *uc;
6232                             uc++;
6233                         }
6234                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6235                         uscan = foldbuf;
6236                         while (foldlen) {
6237                             if (!--chars)
6238                                 break;
6239                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
6240                                             uniflags);
6241                             uscan += len;
6242                             foldlen -= len;
6243                         }
6244                     }
6245                 }
6246                 else {
6247                     if (utf8_target)
6248                         while (chars--)
6249                             uc += UTF8SKIP(uc);
6250                     else
6251                         uc += chars;
6252                 }
6253             }
6254
6255             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6256                             ? ST.jump[ST.nextword]
6257                             : NEXT_OFF(ST.me));
6258
6259             DEBUG_EXECUTE_r({
6260                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6261                     depth,
6262                     PL_colors[4],
6263                     ST.nextword,
6264                     PL_colors[5]
6265                     );
6266             });
6267
6268             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6269                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6270                 NOT_REACHED; /* NOTREACHED */
6271             }
6272             /* only one choice left - just continue */
6273             DEBUG_EXECUTE_r({
6274                 AV *const trie_words
6275                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6276                 SV ** const tmp = trie_words
6277                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6278                 SV *sv= tmp ? sv_newmortal() : NULL;
6279
6280                 Perl_re_exec_indentf( aTHX_  "%sonly one match left, short-circuiting: #%d <%s>%s\n",
6281                     depth, PL_colors[4],
6282                     ST.nextword,
6283                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6284                             PL_colors[0], PL_colors[1],
6285                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6286                         )
6287                     : "not compiled under -Dr",
6288                     PL_colors[5] );
6289             });
6290
6291             locinput = (char*)uc;
6292             continue; /* execute rest of RE */
6293             /* NOTREACHED */
6294         }
6295 #undef  ST
6296
6297         case EXACTL:             /*  /abc/l       */
6298             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6299
6300             /* Complete checking would involve going through every character
6301              * matched by the string to see if any is above latin1.  But the
6302              * comparision otherwise might very well be a fast assembly
6303              * language routine, and I (khw) don't think slowing things down
6304              * just to check for this warning is worth it.  So this just checks
6305              * the first character */
6306             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6307                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6308             }
6309             /* FALLTHROUGH */
6310         case EXACT: {            /*  /abc/        */
6311             char *s = STRING(scan);
6312             ln = STR_LEN(scan);
6313             if (utf8_target != is_utf8_pat) {
6314                 /* The target and the pattern have differing utf8ness. */
6315                 char *l = locinput;
6316                 const char * const e = s + ln;
6317
6318                 if (utf8_target) {
6319                     /* The target is utf8, the pattern is not utf8.
6320                      * Above-Latin1 code points can't match the pattern;
6321                      * invariants match exactly, and the other Latin1 ones need
6322                      * to be downgraded to a single byte in order to do the
6323                      * comparison.  (If we could be confident that the target
6324                      * is not malformed, this could be refactored to have fewer
6325                      * tests by just assuming that if the first bytes match, it
6326                      * is an invariant, but there are tests in the test suite
6327                      * dealing with (??{...}) which violate this) */
6328                     while (s < e) {
6329                         if (l >= reginfo->strend
6330                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6331                         {
6332                             sayNO;
6333                         }
6334                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6335                             if (*l != *s) {
6336                                 sayNO;
6337                             }
6338                             l++;
6339                         }
6340                         else {
6341                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6342                             {
6343                                 sayNO;
6344                             }
6345                             l += 2;
6346                         }
6347                         s++;
6348                     }
6349                 }
6350                 else {
6351                     /* The target is not utf8, the pattern is utf8. */
6352                     while (s < e) {
6353                         if (l >= reginfo->strend
6354                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6355                         {
6356                             sayNO;
6357                         }
6358                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6359                             if (*s != *l) {
6360                                 sayNO;
6361                             }
6362                             s++;
6363                         }
6364                         else {
6365                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6366                             {
6367                                 sayNO;
6368                             }
6369                             s += 2;
6370                         }
6371                         l++;
6372                     }
6373                 }
6374                 locinput = l;
6375             }
6376             else {
6377                 /* The target and the pattern have the same utf8ness. */
6378                 /* Inline the first character, for speed. */
6379                 if (reginfo->strend - locinput < ln
6380                     || UCHARAT(s) != nextchr
6381                     || (ln > 1 && memNE(s, locinput, ln)))
6382                 {
6383                     sayNO;
6384                 }
6385                 locinput += ln;
6386             }
6387             break;
6388             }
6389
6390         case EXACTFL: {          /*  /abc/il      */
6391             re_fold_t folder;
6392             const U8 * fold_array;
6393             const char * s;
6394             U32 fold_utf8_flags;
6395
6396             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6397             folder = foldEQ_locale;
6398             fold_array = PL_fold_locale;
6399             fold_utf8_flags = FOLDEQ_LOCALE;
6400             goto do_exactf;
6401
6402         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6403                                       is effectively /u; hence to match, target
6404                                       must be UTF-8. */
6405             if (! utf8_target) {
6406                 sayNO;
6407             }
6408             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6409                                              | FOLDEQ_S1_FOLDS_SANE;
6410             folder = foldEQ_latin1;
6411             fold_array = PL_fold_latin1;
6412             goto do_exactf;
6413
6414         case EXACTFU_SS:         /*  /\x{df}/iu   */
6415         case EXACTFU:            /*  /abc/iu      */
6416             folder = foldEQ_latin1;
6417             fold_array = PL_fold_latin1;
6418             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6419             goto do_exactf;
6420
6421         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6422                                    patterns */
6423             assert(! is_utf8_pat);
6424             /* FALLTHROUGH */
6425         case EXACTFAA:            /*  /abc/iaa     */
6426             folder = foldEQ_latin1;
6427             fold_array = PL_fold_latin1;
6428             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6429             goto do_exactf;
6430
6431         case EXACTF:             /*  /abc/i    This node only generated for
6432                                                non-utf8 patterns */
6433             assert(! is_utf8_pat);
6434             folder = foldEQ;
6435             fold_array = PL_fold;
6436             fold_utf8_flags = 0;
6437
6438           do_exactf:
6439             s = STRING(scan);
6440             ln = STR_LEN(scan);
6441
6442             if (utf8_target
6443                 || is_utf8_pat
6444                 || state_num == EXACTFU_SS
6445                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6446             {
6447               /* Either target or the pattern are utf8, or has the issue where
6448                * the fold lengths may differ. */
6449                 const char * const l = locinput;
6450                 char *e = reginfo->strend;
6451
6452                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6453                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6454                 {
6455                     sayNO;
6456                 }
6457                 locinput = e;
6458                 break;
6459             }
6460
6461             /* Neither the target nor the pattern are utf8 */
6462             if (UCHARAT(s) != nextchr
6463                 && !NEXTCHR_IS_EOS
6464                 && UCHARAT(s) != fold_array[nextchr])
6465             {
6466                 sayNO;
6467             }
6468             if (reginfo->strend - locinput < ln)
6469                 sayNO;
6470             if (ln > 1 && ! folder(s, locinput, ln))
6471                 sayNO;
6472             locinput += ln;
6473             break;
6474         }
6475
6476         case NBOUNDL: /*  /\B/l  */
6477             to_complement = 1;
6478             /* FALLTHROUGH */
6479
6480         case BOUNDL:  /*  /\b/l  */
6481         {
6482             bool b1, b2;
6483             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6484
6485             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6486                 if (! IN_UTF8_CTYPE_LOCALE) {
6487                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6488                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6489                 }
6490                 goto boundu;
6491             }
6492
6493             if (utf8_target) {
6494                 if (locinput == reginfo->strbeg)
6495                     b1 = isWORDCHAR_LC('\n');
6496                 else {
6497                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6498                                                         (U8*)(reginfo->strbeg)),
6499                                                  (U8*)(reginfo->strend));
6500                 }
6501                 b2 = (NEXTCHR_IS_EOS)
6502                     ? isWORDCHAR_LC('\n')
6503                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6504                                               (U8*) reginfo->strend);
6505             }
6506             else { /* Here the string isn't utf8 */
6507                 b1 = (locinput == reginfo->strbeg)
6508                      ? isWORDCHAR_LC('\n')
6509                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6510                 b2 = (NEXTCHR_IS_EOS)
6511                     ? isWORDCHAR_LC('\n')
6512                     : isWORDCHAR_LC(nextchr);
6513             }
6514             if (to_complement ^ (b1 == b2)) {
6515                 sayNO;
6516             }
6517             break;
6518         }
6519
6520         case NBOUND:  /*  /\B/   */
6521             to_complement = 1;
6522             /* FALLTHROUGH */
6523
6524         case BOUND:   /*  /\b/   */
6525             if (utf8_target) {
6526                 goto bound_utf8;
6527             }
6528             goto bound_ascii_match_only;
6529
6530         case NBOUNDA: /*  /\B/a  */
6531             to_complement = 1;
6532             /* FALLTHROUGH */
6533
6534         case BOUNDA:  /*  /\b/a  */
6535         {
6536             bool b1, b2;
6537
6538           bound_ascii_match_only:
6539             /* Here the string isn't utf8, or is utf8 and only ascii characters
6540              * are to match \w.  In the latter case looking at the byte just
6541              * prior to the current one may be just the final byte of a
6542              * multi-byte character.  This is ok.  There are two cases:
6543              * 1) it is a single byte character, and then the test is doing
6544              *    just what it's supposed to.
6545              * 2) it is a multi-byte character, in which case the final byte is
6546              *    never mistakable for ASCII, and so the test will say it is
6547              *    not a word character, which is the correct answer. */
6548             b1 = (locinput == reginfo->strbeg)
6549                  ? isWORDCHAR_A('\n')
6550                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6551             b2 = (NEXTCHR_IS_EOS)
6552                 ? isWORDCHAR_A('\n')
6553                 : isWORDCHAR_A(nextchr);
6554             if (to_complement ^ (b1 == b2)) {
6555                 sayNO;
6556             }
6557             break;
6558         }
6559
6560         case NBOUNDU: /*  /\B/u  */
6561             to_complement = 1;
6562             /* FALLTHROUGH */
6563
6564         case BOUNDU:  /*  /\b/u  */
6565
6566           boundu:
6567             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6568                 match = FALSE;
6569             }
6570             else if (utf8_target) {
6571               bound_utf8:
6572                 switch((bound_type) FLAGS(scan)) {
6573                     case TRADITIONAL_BOUND:
6574                     {
6575                         bool b1, b2;
6576                         b1 = (locinput == reginfo->strbeg)
6577                              ? 0 /* isWORDCHAR_L1('\n') */
6578                              : isWORDCHAR_utf8_safe(
6579                                                reghop3((U8*)locinput,
6580                                                        -1,
6581                                                        (U8*)(reginfo->strbeg)),
6582                                                     (U8*) reginfo->strend);
6583                         b2 = (NEXTCHR_IS_EOS)
6584                             ? 0 /* isWORDCHAR_L1('\n') */
6585                             : isWORDCHAR_utf8_safe((U8*)locinput,
6586                                                    (U8*) reginfo->strend);
6587                         match = cBOOL(b1 != b2);
6588                         break;
6589                     }
6590                     case GCB_BOUND:
6591                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6592                             match = TRUE; /* GCB always matches at begin and
6593                                              end */
6594                         }
6595                         else {
6596                             /* Find the gcb values of previous and current
6597                              * chars, then see if is a break point */
6598                             match = isGCB(getGCB_VAL_UTF8(
6599                                                 reghop3((U8*)locinput,
6600                                                         -1,
6601                                                         (U8*)(reginfo->strbeg)),
6602                                                 (U8*) reginfo->strend),
6603                                           getGCB_VAL_UTF8((U8*) locinput,
6604                                                         (U8*) reginfo->strend),
6605                                           (U8*) reginfo->strbeg,
6606                                           (U8*) locinput,
6607                                           utf8_target);
6608                         }
6609                         break;
6610
6611                     case LB_BOUND:
6612                         if (locinput == reginfo->strbeg) {
6613                             match = FALSE;
6614                         }
6615                         else if (NEXTCHR_IS_EOS) {
6616                             match = TRUE;
6617                         }
6618                         else {
6619                             match = isLB(getLB_VAL_UTF8(
6620                                                 reghop3((U8*)locinput,
6621                                                         -1,
6622                                                         (U8*)(reginfo->strbeg)),
6623                                                 (U8*) reginfo->strend),
6624                                           getLB_VAL_UTF8((U8*) locinput,
6625                                                         (U8*) reginfo->strend),
6626                                           (U8*) reginfo->strbeg,
6627                                           (U8*) locinput,
6628                                           (U8*) reginfo->strend,
6629                                           utf8_target);
6630                         }
6631                         break;
6632
6633                     case SB_BOUND: /* Always matches at begin and end */
6634                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6635                             match = TRUE;
6636                         }
6637                         else {
6638                             match = isSB(getSB_VAL_UTF8(
6639                                                 reghop3((U8*)locinput,
6640                                                         -1,
6641                                                         (U8*)(reginfo->strbeg)),
6642                                                 (U8*) reginfo->strend),
6643                                           getSB_VAL_UTF8((U8*) locinput,
6644                                                         (U8*) reginfo->strend),
6645                                           (U8*) reginfo->strbeg,
6646                                           (U8*) locinput,
6647                                           (U8*) reginfo->strend,
6648                                           utf8_target);
6649                         }
6650                         break;
6651
6652                     case WB_BOUND:
6653                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6654                             match = TRUE;
6655                         }
6656                         else {
6657                             match = isWB(WB_UNKNOWN,
6658                                          getWB_VAL_UTF8(
6659                                                 reghop3((U8*)locinput,
6660                                                         -1,
6661                                                         (U8*)(reginfo->strbeg)),
6662                                                 (U8*) reginfo->strend),
6663                                           getWB_VAL_UTF8((U8*) locinput,
6664                                                         (U8*) reginfo->strend),
6665                                           (U8*) reginfo->strbeg,
6666                                           (U8*) locinput,
6667                                           (U8*) reginfo->strend,
6668                                           utf8_target);
6669                         }
6670                         break;
6671                 }
6672             }
6673             else {  /* Not utf8 target */
6674                 switch((bound_type) FLAGS(scan)) {
6675                     case TRADITIONAL_BOUND:
6676                     {
6677                         bool b1, b2;
6678                         b1 = (locinput == reginfo->strbeg)
6679                             ? 0 /* isWORDCHAR_L1('\n') */
6680                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6681                         b2 = (NEXTCHR_IS_EOS)
6682                             ? 0 /* isWORDCHAR_L1('\n') */
6683                             : isWORDCHAR_L1(nextchr);
6684                         match = cBOOL(b1 != b2);
6685                         break;
6686                     }
6687
6688                     case GCB_BOUND:
6689                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6690                             match = TRUE; /* GCB always matches at begin and
6691                                              end */
6692                         }
6693                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6694                                    range */
6695                             match =    UCHARAT(locinput - 1) != '\r'
6696                                     || UCHARAT(locinput) != '\n';
6697                         }
6698                         break;
6699
6700                     case LB_BOUND:
6701                         if (locinput == reginfo->strbeg) {
6702                             match = FALSE;
6703                         }
6704                         else if (NEXTCHR_IS_EOS) {
6705                             match = TRUE;
6706                         }
6707                         else {
6708                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6709                                          getLB_VAL_CP(UCHARAT(locinput)),
6710                                          (U8*) reginfo->strbeg,
6711                                          (U8*) locinput,
6712                                          (U8*) reginfo->strend,
6713                                          utf8_target);
6714                         }
6715                         break;
6716
6717                     case SB_BOUND: /* Always matches at begin and end */
6718                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6719                             match = TRUE;
6720                         }
6721                         else {
6722                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6723                                          getSB_VAL_CP(UCHARAT(locinput)),
6724                                          (U8*) reginfo->strbeg,
6725                                          (U8*) locinput,
6726                                          (U8*) reginfo->strend,
6727                                          utf8_target);
6728                         }
6729                         break;
6730
6731                     case WB_BOUND:
6732                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6733                             match = TRUE;
6734                         }
6735                         else {
6736                             match = isWB(WB_UNKNOWN,
6737                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6738                                          getWB_VAL_CP(UCHARAT(locinput)),
6739                                          (U8*) reginfo->strbeg,
6740                                          (U8*) locinput,
6741                                          (U8*) reginfo->strend,
6742                                          utf8_target);
6743                         }
6744                         break;
6745                 }
6746             }
6747
6748             if (to_complement ^ ! match) {
6749                 sayNO;
6750             }
6751             break;
6752
6753         case ANYOFL:  /*  /[abc]/l      */
6754             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6755
6756             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6757             {
6758               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6759             }
6760             /* FALLTHROUGH */
6761         case ANYOFD:  /*   /[abc]/d       */
6762         case ANYOF:  /*   /[abc]/       */
6763             if (NEXTCHR_IS_EOS)
6764                 sayNO;
6765             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6766                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6767                                                                    utf8_target))
6768                     sayNO;
6769                 locinput += UTF8SKIP(locinput);
6770             }
6771             else {
6772                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6773                     sayNO;
6774                 locinput++;
6775             }
6776             break;
6777
6778         case ANYOFM:
6779             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6780                 sayNO;
6781             }
6782             locinput++;
6783             break;
6784
6785         case ASCII:
6786             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6787                 sayNO;
6788             }
6789
6790             locinput++;     /* ASCII is always single byte */
6791             break;
6792
6793         case NASCII:
6794             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6795                 sayNO;
6796             }
6797
6798             goto increment_locinput;
6799             break;
6800
6801         /* The argument (FLAGS) to all the POSIX node types is the class number
6802          * */
6803
6804         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6805             to_complement = 1;
6806             /* FALLTHROUGH */
6807
6808         case POSIXL:    /* \w or [:punct:] etc. under /l */
6809             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6810             if (NEXTCHR_IS_EOS)
6811                 sayNO;
6812
6813             /* Use isFOO_lc() for characters within Latin1.  (Note that
6814              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6815              * wouldn't be invariant) */
6816             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6817                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6818                     sayNO;
6819                 }
6820
6821                 locinput++;
6822                 break;
6823             }
6824
6825             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6826                 /* An above Latin-1 code point, or malformed */
6827                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6828                                                        reginfo->strend);
6829                 goto utf8_posix_above_latin1;
6830             }
6831
6832             /* Here is a UTF-8 variant code point below 256 and the target is
6833              * UTF-8 */
6834             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6835                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6836                                             *(locinput + 1))))))
6837             {
6838                 sayNO;
6839             }
6840
6841             goto increment_locinput;
6842
6843         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6844             to_complement = 1;
6845             /* FALLTHROUGH */
6846
6847         case POSIXD:    /* \w or [:punct:] etc. under /d */
6848             if (utf8_target) {
6849                 goto utf8_posix;
6850             }
6851             goto posixa;
6852
6853         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6854
6855             if (NEXTCHR_IS_EOS) {
6856                 sayNO;
6857             }
6858
6859             /* All UTF-8 variants match */
6860             if (! UTF8_IS_INVARIANT(nextchr)) {
6861                 goto increment_locinput;
6862             }
6863
6864             to_complement = 1;
6865             goto join_nposixa;
6866
6867         case POSIXA:    /* \w or [:punct:] etc. under /a */
6868
6869           posixa:
6870             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6871              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6872              * character is a single byte */
6873
6874             if (NEXTCHR_IS_EOS) {
6875                 sayNO;
6876             }
6877
6878           join_nposixa:
6879
6880             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6881                                                                 FLAGS(scan)))))
6882             {
6883                 sayNO;
6884             }
6885
6886             /* Here we are either not in utf8, or we matched a utf8-invariant,
6887              * so the next char is the next byte */
6888             locinput++;
6889             break;
6890
6891         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6892             to_complement = 1;
6893             /* FALLTHROUGH */
6894
6895         case POSIXU:    /* \w or [:punct:] etc. under /u */
6896           utf8_posix:
6897             if (NEXTCHR_IS_EOS) {
6898                 sayNO;
6899             }
6900
6901             /* Use _generic_isCC() for characters within Latin1.  (Note that
6902              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6903              * wouldn't be invariant) */
6904             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6905                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6906                                                            FLAGS(scan)))))
6907                 {
6908                     sayNO;
6909                 }
6910                 locinput++;
6911             }
6912             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6913                 if (! (to_complement
6914                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6915                                                                *(locinput + 1)),
6916                                              FLAGS(scan)))))
6917                 {
6918                     sayNO;
6919                 }
6920                 locinput += 2;
6921             }
6922             else {  /* Handle above Latin-1 code points */
6923               utf8_posix_above_latin1:
6924                 classnum = (_char_class_number) FLAGS(scan);
6925                 if (classnum < _FIRST_NON_SWASH_CC) {
6926
6927                     /* Here, uses a swash to find such code points.  Load if if
6928                      * not done already */
6929                     if (! PL_utf8_swash_ptrs[classnum]) {
6930                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6931                         PL_utf8_swash_ptrs[classnum]
6932                                 = _core_swash_init("utf8",
6933                                         "",
6934                                         &PL_sv_undef, 1, 0,
6935                                         PL_XPosix_ptrs[classnum], &flags);
6936                     }
6937                     if (! (to_complement
6938                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6939                                                (U8 *) locinput, TRUE))))
6940                     {
6941                         sayNO;
6942                     }
6943                 }
6944                 else {  /* Here, uses macros to find above Latin-1 code points */
6945                     switch (classnum) {
6946                         case _CC_ENUM_SPACE:
6947                             if (! (to_complement
6948                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
6949                             {
6950                                 sayNO;
6951                             }
6952                             break;
6953                         case _CC_ENUM_BLANK:
6954                             if (! (to_complement
6955                                             ^ cBOOL(is_HORIZWS_high(locinput))))
6956                             {
6957                                 sayNO;
6958                             }
6959                             break;
6960                         case _CC_ENUM_XDIGIT:
6961                             if (! (to_complement
6962                                             ^ cBOOL(is_XDIGIT_high(locinput))))
6963                             {
6964                                 sayNO;
6965                             }
6966                             break;
6967                         case _CC_ENUM_VERTSPACE:
6968                             if (! (to_complement
6969                                             ^ cBOOL(is_VERTWS_high(locinput))))
6970                             {
6971                                 sayNO;
6972                             }
6973                             break;
6974                         default:    /* The rest, e.g. [:cntrl:], can't match
6975                                        above Latin1 */
6976                             if (! to_complement) {
6977                                 sayNO;
6978                             }
6979                             break;
6980                     }
6981                 }
6982                 locinput += UTF8SKIP(locinput);
6983             }
6984             break;
6985
6986         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6987                        a Unicode extended Grapheme Cluster */
6988             if (NEXTCHR_IS_EOS)
6989                 sayNO;
6990             if  (! utf8_target) {
6991
6992                 /* Match either CR LF  or '.', as all the other possibilities
6993                  * require utf8 */
6994                 locinput++;         /* Match the . or CR */
6995                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6996                                        match the LF */
6997                     && locinput < reginfo->strend
6998                     && UCHARAT(locinput) == '\n')
6999                 {
7000                     locinput++;
7001                 }
7002             }
7003             else {
7004
7005                 /* Get the gcb type for the current character */
7006                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7007                                                        (U8*) reginfo->strend);
7008
7009                 /* Then scan through the input until we get to the first
7010                  * character whose type is supposed to be a gcb with the
7011                  * current character.  (There is always a break at the
7012                  * end-of-input) */
7013                 locinput += UTF8SKIP(locinput);
7014                 while (locinput < reginfo->strend) {
7015                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7016                                                          (U8*) reginfo->strend);
7017                     if (isGCB(prev_gcb, cur_gcb,
7018                               (U8*) reginfo->strbeg, (U8*) locinput,
7019                               utf8_target))
7020                     {
7021                         break;
7022                     }
7023
7024                     prev_gcb = cur_gcb;
7025                     locinput += UTF8SKIP(locinput);
7026                 }
7027
7028
7029             }
7030             break;
7031
7032         case NREFFL:  /*  /\g{name}/il  */
7033         {   /* The capture buffer cases.  The ones beginning with N for the
7034                named buffers just convert to the equivalent numbered and
7035                pretend they were called as the corresponding numbered buffer
7036                op.  */
7037             /* don't initialize these in the declaration, it makes C++
7038                unhappy */
7039             const char *s;
7040             char type;
7041             re_fold_t folder;
7042             const U8 *fold_array;
7043             UV utf8_fold_flags;
7044
7045             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7046             folder = foldEQ_locale;
7047             fold_array = PL_fold_locale;
7048             type = REFFL;
7049             utf8_fold_flags = FOLDEQ_LOCALE;
7050             goto do_nref;
7051
7052         case NREFFA:  /*  /\g{name}/iaa  */
7053             folder = foldEQ_latin1;
7054             fold_array = PL_fold_latin1;
7055             type = REFFA;
7056             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7057             goto do_nref;
7058
7059         case NREFFU:  /*  /\g{name}/iu  */
7060             folder = foldEQ_latin1;
7061             fold_array = PL_fold_latin1;
7062             type = REFFU;
7063             utf8_fold_flags = 0;
7064             goto do_nref;
7065
7066         case NREFF:  /*  /\g{name}/i  */
7067             folder = foldEQ;
7068             fold_array = PL_fold;
7069             type = REFF;
7070             utf8_fold_flags = 0;
7071             goto do_nref;
7072
7073         case NREF:  /*  /\g{name}/   */
7074             type = REF;
7075             folder = NULL;
7076             fold_array = NULL;
7077             utf8_fold_flags = 0;
7078           do_nref:
7079
7080             /* For the named back references, find the corresponding buffer
7081              * number */
7082             n = reg_check_named_buff_matched(rex,scan);
7083
7084             if ( ! n ) {
7085                 sayNO;
7086             }
7087             goto do_nref_ref_common;
7088
7089         case REFFL:  /*  /\1/il  */
7090             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7091             folder = foldEQ_locale;
7092             fold_array = PL_fold_locale;
7093             utf8_fold_flags = FOLDEQ_LOCALE;
7094             goto do_ref;
7095
7096         case REFFA:  /*  /\1/iaa  */
7097             folder = foldEQ_latin1;
7098             fold_array = PL_fold_latin1;
7099             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7100             goto do_ref;
7101
7102         case REFFU:  /*  /\1/iu  */
7103             folder = foldEQ_latin1;
7104             fold_array = PL_fold_latin1;
7105             utf8_fold_flags = 0;
7106             goto do_ref;
7107
7108         case REFF:  /*  /\1/i  */
7109             folder = foldEQ;
7110             fold_array = PL_fold;
7111             utf8_fold_flags = 0;
7112             goto do_ref;
7113
7114         case REF:  /*  /\1/    */
7115             folder = NULL;
7116             fold_array = NULL;
7117             utf8_fold_flags = 0;
7118
7119           do_ref:
7120             type = OP(scan);
7121             n = ARG(scan);  /* which paren pair */
7122
7123           do_nref_ref_common:
7124             ln = rex->offs[n].start;
7125             endref = rex->offs[n].end;
7126             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7127             if (rex->lastparen < n || ln == -1 || endref == -1)
7128                 sayNO;                  /* Do not match unless seen CLOSEn. */
7129             if (ln == endref)
7130                 break;
7131
7132             s = reginfo->strbeg + ln;
7133             if (type != REF     /* REF can do byte comparison */
7134                 && (utf8_target || type == REFFU || type == REFFL))
7135             {
7136                 char * limit = reginfo->strend;
7137
7138                 /* This call case insensitively compares the entire buffer
7139                     * at s, with the current input starting at locinput, but
7140                     * not going off the end given by reginfo->strend, and
7141                     * returns in <limit> upon success, how much of the
7142                     * current input was matched */
7143                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7144                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7145                 {
7146                     sayNO;
7147                 }
7148                 locinput = limit;
7149                 break;
7150             }
7151
7152             /* Not utf8:  Inline the first character, for speed. */
7153             if (!NEXTCHR_IS_EOS &&
7154                 UCHARAT(s) != nextchr &&
7155                 (type == REF ||
7156                  UCHARAT(s) != fold_array[nextchr]))
7157                 sayNO;
7158             ln = endref - ln;
7159             if (locinput + ln > reginfo->strend)
7160                 sayNO;
7161             if (ln > 1 && (type == REF
7162                            ? memNE(s, locinput, ln)
7163                            : ! folder(s, locinput, ln)))
7164                 sayNO;
7165             locinput += ln;
7166             break;
7167         }
7168
7169         case NOTHING: /* null op; e.g. the 'nothing' following
7170                        * the '*' in m{(a+|b)*}' */
7171             break;
7172         case TAIL: /* placeholder while compiling (A|B|C) */
7173             break;
7174
7175 #undef  ST
7176 #define ST st->u.eval
7177 #define CUR_EVAL cur_eval->u.eval
7178
7179         {
7180             SV *ret;
7181             REGEXP *re_sv;
7182             regexp *re;
7183             regexp_internal *rei;
7184             regnode *startpoint;
7185             U32 arg;
7186
7187         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7188             arg= (U32)ARG(scan);
7189             if (cur_eval && cur_eval->locinput == locinput) {
7190                 if ( ++nochange_depth > max_nochange_depth )
7191                     Perl_croak(aTHX_
7192                         "Pattern subroutine nesting without pos change"
7193                         " exceeded limit in regex");
7194             } else {
7195                 nochange_depth = 0;
7196             }
7197             re_sv = rex_sv;
7198             re = rex;
7199             rei = rexi;
7200             startpoint = scan + ARG2L(scan);
7201             EVAL_CLOSE_PAREN_SET( st, arg );
7202             /* Detect infinite recursion
7203              *
7204              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7205              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7206              * So we track the position in the string we are at each time
7207              * we recurse and if we try to enter the same routine twice from
7208              * the same position we throw an error.
7209              */
7210             if ( rex->recurse_locinput[arg] == locinput ) {
7211                 /* FIXME: we should show the regop that is failing as part
7212                  * of the error message. */
7213                 Perl_croak(aTHX_ "Infinite recursion in regex");
7214             } else {
7215                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7216                 rex->recurse_locinput[arg]= locinput;
7217
7218                 DEBUG_r({
7219                     GET_RE_DEBUG_FLAGS_DECL;
7220                     DEBUG_STACK_r({
7221                         Perl_re_exec_indentf( aTHX_
7222                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7223                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7224                         );
7225                     });
7226                 });
7227             }
7228
7229             /* Save all the positions seen so far. */
7230             ST.cp = regcppush(rex, 0, maxopenparen);
7231             REGCP_SET(ST.lastcp);
7232
7233             /* and then jump to the code we share with EVAL */
7234             goto eval_recurse_doit;
7235             /* NOTREACHED */
7236
7237         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7238             if (cur_eval && cur_eval->locinput==locinput) {
7239                 if ( ++nochange_depth > max_nochange_depth )
7240                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7241             } else {
7242                 nochange_depth = 0;
7243             }
7244             {
7245                 /* execute the code in the {...} */
7246
7247                 dSP;
7248                 IV before;
7249                 OP * const oop = PL_op;
7250                 COP * const ocurcop = PL_curcop;
7251                 OP *nop;
7252                 CV *newcv;
7253
7254                 /* save *all* paren positions */
7255                 regcppush(rex, 0, maxopenparen);
7256                 REGCP_SET(ST.lastcp);
7257
7258                 if (!caller_cv)
7259                     caller_cv = find_runcv(NULL);
7260
7261                 n = ARG(scan);
7262
7263                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7264                     newcv = (ReANY(
7265                                     (REGEXP*)(rexi->data->data[n])
7266                             ))->qr_anoncv;
7267                     nop = (OP*)rexi->data->data[n+1];
7268                 }
7269                 else if (rexi->data->what[n] == 'l') { /* literal code */
7270                     newcv = caller_cv;
7271                     nop = (OP*)rexi->data->data[n];
7272                     assert(CvDEPTH(newcv));
7273                 }
7274                 else {
7275                     /* literal with own CV */
7276                     assert(rexi->data->what[n] == 'L');
7277                     newcv = rex->qr_anoncv;
7278                     nop = (OP*)rexi->data->data[n];
7279                 }
7280
7281                 /* Some notes about MULTICALL and the context and save stacks.
7282                  *
7283                  * In something like
7284                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7285                  * since codeblocks don't introduce a new scope (so that
7286                  * local() etc accumulate), at the end of a successful
7287                  * match there will be a SAVEt_CLEARSV on the savestack
7288                  * for each of $x, $y, $z. If the three code blocks above
7289                  * happen to have come from different CVs (e.g. via
7290                  * embedded qr//s), then we must ensure that during any
7291                  * savestack unwinding, PL_comppad always points to the
7292                  * right pad at each moment. We achieve this by
7293                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7294                  * there is a change of pad.
7295                  * In theory whenever we call a code block, we should
7296                  * push a CXt_SUB context, then pop it on return from
7297                  * that code block. This causes a bit of an issue in that
7298                  * normally popping a context also clears the savestack
7299                  * back to cx->blk_oldsaveix, but here we specifically
7300                  * don't want to clear the save stack on exit from the
7301                  * code block.
7302                  * Also for efficiency we don't want to keep pushing and
7303                  * popping the single SUB context as we backtrack etc.
7304                  * So instead, we push a single context the first time
7305                  * we need, it, then hang onto it until the end of this
7306                  * function. Whenever we encounter a new code block, we
7307                  * update the CV etc if that's changed. During the times
7308                  * in this function where we're not executing a code
7309                  * block, having the SUB context still there is a bit
7310                  * naughty - but we hope that no-one notices.
7311                  * When the SUB context is initially pushed, we fake up
7312                  * cx->blk_oldsaveix to be as if we'd pushed this context
7313                  * on first entry to S_regmatch rather than at some random
7314                  * point during the regexe execution. That way if we
7315                  * croak, popping the context stack will ensure that
7316                  * *everything* SAVEd by this function is undone and then
7317                  * the context popped, rather than e.g., popping the
7318                  * context (and restoring the original PL_comppad) then
7319                  * popping more of the savestack and restoring a bad
7320                  * PL_comppad.
7321                  */
7322
7323                 /* If this is the first EVAL, push a MULTICALL. On
7324                  * subsequent calls, if we're executing a different CV, or
7325                  * if PL_comppad has got messed up from backtracking
7326                  * through SAVECOMPPADs, then refresh the context.
7327                  */
7328                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7329                 {
7330                     U8 flags = (CXp_SUB_RE |
7331                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7332                     SAVECOMPPAD();
7333                     if (last_pushed_cv) {
7334                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7335                     }
7336                     else {
7337                         PUSH_MULTICALL_FLAGS(newcv, flags);
7338                     }
7339                     /* see notes above */
7340                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7341
7342                     last_pushed_cv = newcv;
7343                 }
7344                 else {
7345                     /* these assignments are just to silence compiler
7346                      * warnings */
7347                     multicall_cop = NULL;
7348                 }
7349                 last_pad = PL_comppad;
7350
7351                 /* the initial nextstate you would normally execute
7352                  * at the start of an eval (which would cause error
7353                  * messages to come from the eval), may be optimised
7354                  * away from the execution path in the regex code blocks;
7355                  * so manually set PL_curcop to it initially */
7356                 {
7357                     OP *o = cUNOPx(nop)->op_first;
7358                     assert(o->op_type == OP_NULL);
7359                     if (o->op_targ == OP_SCOPE) {
7360                         o = cUNOPo->op_first;
7361                     }
7362                     else {
7363                         assert(o->op_targ == OP_LEAVE);
7364                         o = cUNOPo->op_first;
7365                         assert(o->op_type == OP_ENTER);
7366                         o = OpSIBLING(o);
7367                     }
7368
7369                     if (o->op_type != OP_STUB) {
7370                         assert(    o->op_type == OP_NEXTSTATE
7371                                 || o->op_type == OP_DBSTATE
7372                                 || (o->op_type == OP_NULL
7373                                     &&  (  o->op_targ == OP_NEXTSTATE
7374                                         || o->op_targ == OP_DBSTATE
7375                                         )
7376                                     )
7377                         );
7378                         PL_curcop = (COP*)o;
7379                     }
7380                 }
7381                 nop = nop->op_next;
7382
7383                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7384                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7385
7386                 rex->offs[0].end = locinput - reginfo->strbeg;
7387                 if (reginfo->info_aux_eval->pos_magic)
7388                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7389                                   reginfo->sv, reginfo->strbeg,
7390                                   locinput - reginfo->strbeg);
7391
7392                 if (sv_yes_mark) {
7393                     SV *sv_mrk = get_sv("REGMARK", 1);
7394                     sv_setsv(sv_mrk, sv_yes_mark);
7395                 }
7396
7397                 /* we don't use MULTICALL here as we want to call the
7398                  * first op of the block of interest, rather than the
7399                  * first op of the sub. Also, we don't want to free
7400                  * the savestack frame */
7401                 before = (IV)(SP-PL_stack_base);
7402                 PL_op = nop;
7403                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7404                 SPAGAIN;
7405                 if ((IV)(SP-PL_stack_base) == before)
7406                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7407                 else {
7408                     ret = POPs;
7409                     PUTBACK;
7410                 }
7411
7412                 /* before restoring everything, evaluate the returned
7413                  * value, so that 'uninit' warnings don't use the wrong
7414                  * PL_op or pad. Also need to process any magic vars
7415                  * (e.g. $1) *before* parentheses are restored */
7416
7417                 PL_op = NULL;
7418
7419                 re_sv = NULL;
7420                 if (logical == 0)        /*   (?{})/   */
7421                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7422                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7423                     sw = cBOOL(SvTRUE_NN(ret));
7424                     logical = 0;
7425                 }
7426                 else {                   /*  /(??{})  */
7427                     /*  if its overloaded, let the regex compiler handle
7428                      *  it; otherwise extract regex, or stringify  */
7429                     if (SvGMAGICAL(ret))
7430                         ret = sv_mortalcopy(ret);
7431                     if (!SvAMAGIC(ret)) {
7432                         SV *sv = ret;
7433                         if (SvROK(sv))
7434                             sv = SvRV(sv);
7435                         if (SvTYPE(sv) == SVt_REGEXP)
7436                             re_sv = (REGEXP*) sv;
7437                         else if (SvSMAGICAL(ret)) {
7438                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7439                             if (mg)
7440                                 re_sv = (REGEXP *) mg->mg_obj;
7441                         }
7442
7443                         /* force any undef warnings here */
7444                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7445                             ret = sv_mortalcopy(ret);
7446                             (void) SvPV_force_nolen(ret);
7447                         }
7448                     }
7449
7450                 }
7451
7452                 /* *** Note that at this point we don't restore
7453                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7454                  * be used again soon. This is safe as long as nothing
7455                  * in the regexp code uses the pad ! */
7456                 PL_op = oop;
7457                 PL_curcop = ocurcop;
7458                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7459                 PL_curpm_under = PL_curpm;
7460                 PL_curpm = PL_reg_curpm;
7461
7462                 if (logical != 2) {
7463                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7464                     /* NOTREACHED */
7465                 }
7466             }
7467
7468                 /* only /(??{})/  from now on */
7469                 logical = 0;
7470                 {
7471                     /* extract RE object from returned value; compiling if
7472                      * necessary */
7473
7474                     if (re_sv) {
7475                         re_sv = reg_temp_copy(NULL, re_sv);
7476                     }
7477                     else {
7478                         U32 pm_flags = 0;
7479
7480                         if (SvUTF8(ret) && IN_BYTES) {
7481                             /* In use 'bytes': make a copy of the octet
7482                              * sequence, but without the flag on */
7483                             STRLEN len;
7484                             const char *const p = SvPV(ret, len);
7485                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7486                         }
7487                         if (rex->intflags & PREGf_USE_RE_EVAL)
7488                             pm_flags |= PMf_USE_RE_EVAL;
7489
7490                         /* if we got here, it should be an engine which
7491                          * supports compiling code blocks and stuff */
7492                         assert(rex->engine && rex->engine->op_comp);
7493                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7494                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7495                                     rex->engine, NULL, NULL,
7496                                     /* copy /msixn etc to inner pattern */
7497                                     ARG2L(scan),
7498                                     pm_flags);
7499
7500                         if (!(SvFLAGS(ret)
7501                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7502                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7503                             /* This isn't a first class regexp. Instead, it's
7504                                caching a regexp onto an existing, Perl visible
7505                                scalar.  */
7506                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7507                         }
7508                     }
7509                     SAVEFREESV(re_sv);
7510                     re = ReANY(re_sv);
7511                 }
7512                 RXp_MATCH_COPIED_off(re);
7513                 re->subbeg = rex->subbeg;
7514                 re->sublen = rex->sublen;
7515                 re->suboffset = rex->suboffset;
7516                 re->subcoffset = rex->subcoffset;
7517                 re->lastparen = 0;
7518                 re->lastcloseparen = 0;
7519                 rei = RXi_GET(re);
7520                 DEBUG_EXECUTE_r(
7521                     debug_start_match(re_sv, utf8_target, locinput,
7522                                     reginfo->strend, "Matching embedded");
7523                 );
7524                 startpoint = rei->program + 1;
7525                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7526                                              * close_paren only for GOSUB */
7527                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7528                 /* Save all the seen positions so far. */
7529                 ST.cp = regcppush(rex, 0, maxopenparen);
7530                 REGCP_SET(ST.lastcp);
7531                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7532                 maxopenparen = 0;
7533                 /* run the pattern returned from (??{...}) */
7534
7535               eval_recurse_doit: /* Share code with GOSUB below this line
7536                             * At this point we expect the stack context to be
7537                             * set up correctly */
7538
7539                 /* invalidate the S-L poscache. We're now executing a
7540                  * different set of WHILEM ops (and their associated
7541                  * indexes) against the same string, so the bits in the
7542                  * cache are meaningless. Setting maxiter to zero forces
7543                  * the cache to be invalidated and zeroed before reuse.
7544                  * XXX This is too dramatic a measure. Ideally we should
7545                  * save the old cache and restore when running the outer
7546                  * pattern again */
7547                 reginfo->poscache_maxiter = 0;
7548
7549                 /* the new regexp might have a different is_utf8_pat than we do */
7550                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7551
7552                 ST.prev_rex = rex_sv;
7553                 ST.prev_curlyx = cur_curlyx;
7554                 rex_sv = re_sv;
7555                 SET_reg_curpm(rex_sv);
7556                 rex = re;
7557                 rexi = rei;
7558                 cur_curlyx = NULL;
7559                 ST.B = next;
7560                 ST.prev_eval = cur_eval;
7561                 cur_eval = st;
7562                 /* now continue from first node in postoned RE */
7563                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7564                 NOT_REACHED; /* NOTREACHED */
7565         }
7566
7567         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7568             /* note: this is called twice; first after popping B, then A */
7569             DEBUG_STACK_r({
7570                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7571                     depth, cur_eval, ST.prev_eval);
7572             });
7573
7574 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7575             if ( cur_eval && CUR_EVAL.close_paren ) {\
7576                 DEBUG_STACK_r({ \
7577                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7578                         depth,    \
7579                         CUR_EVAL.close_paren - 1,\
7580                         cur_eval, \
7581                         VAL);     \
7582                 });               \
7583                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7584             }
7585
7586             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7587
7588             rex_sv = ST.prev_rex;
7589             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7590             SET_reg_curpm(rex_sv);
7591             rex = ReANY(rex_sv);
7592             rexi = RXi_GET(rex);
7593             {
7594                 /* preserve $^R across LEAVE's. See Bug 121070. */
7595                 SV *save_sv= GvSV(PL_replgv);
7596                 SvREFCNT_inc(save_sv);
7597                 regcpblow(ST.cp); /* LEAVE in disguise */
7598                 sv_setsv(GvSV(PL_replgv), save_sv);
7599                 SvREFCNT_dec(save_sv);
7600             }
7601             cur_eval = ST.prev_eval;
7602             cur_curlyx = ST.prev_curlyx;
7603
7604             /* Invalidate cache. See "invalidate" comment above. */
7605             reginfo->poscache_maxiter = 0;
7606             if ( nochange_depth )
7607                 nochange_depth--;
7608
7609             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7610             sayYES;
7611
7612
7613         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7614             REGCP_UNWIND(ST.lastcp);
7615             sayNO;
7616
7617         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7618             /* note: this is called twice; first after popping B, then A */
7619             DEBUG_STACK_r({
7620                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7621                     depth, cur_eval, ST.prev_eval);
7622             });
7623
7624             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7625
7626             rex_sv = ST.prev_rex;
7627             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7628             SET_reg_curpm(rex_sv);
7629             rex = ReANY(rex_sv);
7630             rexi = RXi_GET(rex);
7631
7632             REGCP_UNWIND(ST.lastcp);
7633             regcppop(rex, &maxopenparen);
7634             cur_eval = ST.prev_eval;
7635             cur_curlyx = ST.prev_curlyx;
7636
7637             /* Invalidate cache. See "invalidate" comment above. */
7638             reginfo->poscache_maxiter = 0;
7639             if ( nochange_depth )
7640                 nochange_depth--;
7641
7642             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7643             sayNO_SILENT;
7644 #undef ST
7645
7646         case OPEN: /*  (  */
7647             n = ARG(scan);  /* which paren pair */
7648             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7649             if (n > maxopenparen)
7650                 maxopenparen = n;
7651             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7652                 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7653                 depth,
7654                 PTR2UV(rex),
7655                 PTR2UV(rex->offs),
7656                 (UV)n,
7657                 (IV)rex->offs[n].start_tmp,
7658                 (UV)maxopenparen
7659             ));
7660             lastopen = n;
7661             break;
7662
7663         case SROPEN: /*  (*SCRIPT_RUN:  */
7664             script_run_begin = (U8 *) locinput;
7665             break;
7666
7667 /* XXX really need to log other places start/end are set too */
7668 #define CLOSE_CAPTURE                                                      \
7669     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7670     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7671     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7672         "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
7673         depth,                                                             \
7674         PTR2UV(rex),                                                       \
7675         PTR2UV(rex->offs),                                                 \
7676         (UV)n,                                                             \
7677         (IV)rex->offs[n].start,                                            \
7678         (IV)rex->offs[n].end                                               \
7679     ))
7680
7681         case CLOSE:  /*  )  */
7682             n = ARG(scan);  /* which paren pair */
7683             CLOSE_CAPTURE;
7684             if (n > rex->lastparen)
7685                 rex->lastparen = n;
7686             rex->lastcloseparen = n;
7687             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7688                 goto fake_end;
7689
7690             break;
7691
7692         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7693
7694             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7695             {
7696                 sayNO;
7697             }
7698
7699             break;
7700
7701
7702         case ACCEPT:  /*  (*ACCEPT)  */
7703             if (scan->flags)
7704                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7705             if (ARG2L(scan)){
7706                 regnode *cursor;
7707                 for (cursor=scan;
7708                      cursor && OP(cursor)!=END;
7709                      cursor=regnext(cursor))
7710                 {
7711                     if ( OP(cursor)==CLOSE ){
7712                         n = ARG(cursor);
7713                         if ( n <= lastopen ) {
7714                             CLOSE_CAPTURE;
7715                             if (n > rex->lastparen)
7716                                 rex->lastparen = n;
7717                             rex->lastcloseparen = n;
7718                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7719                                 break;
7720                         }
7721                     }
7722                 }
7723             }
7724             goto fake_end;
7725             /* NOTREACHED */
7726
7727         case GROUPP:  /*  (?(1))  */
7728             n = ARG(scan);  /* which paren pair */
7729             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7730             break;
7731
7732         case NGROUPP:  /*  (?(<name>))  */
7733             /* reg_check_named_buff_matched returns 0 for no match */
7734             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7735             break;
7736
7737         case INSUBP:   /*  (?(R))  */
7738             n = ARG(scan);
7739             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7740              * of SCAN is already set up as matches a eval.close_paren */
7741             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7742             break;
7743
7744         case DEFINEP:  /*  (?(DEFINE))  */
7745             sw = 0;
7746             break;
7747
7748         case IFTHEN:   /*  (?(cond)A|B)  */
7749             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7750             if (sw)
7751                 next = NEXTOPER(NEXTOPER(scan));
7752             else {
7753                 next = scan + ARG(scan);
7754                 if (OP(next) == IFTHEN) /* Fake one. */
7755                     next = NEXTOPER(NEXTOPER(next));
7756             }
7757             break;
7758
7759         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7760             logical = scan->flags;
7761             break;
7762
7763 /*******************************************************************
7764
7765 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7766 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7767 STAR/PLUS/CURLY/CURLYN are used instead.)
7768
7769 A*B is compiled as <CURLYX><A><WHILEM><B>
7770
7771 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7772 state, which contains the current count, initialised to -1. It also sets
7773 cur_curlyx to point to this state, with any previous value saved in the
7774 state block.
7775
7776 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7777 since the pattern may possibly match zero times (i.e. it's a while {} loop
7778 rather than a do {} while loop).
7779
7780 Each entry to WHILEM represents a successful match of A. The count in the
7781 CURLYX block is incremented, another WHILEM state is pushed, and execution
7782 passes to A or B depending on greediness and the current count.
7783
7784 For example, if matching against the string a1a2a3b (where the aN are
7785 substrings that match /A/), then the match progresses as follows: (the
7786 pushed states are interspersed with the bits of strings matched so far):
7787
7788     <CURLYX cnt=-1>
7789     <CURLYX cnt=0><WHILEM>
7790     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7791     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7792     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7793     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7794
7795 (Contrast this with something like CURLYM, which maintains only a single
7796 backtrack state:
7797
7798     <CURLYM cnt=0> a1
7799     a1 <CURLYM cnt=1> a2
7800     a1 a2 <CURLYM cnt=2> a3
7801     a1 a2 a3 <CURLYM cnt=3> b
7802 )
7803
7804 Each WHILEM state block marks a point to backtrack to upon partial failure
7805 of A or B, and also contains some minor state data related to that
7806 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7807 overall state, such as the count, and pointers to the A and B ops.
7808
7809 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7810 must always point to the *current* CURLYX block, the rules are:
7811
7812 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7813 and set cur_curlyx to point the new block.
7814
7815 When popping the CURLYX block after a successful or unsuccessful match,
7816 restore the previous cur_curlyx.
7817
7818 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7819 to the outer one saved in the CURLYX block.
7820
7821 When popping the WHILEM block after a successful or unsuccessful B match,
7822 restore the previous cur_curlyx.
7823
7824 Here's an example for the pattern (AI* BI)*BO
7825 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7826
7827 cur_
7828 curlyx backtrack stack
7829 ------ ---------------
7830 NULL
7831 CO     <CO prev=NULL> <WO>
7832 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7833 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7834 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7835
7836 At this point the pattern succeeds, and we work back down the stack to
7837 clean up, restoring as we go:
7838
7839 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7840 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7841 CO     <CO prev=NULL> <WO>
7842 NULL
7843
7844 *******************************************************************/
7845
7846 #define ST st->u.curlyx
7847
7848         case CURLYX:    /* start of /A*B/  (for complex A) */
7849         {
7850             /* No need to save/restore up to this paren */
7851             I32 parenfloor = scan->flags;
7852
7853             assert(next); /* keep Coverity happy */
7854             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7855                 next += ARG(next);
7856
7857             /* XXXX Probably it is better to teach regpush to support
7858                parenfloor > maxopenparen ... */
7859             if (parenfloor > (I32)rex->lastparen)
7860                 parenfloor = rex->lastparen; /* Pessimization... */
7861
7862             ST.prev_curlyx= cur_curlyx;
7863             cur_curlyx = st;
7864             ST.cp = PL_savestack_ix;
7865
7866             /* these fields contain the state of the current curly.
7867              * they are accessed by subsequent WHILEMs */
7868             ST.parenfloor = parenfloor;
7869             ST.me = scan;
7870             ST.B = next;
7871             ST.minmod = minmod;
7872             minmod = 0;
7873             ST.count = -1;      /* this will be updated by WHILEM */
7874             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7875
7876             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7877             NOT_REACHED; /* NOTREACHED */
7878         }
7879
7880         case CURLYX_end: /* just finished matching all of A*B */
7881             cur_curlyx = ST.prev_curlyx;
7882             sayYES;
7883             NOT_REACHED; /* NOTREACHED */
7884
7885         case CURLYX_end_fail: /* just failed to match all of A*B */
7886             regcpblow(ST.cp);
7887             cur_curlyx = ST.prev_curlyx;
7888             sayNO;
7889             NOT_REACHED; /* NOTREACHED */
7890
7891
7892 #undef ST
7893 #define ST st->u.whilem
7894
7895         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7896         {
7897             /* see the discussion above about CURLYX/WHILEM */
7898             I32 n;
7899             int min, max;
7900             regnode *A;
7901
7902             assert(cur_curlyx); /* keep Coverity happy */
7903
7904             min = ARG1(cur_curlyx->u.curlyx.me);
7905             max = ARG2(cur_curlyx->u.curlyx.me);
7906             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7907             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7908             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7909             ST.cache_offset = 0;
7910             ST.cache_mask = 0;
7911
7912
7913             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: matched %ld out of %d..%d\n",
7914                   depth, (long)n, min, max)
7915             );
7916
7917             /* First just match a string of min A's. */
7918
7919             if (n < min) {
7920                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7921                 cur_curlyx->u.curlyx.lastloc = locinput;
7922                 REGCP_SET(ST.lastcp);
7923
7924                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7925                 NOT_REACHED; /* NOTREACHED */
7926             }
7927
7928             /* If degenerate A matches "", assume A done. */
7929
7930             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7931                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: empty match detected, trying continuation...\n",
7932                    depth)
7933                 );
7934                 goto do_whilem_B_max;
7935             }
7936
7937             /* super-linear cache processing.
7938              *
7939              * The idea here is that for certain types of CURLYX/WHILEM -
7940              * principally those whose upper bound is infinity (and
7941              * excluding regexes that have things like \1 and other very
7942              * non-regular expresssiony things), then if a pattern like
7943              * /....A*.../ fails and we backtrack to the WHILEM, then we
7944              * make a note that this particular WHILEM op was at string
7945              * position 47 (say) when the rest of pattern failed. Then, if
7946              * we ever find ourselves back at that WHILEM, and at string
7947              * position 47 again, we can just fail immediately rather than
7948              * running the rest of the pattern again.
7949              *
7950              * This is very handy when patterns start to go
7951              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7952              * with a combinatorial explosion of backtracking.
7953              *
7954              * The cache is implemented as a bit array, with one bit per
7955              * string byte position per WHILEM op (up to 16) - so its
7956              * between 0.25 and 2x the string size.
7957              *
7958              * To avoid allocating a poscache buffer every time, we do an
7959              * initially countdown; only after we have  executed a WHILEM
7960              * op (string-length x #WHILEMs) times do we allocate the
7961              * cache.
7962              *
7963              * The top 4 bits of scan->flags byte say how many different
7964              * relevant CURLLYX/WHILEM op pairs there are, while the
7965              * bottom 4-bits is the identifying index number of this
7966              * WHILEM.
7967              */
7968
7969             if (scan->flags) {
7970
7971                 if (!reginfo->poscache_maxiter) {
7972                     /* start the countdown: Postpone detection until we
7973                      * know the match is not *that* much linear. */
7974                     reginfo->poscache_maxiter
7975                         =    (reginfo->strend - reginfo->strbeg + 1)
7976                            * (scan->flags>>4);
7977                     /* possible overflow for long strings and many CURLYX's */
7978                     if (reginfo->poscache_maxiter < 0)
7979                         reginfo->poscache_maxiter = I32_MAX;
7980                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7981                 }
7982
7983                 if (reginfo->poscache_iter-- == 0) {
7984                     /* initialise cache */
7985                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7986                     regmatch_info_aux *const aux = reginfo->info_aux;
7987                     if (aux->poscache) {
7988                         if ((SSize_t)reginfo->poscache_size < size) {
7989                             Renew(aux->poscache, size, char);
7990                             reginfo->poscache_size = size;
7991                         }
7992                         Zero(aux->poscache, size, char);
7993                     }
7994                     else {
7995                         reginfo->poscache_size = size;
7996                         Newxz(aux->poscache, size, char);
7997                     }
7998                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7999       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
8000                               PL_colors[4], PL_colors[5])
8001                     );
8002                 }
8003
8004                 if (reginfo->poscache_iter < 0) {
8005                     /* have we already failed at this position? */
8006                     SSize_t offset, mask;
8007
8008                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8009                     offset  = (scan->flags & 0xf) - 1
8010                                 +   (locinput - reginfo->strbeg)
8011                                   * (scan->flags>>4);
8012                     mask    = 1 << (offset % 8);
8013                     offset /= 8;
8014                     if (reginfo->info_aux->poscache[offset] & mask) {
8015                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "whilem: (cache) already tried at this position...\n",
8016                             depth)
8017                         );
8018                         cur_curlyx->u.curlyx.count--;
8019                         sayNO; /* cache records failure */
8020                     }
8021                     ST.cache_offset = offset;
8022                     ST.cache_mask   = mask;
8023                 }
8024             }
8025
8026             /* Prefer B over A for minimal matching. */
8027
8028             if (cur_curlyx->u.curlyx.minmod) {
8029                 ST.save_curlyx = cur_curlyx;
8030                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8031                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8032                                     locinput);
8033                 NOT_REACHED; /* NOTREACHED */
8034             }
8035
8036             /* Prefer A over B for maximal matching. */
8037
8038             if (n < max) { /* More greed allowed? */
8039                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8040                             maxopenparen);
8041                 cur_curlyx->u.curlyx.lastloc = locinput;
8042                 REGCP_SET(ST.lastcp);
8043                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
8044                 NOT_REACHED; /* NOTREACHED */
8045             }
8046             goto do_whilem_B_max;
8047         }
8048         NOT_REACHED; /* NOTREACHED */
8049
8050         case WHILEM_B_min: /* just matched B in a minimal match */
8051         case WHILEM_B_max: /* just matched B in a maximal match */
8052             cur_curlyx = ST.save_curlyx;
8053             sayYES;
8054             NOT_REACHED; /* NOTREACHED */
8055
8056         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8057             cur_curlyx = ST.save_curlyx;
8058             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8059             cur_curlyx->u.curlyx.count--;
8060             CACHEsayNO;
8061             NOT_REACHED; /* NOTREACHED */
8062
8063         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8064             REGCP_UNWIND(ST.lastcp);
8065             regcppop(rex, &maxopenparen);
8066             /* FALLTHROUGH */
8067         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8068             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8069             cur_curlyx->u.curlyx.count--;
8070             CACHEsayNO;
8071             NOT_REACHED; /* NOTREACHED */
8072
8073         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8074             REGCP_UNWIND(ST.lastcp);
8075             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8076             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "whilem: failed, trying continuation...\n",
8077                 depth)
8078             );
8079           do_whilem_B_max:
8080             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8081                 && ckWARN(WARN_REGEXP)
8082                 && !reginfo->warned)
8083             {
8084                 reginfo->warned = TRUE;
8085                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8086                      "Complex regular subexpression recursion limit (%d) "
8087                      "exceeded",
8088                      REG_INFTY - 1);
8089             }
8090
8091             /* now try B */
8092             ST.save_curlyx = cur_curlyx;
8093             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8094             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8095                                 locinput);
8096             NOT_REACHED; /* NOTREACHED */
8097
8098         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8099             cur_curlyx = ST.save_curlyx;
8100
8101             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8102                 /* Maximum greed exceeded */
8103                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8104                     && ckWARN(WARN_REGEXP)
8105                     && !reginfo->warned)
8106                 {
8107                     reginfo->warned     = TRUE;
8108                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8109                         "Complex regular subexpression recursion "
8110                         "limit (%d) exceeded",
8111                         REG_INFTY - 1);
8112                 }
8113                 cur_curlyx->u.curlyx.count--;
8114                 CACHEsayNO;
8115             }
8116
8117             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "trying longer...\n", depth)
8118             );
8119             /* Try grabbing another A and see if it helps. */
8120             cur_curlyx->u.curlyx.lastloc = locinput;
8121             PUSH_STATE_GOTO(WHILEM_A_min,
8122                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8123                 locinput);
8124             NOT_REACHED; /* NOTREACHED */
8125
8126 #undef  ST
8127 #define ST st->u.branch
8128
8129         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8130             next = scan + ARG(scan);
8131             if (next == scan)
8132                 next = NULL;
8133             scan = NEXTOPER(scan);
8134             /* FALLTHROUGH */
8135
8136         case BRANCH:        /*  /(...|A|...)/ */
8137             scan = NEXTOPER(scan); /* scan now points to inner node */
8138             ST.lastparen = rex->lastparen;
8139             ST.lastcloseparen = rex->lastcloseparen;
8140             ST.next_branch = next;
8141             REGCP_SET(ST.cp);
8142
8143             /* Now go into the branch */
8144             if (has_cutgroup) {
8145                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8146             } else {
8147                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8148             }
8149             NOT_REACHED; /* NOTREACHED */
8150
8151         case CUTGROUP:  /*  /(*THEN)/  */
8152             sv_yes_mark = st->u.mark.mark_name = scan->flags
8153                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8154                 : NULL;
8155             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8156             NOT_REACHED; /* NOTREACHED */
8157
8158         case CUTGROUP_next_fail:
8159             do_cutgroup = 1;
8160             no_final = 1;
8161             if (st->u.mark.mark_name)
8162                 sv_commit = st->u.mark.mark_name;
8163             sayNO;
8164             NOT_REACHED; /* NOTREACHED */
8165
8166         case BRANCH_next:
8167             sayYES;
8168             NOT_REACHED; /* NOTREACHED */
8169
8170         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8171             if (do_cutgroup) {
8172                 do_cutgroup = 0;
8173                 no_final = 0;
8174             }
8175             REGCP_UNWIND(ST.cp);
8176             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8177             scan = ST.next_branch;
8178             /* no more branches? */
8179             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8180                 DEBUG_EXECUTE_r({
8181                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8182                         depth,
8183                         PL_colors[4],
8184                         PL_colors[5] );
8185                 });
8186                 sayNO_SILENT;
8187             }
8188             continue; /* execute next BRANCH[J] op */
8189             /* NOTREACHED */
8190
8191         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8192             minmod = 1;
8193             break;
8194
8195 #undef  ST
8196 #define ST st->u.curlym
8197
8198         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8199
8200             /* This is an optimisation of CURLYX that enables us to push
8201              * only a single backtracking state, no matter how many matches
8202              * there are in {m,n}. It relies on the pattern being constant
8203              * length, with no parens to influence future backrefs
8204              */
8205
8206             ST.me = scan;
8207             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8208
8209             ST.lastparen      = rex->lastparen;
8210             ST.lastcloseparen = rex->lastcloseparen;
8211
8212             /* if paren positive, emulate an OPEN/CLOSE around A */
8213             if (ST.me->flags) {
8214                 U32 paren = ST.me->flags;
8215                 if (paren > maxopenparen)
8216                     maxopenparen = paren;
8217                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8218             }
8219             ST.A = scan;
8220             ST.B = next;
8221             ST.alen = 0;
8222             ST.count = 0;
8223             ST.minmod = minmod;
8224             minmod = 0;
8225             ST.c1 = CHRTEST_UNINIT;
8226             REGCP_SET(ST.cp);
8227
8228             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8229                 goto curlym_do_B;
8230
8231           curlym_do_A: /* execute the A in /A{m,n}B/  */
8232             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8233             NOT_REACHED; /* NOTREACHED */
8234
8235         case CURLYM_A: /* we've just matched an A */
8236             ST.count++;
8237             /* after first match, determine A's length: u.curlym.alen */
8238             if (ST.count == 1) {
8239                 if (reginfo->is_utf8_target) {
8240                     char *s = st->locinput;
8241                     while (s < locinput) {
8242                         ST.alen++;
8243                         s += UTF8SKIP(s);
8244                     }
8245                 }
8246                 else {
8247                     ST.alen = locinput - st->locinput;
8248                 }
8249                 if (ST.alen == 0)
8250                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8251             }
8252             DEBUG_EXECUTE_r(
8253                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8254                           depth, (IV) ST.count, (IV)ST.alen)
8255             );
8256
8257             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8258                 goto fake_end;
8259
8260             {
8261                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8262                 if ( max == REG_INFTY || ST.count < max )
8263                     goto curlym_do_A; /* try to match another A */
8264             }
8265             goto curlym_do_B; /* try to match B */
8266
8267         case CURLYM_A_fail: /* just failed to match an A */
8268             REGCP_UNWIND(ST.cp);
8269
8270
8271             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8272                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8273                 sayNO;
8274
8275           curlym_do_B: /* execute the B in /A{m,n}B/  */
8276             if (ST.c1 == CHRTEST_UNINIT) {
8277                 /* calculate c1 and c2 for possible match of 1st char
8278                  * following curly */
8279                 ST.c1 = ST.c2 = CHRTEST_VOID;
8280                 assert(ST.B);
8281                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8282                     regnode *text_node = ST.B;
8283                     if (! HAS_TEXT(text_node))
8284                         FIND_NEXT_IMPT(text_node);
8285                     /* this used to be
8286
8287                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8288
8289                         But the former is redundant in light of the latter.
8290
8291                         if this changes back then the macro for
8292                         IS_TEXT and friends need to change.
8293                      */
8294                     if (PL_regkind[OP(text_node)] == EXACT) {
8295                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8296                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8297                            reginfo))
8298                         {
8299                             sayNO;
8300                         }
8301                     }
8302                 }
8303             }
8304
8305             DEBUG_EXECUTE_r(
8306                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8307                     depth, (IV)ST.count)
8308                 );
8309             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8310                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8311                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8312                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8313                     {
8314                         /* simulate B failing */
8315                         DEBUG_OPTIMISE_r(
8316                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8317                                 depth,
8318                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8319                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8320                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8321                         );
8322                         state_num = CURLYM_B_fail;
8323                         goto reenter_switch;
8324                     }
8325                 }
8326                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8327                     /* simulate B failing */
8328                     DEBUG_OPTIMISE_r(
8329                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8330                             depth,
8331                             (int) nextchr, ST.c1, ST.c2)
8332                     );
8333                     state_num = CURLYM_B_fail;
8334                     goto reenter_switch;
8335                 }
8336             }
8337
8338             if (ST.me->flags) {
8339                 /* emulate CLOSE: mark current A as captured */
8340                 I32 paren = ST.me->flags;
8341                 if (ST.count) {
8342                     rex->offs[paren].start
8343                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
8344                     rex->offs[paren].end = locinput - reginfo->strbeg;
8345                     if ((U32)paren > rex->lastparen)
8346                         rex->lastparen = paren;
8347                     rex->lastcloseparen = paren;
8348                 }
8349                 else
8350                     rex->offs[paren].end = -1;
8351
8352                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8353                 {
8354                     if (ST.count)
8355                         goto fake_end;
8356                     else
8357                         sayNO;
8358                 }
8359             }
8360
8361             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8362             NOT_REACHED; /* NOTREACHED */
8363
8364         case CURLYM_B_fail: /* just failed to match a B */
8365             REGCP_UNWIND(ST.cp);
8366             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8367             if (ST.minmod) {
8368                 I32 max = ARG2(ST.me);
8369                 if (max != REG_INFTY && ST.count == max)
8370                     sayNO;
8371                 goto curlym_do_A; /* try to match a further A */
8372             }
8373             /* backtrack one A */
8374             if (ST.count == ARG1(ST.me) /* min */)
8375                 sayNO;
8376             ST.count--;
8377             SET_locinput(HOPc(locinput, -ST.alen));
8378             goto curlym_do_B; /* try to match B */
8379
8380 #undef ST
8381 #define ST st->u.curly
8382
8383 #define CURLY_SETPAREN(paren, success) \
8384     if (paren) { \
8385         if (success) { \
8386             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
8387             rex->offs[paren].end = locinput - reginfo->strbeg; \
8388             if (paren > rex->lastparen) \
8389                 rex->lastparen = paren; \
8390             rex->lastcloseparen = paren; \
8391         } \
8392         else { \
8393             rex->offs[paren].end = -1; \
8394             rex->lastparen      = ST.lastparen; \
8395             rex->lastcloseparen = ST.lastcloseparen; \
8396         } \
8397     }
8398
8399         case STAR:              /*  /A*B/ where A is width 1 char */
8400             ST.paren = 0;
8401             ST.min = 0;
8402             ST.max = REG_INFTY;
8403             scan = NEXTOPER(scan);
8404             goto repeat;
8405
8406         case PLUS:              /*  /A+B/ where A is width 1 char */
8407             ST.paren = 0;
8408             ST.min = 1;
8409             ST.max = REG_INFTY;
8410             scan = NEXTOPER(scan);
8411             goto repeat;
8412
8413         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8414             ST.paren = scan->flags;     /* Which paren to set */
8415             ST.lastparen      = rex->lastparen;
8416             ST.lastcloseparen = rex->lastcloseparen;
8417             if (ST.paren > maxopenparen)
8418                 maxopenparen = ST.paren;
8419             ST.min = ARG1(scan);  /* min to match */
8420             ST.max = ARG2(scan);  /* max to match */
8421             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8422             {
8423                 ST.min=1;
8424                 ST.max=1;
8425             }
8426             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8427             goto repeat;
8428
8429         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8430             ST.paren = 0;
8431             ST.min = ARG1(scan);  /* min to match */
8432             ST.max = ARG2(scan);  /* max to match */
8433             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8434           repeat:
8435             /*
8436             * Lookahead to avoid useless match attempts
8437             * when we know what character comes next.
8438             *
8439             * Used to only do .*x and .*?x, but now it allows
8440             * for )'s, ('s and (?{ ... })'s to be in the way
8441             * of the quantifier and the EXACT-like node.  -- japhy
8442             */
8443
8444             assert(ST.min <= ST.max);
8445             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8446                 ST.c1 = ST.c2 = CHRTEST_VOID;
8447             }
8448             else {
8449                 regnode *text_node = next;
8450
8451                 if (! HAS_TEXT(text_node))
8452                     FIND_NEXT_IMPT(text_node);
8453
8454                 if (! HAS_TEXT(text_node))
8455                     ST.c1 = ST.c2 = CHRTEST_VOID;
8456                 else {
8457                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8458                         ST.c1 = ST.c2 = CHRTEST_VOID;
8459                     }
8460                     else {
8461
8462                     /*  Currently we only get here when
8463
8464                         PL_rekind[OP(text_node)] == EXACT
8465
8466                         if this changes back then the macro for IS_TEXT and
8467                         friends need to change. */
8468                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8469                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8470                            reginfo))
8471                         {
8472                             sayNO;
8473                         }
8474                     }
8475                 }
8476             }
8477
8478             ST.A = scan;
8479             ST.B = next;
8480             if (minmod) {
8481                 char *li = locinput;
8482                 minmod = 0;
8483                 if (ST.min &&
8484                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8485                             < ST.min)
8486                     sayNO;
8487                 SET_locinput(li);
8488                 ST.count = ST.min;
8489                 REGCP_SET(ST.cp);
8490                 if (ST.c1 == CHRTEST_VOID)
8491                     goto curly_try_B_min;
8492
8493                 ST.oldloc = locinput;
8494
8495                 /* set ST.maxpos to the furthest point along the
8496                  * string that could possibly match */
8497                 if  (ST.max == REG_INFTY) {
8498                     ST.maxpos = reginfo->strend - 1;
8499                     if (utf8_target)
8500                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8501                             ST.maxpos--;
8502                 }
8503                 else if (utf8_target) {
8504                     int m = ST.max - ST.min;
8505                     for (ST.maxpos = locinput;
8506                          m >0 && ST.maxpos < reginfo->strend; m--)
8507                         ST.maxpos += UTF8SKIP(ST.maxpos);
8508                 }
8509                 else {
8510                     ST.maxpos = locinput + ST.max - ST.min;
8511                     if (ST.maxpos >= reginfo->strend)
8512                         ST.maxpos = reginfo->strend - 1;
8513                 }
8514                 goto curly_try_B_min_known;
8515
8516             }
8517             else {
8518                 /* avoid taking address of locinput, so it can remain
8519                  * a register var */
8520                 char *li = locinput;
8521                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8522                 if (ST.count < ST.min)
8523                     sayNO;
8524                 SET_locinput(li);
8525                 if ((ST.count > ST.min)
8526                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8527                 {
8528                     /* A{m,n} must come at the end of the string, there's
8529                      * no point in backing off ... */
8530                     ST.min = ST.count;
8531                     /* ...except that $ and \Z can match before *and* after
8532                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8533                        We may back off by one in this case. */
8534                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8535                         ST.min--;
8536                 }
8537                 REGCP_SET(ST.cp);
8538                 goto curly_try_B_max;
8539             }
8540             NOT_REACHED; /* NOTREACHED */
8541
8542         case CURLY_B_min_known_fail:
8543             /* failed to find B in a non-greedy match where c1,c2 valid */
8544
8545             REGCP_UNWIND(ST.cp);
8546             if (ST.paren) {
8547                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8548             }
8549             /* Couldn't or didn't -- move forward. */
8550             ST.oldloc = locinput;
8551             if (utf8_target)
8552                 locinput += UTF8SKIP(locinput);
8553             else
8554                 locinput++;
8555             ST.count++;
8556           curly_try_B_min_known:
8557              /* find the next place where 'B' could work, then call B */
8558             {
8559                 int n;
8560                 if (utf8_target) {
8561                     n = (ST.oldloc == locinput) ? 0 : 1;
8562                     if (ST.c1 == ST.c2) {
8563                         /* set n to utf8_distance(oldloc, locinput) */
8564                         while (locinput <= ST.maxpos
8565                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8566                         {
8567                             locinput += UTF8SKIP(locinput);
8568                             n++;
8569                         }
8570                     }
8571                     else {
8572                         /* set n to utf8_distance(oldloc, locinput) */
8573                         while (locinput <= ST.maxpos
8574                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8575                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8576                         {
8577                             locinput += UTF8SKIP(locinput);
8578                             n++;
8579                         }
8580                     }
8581                 }
8582                 else {  /* Not utf8_target */
8583                     if (ST.c1 == ST.c2) {
8584                         locinput = (char *) memchr(locinput,
8585                                                    ST.c1,
8586                                                    ST.maxpos + 1 - locinput);
8587                         if (! locinput) {
8588                             locinput = ST.maxpos + 1;
8589                         }
8590                     }
8591                     else {
8592                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8593
8594                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8595                             while (   locinput <= ST.maxpos
8596                                    && UCHARAT(locinput) != ST.c1
8597                                    && UCHARAT(locinput) != ST.c2)
8598                             {
8599                                 locinput++;
8600                             }
8601                         }
8602                         else {
8603                             /* If c1 and c2 only differ by a single bit, we can
8604                              * avoid a conditional each time through the loop,
8605                              * at the expense of a little preliminary setup and
8606                              * an extra mask each iteration.  By masking out
8607                              * that bit, we match exactly two characters, c1
8608                              * and c2, and so we don't have to test for both.
8609                              * On both ASCII and EBCDIC platforms, most of the
8610                              * ASCII-range and Latin1-range folded equivalents
8611                              * differ only in a single bit, so this is actually
8612                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8613                              * 0x61). */
8614                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8615                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8616                             while (   locinput <= ST.maxpos
8617                                    && (UCHARAT(locinput) & c1_c2_mask)
8618                                                                 != c1_masked)
8619                             {
8620                                 locinput++;
8621                             }
8622                         }
8623                     }
8624                     n = locinput - ST.oldloc;
8625                 }
8626                 if (locinput > ST.maxpos)
8627                     sayNO;
8628                 if (n) {
8629                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8630                      * at b; check that everything between oldloc and
8631                      * locinput matches */
8632                     char *li = ST.oldloc;
8633                     ST.count += n;
8634                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8635                         sayNO;
8636                     assert(n == REG_INFTY || locinput == li);
8637                 }
8638                 CURLY_SETPAREN(ST.paren, ST.count);
8639                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8640                     goto fake_end;
8641                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8642             }
8643             NOT_REACHED; /* NOTREACHED */
8644
8645         case CURLY_B_min_fail:
8646             /* failed to find B in a non-greedy match where c1,c2 invalid */
8647
8648             REGCP_UNWIND(ST.cp);
8649             if (ST.paren) {
8650                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8651             }
8652             /* failed -- move forward one */
8653             {
8654                 char *li = locinput;
8655                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8656                     sayNO;
8657                 }
8658                 locinput = li;
8659             }
8660             {
8661                 ST.count++;
8662                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8663                         ST.count > 0)) /* count overflow ? */
8664                 {
8665                   curly_try_B_min:
8666                     CURLY_SETPAREN(ST.paren, ST.count);
8667                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8668                         goto fake_end;
8669                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8670                 }
8671             }
8672             sayNO;
8673             NOT_REACHED; /* NOTREACHED */
8674
8675           curly_try_B_max:
8676             /* a successful greedy match: now try to match B */
8677             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8678                 goto fake_end;
8679             {
8680                 bool could_match = locinput < reginfo->strend;
8681
8682                 /* If it could work, try it. */
8683                 if (ST.c1 != CHRTEST_VOID && could_match) {
8684                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8685                     {
8686                         could_match = memEQ(locinput,
8687                                             ST.c1_utf8,
8688                                             UTF8SKIP(locinput))
8689                                     || memEQ(locinput,
8690                                              ST.c2_utf8,
8691                                              UTF8SKIP(locinput));
8692                     }
8693                     else {
8694                         could_match = UCHARAT(locinput) == ST.c1
8695                                       || UCHARAT(locinput) == ST.c2;
8696                     }
8697                 }
8698                 if (ST.c1 == CHRTEST_VOID || could_match) {
8699                     CURLY_SETPAREN(ST.paren, ST.count);
8700                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8701                     NOT_REACHED; /* NOTREACHED */
8702                 }
8703             }
8704             /* FALLTHROUGH */
8705
8706         case CURLY_B_max_fail:
8707             /* failed to find B in a greedy match */
8708
8709             REGCP_UNWIND(ST.cp);
8710             if (ST.paren) {
8711                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8712             }
8713             /*  back up. */
8714             if (--ST.count < ST.min)
8715                 sayNO;
8716             locinput = HOPc(locinput, -1);
8717             goto curly_try_B_max;
8718
8719 #undef ST
8720
8721         case END: /*  last op of main pattern  */
8722           fake_end:
8723             if (cur_eval) {
8724                 /* we've just finished A in /(??{A})B/; now continue with B */
8725                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8726                 st->u.eval.prev_rex = rex_sv;           /* inner */
8727
8728                 /* Save *all* the positions. */
8729                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8730                 rex_sv = CUR_EVAL.prev_rex;
8731                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8732                 SET_reg_curpm(rex_sv);
8733                 rex = ReANY(rex_sv);
8734                 rexi = RXi_GET(rex);
8735
8736                 st->u.eval.prev_curlyx = cur_curlyx;
8737                 cur_curlyx = CUR_EVAL.prev_curlyx;
8738
8739                 REGCP_SET(st->u.eval.lastcp);
8740
8741                 /* Restore parens of the outer rex without popping the
8742                  * savestack */
8743                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8744
8745                 st->u.eval.prev_eval = cur_eval;
8746                 cur_eval = CUR_EVAL.prev_eval;
8747                 DEBUG_EXECUTE_r(
8748                     Perl_re_exec_indentf( aTHX_  "EVAL trying tail ... (cur_eval=%p)\n",
8749                                       depth, cur_eval););
8750                 if ( nochange_depth )
8751                     nochange_depth--;
8752
8753                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8754
8755                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8756                                     locinput); /* match B */
8757             }
8758
8759             if (locinput < reginfo->till) {
8760                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8761                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8762                                       PL_colors[4],
8763                                       (long)(locinput - startpos),
8764                                       (long)(reginfo->till - startpos),
8765                                       PL_colors[5]));
8766
8767                 sayNO_SILENT;           /* Cannot match: too short. */
8768             }
8769             sayYES;                     /* Success! */
8770
8771         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8772             DEBUG_EXECUTE_r(
8773             Perl_re_exec_indentf( aTHX_  "%ssubpattern success...%s\n",
8774                 depth, PL_colors[4], PL_colors[5]));
8775             sayYES;                     /* Success! */
8776
8777 #undef  ST
8778 #define ST st->u.ifmatch
8779
8780         {
8781             char *newstart;
8782
8783         case SUSPEND:   /* (?>A) */
8784             ST.wanted = 1;
8785             newstart = locinput;
8786             goto do_ifmatch;
8787
8788         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8789             ST.wanted = 0;
8790             goto ifmatch_trivial_fail_test;
8791
8792         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8793             ST.wanted = 1;
8794           ifmatch_trivial_fail_test:
8795             if (scan->flags) {
8796                 char * const s = HOPBACKc(locinput, scan->flags);
8797                 if (!s) {
8798                     /* trivial fail */
8799                     if (logical) {
8800                         logical = 0;
8801                         sw = 1 - cBOOL(ST.wanted);
8802                     }
8803                     else if (ST.wanted)
8804                         sayNO;
8805                     next = scan + ARG(scan);
8806                     if (next == scan)
8807                         next = NULL;
8808                     break;
8809                 }
8810                 newstart = s;
8811             }
8812             else
8813                 newstart = locinput;
8814
8815           do_ifmatch:
8816             ST.me = scan;
8817             ST.logical = logical;
8818             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8819
8820             /* execute body of (?...A) */
8821             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8822             NOT_REACHED; /* NOTREACHED */
8823         }
8824
8825         case IFMATCH_A_fail: /* body of (?...A) failed */
8826             ST.wanted = !ST.wanted;
8827             /* FALLTHROUGH */
8828
8829         case IFMATCH_A: /* body of (?...A) succeeded */
8830             if (ST.logical) {
8831                 sw = cBOOL(ST.wanted);
8832             }
8833             else if (!ST.wanted)
8834                 sayNO;
8835
8836             if (OP(ST.me) != SUSPEND) {
8837                 /* restore old position except for (?>...) */
8838                 locinput = st->locinput;
8839             }
8840             scan = ST.me + ARG(ST.me);
8841             if (scan == ST.me)
8842                 scan = NULL;
8843             continue; /* execute B */
8844
8845 #undef ST
8846
8847         case LONGJMP: /*  alternative with many branches compiles to
8848                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8849             next = scan + ARG(scan);
8850             if (next == scan)
8851                 next = NULL;
8852             break;
8853
8854         case COMMIT:  /*  (*COMMIT)  */
8855             reginfo->cutpoint = reginfo->strend;
8856             /* FALLTHROUGH */
8857
8858         case PRUNE:   /*  (*PRUNE)   */
8859             if (scan->flags)
8860                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8861             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8862             NOT_REACHED; /* NOTREACHED */
8863
8864         case COMMIT_next_fail:
8865             no_final = 1;
8866             /* FALLTHROUGH */
8867             sayNO;
8868             NOT_REACHED; /* NOTREACHED */
8869
8870         case OPFAIL:   /* (*FAIL)  */
8871             if (scan->flags)
8872                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8873             if (logical) {
8874                 /* deal with (?(?!)X|Y) properly,
8875                  * make sure we trigger the no branch
8876                  * of the trailing IFTHEN structure*/
8877                 sw= 0;
8878                 break;
8879             } else {
8880                 sayNO;
8881             }
8882             NOT_REACHED; /* NOTREACHED */
8883
8884 #define ST st->u.mark
8885         case MARKPOINT: /*  (*MARK:foo)  */
8886             ST.prev_mark = mark_state;
8887             ST.mark_name = sv_commit = sv_yes_mark
8888                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8889             mark_state = st;
8890             ST.mark_loc = locinput;
8891             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8892             NOT_REACHED; /* NOTREACHED */
8893
8894         case MARKPOINT_next:
8895             mark_state = ST.prev_mark;
8896             sayYES;
8897             NOT_REACHED; /* NOTREACHED */
8898
8899         case MARKPOINT_next_fail:
8900             if (popmark && sv_eq(ST.mark_name,popmark))
8901             {
8902                 if (ST.mark_loc > startpoint)
8903                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8904                 popmark = NULL; /* we found our mark */
8905                 sv_commit = ST.mark_name;
8906
8907                 DEBUG_EXECUTE_r({
8908                         Perl_re_exec_indentf( aTHX_  "%ssetting cutpoint to mark:%" SVf "...%s\n",
8909                             depth,
8910                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8911                 });
8912             }
8913             mark_state = ST.prev_mark;
8914             sv_yes_mark = mark_state ?
8915                 mark_state->u.mark.mark_name : NULL;
8916             sayNO;
8917             NOT_REACHED; /* NOTREACHED */
8918
8919         case SKIP:  /*  (*SKIP)  */
8920             if (!scan->flags) {
8921                 /* (*SKIP) : if we fail we cut here*/
8922                 ST.mark_name = NULL;
8923                 ST.mark_loc = locinput;
8924                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8925             } else {
8926                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8927                    otherwise do nothing.  Meaning we need to scan
8928                  */
8929                 regmatch_state *cur = mark_state;
8930                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8931
8932                 while (cur) {
8933                     if ( sv_eq( cur->u.mark.mark_name,
8934                                 find ) )
8935                     {
8936                         ST.mark_name = find;
8937                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8938                     }
8939                     cur = cur->u.mark.prev_mark;
8940                 }
8941             }
8942             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8943             break;
8944
8945         case SKIP_next_fail:
8946             if (ST.mark_name) {
8947                 /* (*CUT:NAME) - Set up to search for the name as we
8948                    collapse the stack*/
8949                 popmark = ST.mark_name;
8950             } else {
8951                 /* (*CUT) - No name, we cut here.*/
8952                 if (ST.mark_loc > startpoint)
8953                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8954                 /* but we set sv_commit to latest mark_name if there
8955                    is one so they can test to see how things lead to this
8956                    cut */
8957                 if (mark_state)
8958                     sv_commit=mark_state->u.mark.mark_name;
8959             }
8960             no_final = 1;
8961             sayNO;
8962             NOT_REACHED; /* NOTREACHED */
8963 #undef ST
8964
8965         case LNBREAK: /* \R */
8966             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8967                 locinput += n;
8968             } else
8969                 sayNO;
8970             break;
8971
8972         default:
8973             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8974                           PTR2UV(scan), OP(scan));
8975             Perl_croak(aTHX_ "regexp memory corruption");
8976
8977         /* this is a point to jump to in order to increment
8978          * locinput by one character */
8979           increment_locinput:
8980             assert(!NEXTCHR_IS_EOS);
8981             if (utf8_target) {
8982                 locinput += PL_utf8skip[nextchr];
8983                 /* locinput is allowed to go 1 char off the end (signifying
8984                  * EOS), but not 2+ */
8985                 if (locinput > reginfo->strend)
8986                     sayNO;
8987             }
8988             else
8989                 locinput++;
8990             break;
8991
8992         } /* end switch */
8993
8994         /* switch break jumps here */
8995         scan = next; /* prepare to execute the next op and ... */
8996         continue;    /* ... jump back to the top, reusing st */
8997         /* NOTREACHED */
8998
8999       push_yes_state:
9000         /* push a state that backtracks on success */
9001         st->u.yes.prev_yes_state = yes_state;
9002         yes_state = st;
9003         /* FALLTHROUGH */
9004       push_state:
9005         /* push a new regex state, then continue at scan  */
9006         {
9007             regmatch_state *newst;
9008
9009             DEBUG_STACK_r({
9010                 regmatch_state *cur = st;
9011                 regmatch_state *curyes = yes_state;
9012                 U32 i;
9013                 regmatch_slab *slab = PL_regmatch_slab;
9014                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9015                     if (cur < SLAB_FIRST(slab)) {
9016                         slab = slab->prev;
9017                         cur = SLAB_LAST(slab);
9018                     }
9019                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9020                         depth,
9021                         i ? "    " : "push",
9022                         depth - i, PL_reg_name[cur->resume_state],
9023                         (curyes == cur) ? "yes" : ""
9024                     );
9025                     if (curyes == cur)
9026                         curyes = cur->u.yes.prev_yes_state;
9027                 }
9028             } else
9029                 DEBUG_STATE_pp("push")
9030             );
9031             depth++;
9032             st->locinput = locinput;
9033             newst = st+1;
9034             if (newst >  SLAB_LAST(PL_regmatch_slab))
9035                 newst = S_push_slab(aTHX);
9036             PL_regmatch_state = newst;
9037
9038             locinput = pushinput;
9039             st = newst;
9040             continue;
9041             /* NOTREACHED */
9042         }
9043     }
9044 #ifdef SOLARIS_BAD_OPTIMIZER
9045 #  undef PL_charclass
9046 #endif
9047
9048     /*
9049     * We get here only if there's trouble -- normally "case END" is
9050     * the terminating point.
9051     */
9052     Perl_croak(aTHX_ "corrupted regexp pointers");
9053     NOT_REACHED; /* NOTREACHED */
9054
9055   yes:
9056     if (yes_state) {
9057         /* we have successfully completed a subexpression, but we must now
9058          * pop to the state marked by yes_state and continue from there */
9059         assert(st != yes_state);
9060 #ifdef DEBUGGING
9061         while (st != yes_state) {
9062             st--;
9063             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9064                 PL_regmatch_slab = PL_regmatch_slab->prev;
9065                 st = SLAB_LAST(PL_regmatch_slab);
9066             }
9067             DEBUG_STATE_r({
9068                 if (no_final) {
9069                     DEBUG_STATE_pp("pop (no final)");
9070                 } else {
9071                     DEBUG_STATE_pp("pop (yes)");
9072                 }
9073             });
9074             depth--;
9075         }
9076 #else
9077         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9078             || yes_state > SLAB_LAST(PL_regmatch_slab))
9079         {
9080             /* not in this slab, pop slab */
9081             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9082             PL_regmatch_slab = PL_regmatch_slab->prev;
9083             st = SLAB_LAST(PL_regmatch_slab);
9084         }
9085         depth -= (st - yes_state);
9086 #endif
9087         st = yes_state;
9088         yes_state = st->u.yes.prev_yes_state;
9089         PL_regmatch_state = st;
9090
9091         if (no_final)
9092             locinput= st->locinput;
9093         state_num = st->resume_state + no_final;
9094         goto reenter_switch;
9095     }
9096
9097     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9098                           PL_colors[4], PL_colors[5]));
9099
9100     if (reginfo->info_aux_eval) {
9101         /* each successfully executed (?{...}) block does the equivalent of
9102          *   local $^R = do {...}
9103          * When popping the save stack, all these locals would be undone;
9104          * bypass this by setting the outermost saved $^R to the latest
9105          * value */
9106         /* I dont know if this is needed or works properly now.
9107          * see code related to PL_replgv elsewhere in this file.
9108          * Yves
9109          */
9110         if (oreplsv != GvSV(PL_replgv))
9111             sv_setsv(oreplsv, GvSV(PL_replgv));
9112     }
9113     result = 1;
9114     goto final_exit;
9115
9116   no:
9117     DEBUG_EXECUTE_r(
9118         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9119             depth,
9120             PL_colors[4], PL_colors[5])
9121         );
9122
9123   no_silent:
9124     if (no_final) {
9125         if (yes_state) {
9126             goto yes;
9127         } else {
9128             goto final_exit;
9129         }
9130     }
9131     if (depth) {
9132         /* there's a previous state to backtrack to */
9133         st--;
9134         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9135             PL_regmatch_slab = PL_regmatch_slab->prev;
9136             st = SLAB_LAST(PL_regmatch_slab);
9137         }
9138         PL_regmatch_state = st;
9139         locinput= st->locinput;
9140
9141         DEBUG_STATE_pp("pop");
9142         depth--;
9143         if (yes_state == st)
9144             yes_state = st->u.yes.prev_yes_state;
9145
9146         state_num = st->resume_state + 1; /* failure = success + 1 */
9147         PERL_ASYNC_CHECK();
9148         goto reenter_switch;
9149     }
9150     result = 0;
9151
9152   final_exit:
9153     if (rex->intflags & PREGf_VERBARG_SEEN) {
9154         SV *sv_err = get_sv("REGERROR", 1);
9155         SV *sv_mrk = get_sv("REGMARK", 1);
9156         if (result) {
9157             sv_commit = &PL_sv_no;
9158             if (!sv_yes_mark)
9159                 sv_yes_mark = &PL_sv_yes;
9160         } else {
9161             if (!sv_commit)
9162                 sv_commit = &PL_sv_yes;
9163             sv_yes_mark = &PL_sv_no;
9164         }
9165         assert(sv_err);
9166         assert(sv_mrk);
9167         sv_setsv(sv_err, sv_commit);
9168         sv_setsv(sv_mrk, sv_yes_mark);
9169     }
9170
9171
9172     if (last_pushed_cv) {
9173         dSP;
9174         /* see "Some notes about MULTICALL" above */
9175         POP_MULTICALL;
9176         PERL_UNUSED_VAR(SP);
9177     }
9178     else
9179         LEAVE_SCOPE(orig_savestack_ix);
9180
9181     assert(!result ||  locinput - reginfo->strbeg >= 0);
9182     return result ?  locinput - reginfo->strbeg : -1;
9183 }
9184
9185 /*
9186  - regrepeat - repeatedly match something simple, report how many
9187  *
9188  * What 'simple' means is a node which can be the operand of a quantifier like
9189  * '+', or {1,3}
9190  *
9191  * startposp - pointer a pointer to the start position.  This is updated
9192  *             to point to the byte following the highest successful
9193  *             match.
9194  * p         - the regnode to be repeatedly matched against.
9195  * reginfo   - struct holding match state, such as strend
9196  * max       - maximum number of things to match.
9197  * depth     - (for debugging) backtracking depth.
9198  */
9199 STATIC I32
9200 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9201             regmatch_info *const reginfo, I32 max _pDEPTH)
9202 {
9203     char *scan;     /* Pointer to current position in target string */
9204     I32 c;
9205     char *loceol = reginfo->strend;   /* local version */
9206     I32 hardcount = 0;  /* How many matches so far */
9207     bool utf8_target = reginfo->is_utf8_target;
9208     unsigned int to_complement = 0;  /* Invert the result? */
9209     UV utf8_flags;
9210     _char_class_number classnum;
9211
9212     PERL_ARGS_ASSERT_REGREPEAT;
9213
9214     scan = *startposp;
9215     if (max == REG_INFTY)
9216         max = I32_MAX;
9217     else if (! utf8_target && loceol - scan > max)
9218         loceol = scan + max;
9219
9220     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9221      * to the maximum of how far we should go in it (leaving it set to the real
9222      * end, if the maximum permissible would take us beyond that).  This allows
9223      * us to make the loop exit condition that we haven't gone past <loceol> to
9224      * also mean that we haven't exceeded the max permissible count, saving a
9225      * test each time through the loop.  But it assumes that the OP matches a
9226      * single byte, which is true for most of the OPs below when applied to a
9227      * non-UTF-8 target.  Those relatively few OPs that don't have this
9228      * characteristic will have to compensate.
9229      *
9230      * There is no adjustment for UTF-8 targets, as the number of bytes per
9231      * character varies.  OPs will have to test both that the count is less
9232      * than the max permissible (using <hardcount> to keep track), and that we
9233      * are still within the bounds of the string (using <loceol>.  A few OPs
9234      * match a single byte no matter what the encoding.  They can omit the max
9235      * test if, for the UTF-8 case, they do the adjustment that was skipped
9236      * above.
9237      *
9238      * Thus, the code above sets things up for the common case; and exceptional
9239      * cases need extra work; the common case is to make sure <scan> doesn't
9240      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9241      * count doesn't exceed the maximum permissible */
9242
9243     switch (OP(p)) {
9244     case REG_ANY:
9245         if (utf8_target) {
9246             while (scan < loceol && hardcount < max && *scan != '\n') {
9247                 scan += UTF8SKIP(scan);
9248                 hardcount++;
9249             }
9250         } else {
9251             scan = (char *) memchr(scan, '\n', loceol - scan);
9252             if (! scan) {
9253                 scan = loceol;
9254             }
9255         }
9256         break;
9257     case SANY:
9258         if (utf8_target) {
9259             while (scan < loceol && hardcount < max) {
9260                 scan += UTF8SKIP(scan);
9261                 hardcount++;
9262             }
9263         }
9264         else
9265             scan = loceol;
9266         break;
9267     case EXACTL:
9268         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9269         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9270             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9271         }
9272         /* FALLTHROUGH */
9273     case EXACT:
9274         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9275
9276         c = (U8)*STRING(p);
9277
9278         /* Can use a simple find if the pattern char to match on is invariant
9279          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9280          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9281          * true iff it doesn't matter if the argument is in UTF-8 or not */
9282         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9283             if (utf8_target && loceol - scan > max) {
9284                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9285                  * since here, to match at all, 1 char == 1 byte */
9286                 loceol = scan + max;
9287             }
9288             scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9289         }
9290         else if (reginfo->is_utf8_pat) {
9291             if (utf8_target) {
9292                 STRLEN scan_char_len;
9293
9294                 /* When both target and pattern are UTF-8, we have to do
9295                  * string EQ */
9296                 while (hardcount < max
9297                        && scan < loceol
9298                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9299                        && memEQ(scan, STRING(p), scan_char_len))
9300                 {
9301                     scan += scan_char_len;
9302                     hardcount++;
9303                 }
9304             }
9305             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9306
9307                 /* Target isn't utf8; convert the character in the UTF-8
9308                  * pattern to non-UTF8, and do a simple find */
9309                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9310                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9311             } /* else pattern char is above Latin1, can't possibly match the
9312                  non-UTF-8 target */
9313         }
9314         else {
9315
9316             /* Here, the string must be utf8; pattern isn't, and <c> is
9317              * different in utf8 than not, so can't compare them directly.
9318              * Outside the loop, find the two utf8 bytes that represent c, and
9319              * then look for those in sequence in the utf8 string */
9320             U8 high = UTF8_TWO_BYTE_HI(c);
9321             U8 low = UTF8_TWO_BYTE_LO(c);
9322
9323             while (hardcount < max
9324                     && scan + 1 < loceol
9325                     && UCHARAT(scan) == high
9326                     && UCHARAT(scan + 1) == low)
9327             {
9328                 scan += 2;
9329                 hardcount++;
9330             }
9331         }
9332         break;
9333
9334     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9335         assert(! reginfo->is_utf8_pat);
9336         /* FALLTHROUGH */
9337     case EXACTFAA:
9338         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9339         goto do_exactf;
9340
9341     case EXACTFL:
9342         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9343         utf8_flags = FOLDEQ_LOCALE;
9344         goto do_exactf;
9345
9346     case EXACTF:   /* This node only generated for non-utf8 patterns */
9347         assert(! reginfo->is_utf8_pat);
9348         utf8_flags = 0;
9349         goto do_exactf;
9350
9351     case EXACTFLU8:
9352         if (! utf8_target) {
9353             break;
9354         }
9355         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9356                                     | FOLDEQ_S2_FOLDS_SANE;
9357         goto do_exactf;
9358
9359     case EXACTFU_SS:
9360     case EXACTFU:
9361         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9362
9363       do_exactf: {
9364         int c1, c2;
9365         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9366
9367         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9368
9369         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9370                                         reginfo))
9371         {
9372             if (c1 == CHRTEST_VOID) {
9373                 /* Use full Unicode fold matching */
9374                 char *tmpeol = reginfo->strend;
9375                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9376                 while (hardcount < max
9377                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9378                                              STRING(p), NULL, pat_len,
9379                                              reginfo->is_utf8_pat, utf8_flags))
9380                 {
9381                     scan = tmpeol;
9382                     tmpeol = reginfo->strend;
9383                     hardcount++;
9384                 }
9385             }
9386             else if (utf8_target) {
9387                 if (c1 == c2) {
9388                     while (scan < loceol
9389                            && hardcount < max
9390                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9391                     {
9392                         scan += UTF8SKIP(scan);
9393                         hardcount++;
9394                     }
9395                 }
9396                 else {
9397                     while (scan < loceol
9398                            && hardcount < max
9399                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9400                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9401                     {
9402                         scan += UTF8SKIP(scan);
9403                         hardcount++;
9404                     }
9405                 }
9406             }
9407             else if (c1 == c2) {
9408                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9409             }
9410             else {
9411                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9412                  * a conditional each time through the loop if the characters
9413                  * differ only in a single bit, as is the usual situation */
9414                 U8 c1_c2_bits_differing = c1 ^ c2;
9415
9416                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9417                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9418
9419                     scan = (char *) find_span_end_mask((U8 *) scan,
9420                                                        (U8 *) loceol,
9421                                                        c1 & c1_c2_mask,
9422                                                        c1_c2_mask);
9423                 }
9424                 else {
9425                     while (    scan < loceol
9426                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9427                     {
9428                         scan++;
9429                     }
9430                 }
9431             }
9432         }
9433         break;
9434     }
9435     case ANYOFL:
9436         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9437
9438         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9439             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9440         }
9441         /* FALLTHROUGH */
9442     case ANYOFD:
9443     case ANYOF:
9444         if (utf8_target) {
9445             while (hardcount < max
9446                    && scan < loceol
9447                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9448             {
9449                 scan += UTF8SKIP(scan);
9450                 hardcount++;
9451             }
9452         }
9453         else if (ANYOF_FLAGS(p)) {
9454             while (scan < loceol
9455                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9456                 scan++;
9457         }
9458         else {
9459             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9460                 scan++;
9461         }
9462         break;
9463
9464     case ANYOFM:
9465         if (utf8_target && loceol - scan > max) {
9466
9467             /* We didn't adjust <loceol> at the beginning of this routine
9468              * because is UTF-8, but it is actually ok to do so, since here, to
9469              * match, 1 char == 1 byte. */
9470             loceol = scan + max;
9471         }
9472
9473         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9474         break;
9475
9476     case ASCII:
9477         if (utf8_target && loceol - scan > max) {
9478             loceol = scan + max;
9479         }
9480
9481         scan = find_next_non_ascii(scan, loceol, utf8_target);
9482         break;
9483
9484     case NASCII:
9485         if (utf8_target) {
9486             while (     hardcount < max
9487                    &&   scan < loceol
9488                    && ! isASCII_utf8_safe(scan, loceol))
9489             {
9490                 scan += UTF8SKIP(scan);
9491                 hardcount++;
9492             }
9493         }
9494         else {
9495             scan = find_next_ascii(scan, loceol, utf8_target);
9496         }
9497         break;
9498
9499     /* The argument (FLAGS) to all the POSIX node types is the class number */
9500
9501     case NPOSIXL:
9502         to_complement = 1;
9503         /* FALLTHROUGH */
9504
9505     case POSIXL:
9506         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9507         if (! utf8_target) {
9508             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9509                                                                    *scan)))
9510             {
9511                 scan++;
9512             }
9513         } else {
9514             while (hardcount < max && scan < loceol
9515                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9516                                                                   (U8 *) scan)))
9517             {
9518                 scan += UTF8SKIP(scan);
9519                 hardcount++;
9520             }
9521         }
9522         break;
9523
9524     case POSIXD:
9525         if (utf8_target) {
9526             goto utf8_posix;
9527         }
9528         /* FALLTHROUGH */
9529
9530     case POSIXA:
9531         if (utf8_target && loceol - scan > max) {
9532
9533             /* We didn't adjust <loceol> at the beginning of this routine
9534              * because is UTF-8, but it is actually ok to do so, since here, to
9535              * match, 1 char == 1 byte. */
9536             loceol = scan + max;
9537         }
9538         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9539             scan++;
9540         }
9541         break;
9542
9543     case NPOSIXD:
9544         if (utf8_target) {
9545             to_complement = 1;
9546             goto utf8_posix;
9547         }
9548         /* FALLTHROUGH */
9549
9550     case NPOSIXA:
9551         if (! utf8_target) {
9552             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9553                 scan++;
9554             }
9555         }
9556         else {
9557
9558             /* The complement of something that matches only ASCII matches all
9559              * non-ASCII, plus everything in ASCII that isn't in the class. */
9560             while (hardcount < max && scan < loceol
9561                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9562                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9563             {
9564                 scan += UTF8SKIP(scan);
9565                 hardcount++;
9566             }
9567         }
9568         break;
9569
9570     case NPOSIXU:
9571         to_complement = 1;
9572         /* FALLTHROUGH */
9573
9574     case POSIXU:
9575         if (! utf8_target) {
9576             while (scan < loceol && to_complement
9577                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9578             {
9579                 scan++;
9580             }
9581         }
9582         else {
9583           utf8_posix:
9584             classnum = (_char_class_number) FLAGS(p);
9585             if (classnum < _FIRST_NON_SWASH_CC) {
9586
9587                 /* Here, a swash is needed for above-Latin1 code points.
9588                  * Process as many Latin1 code points using the built-in rules.
9589                  * Go to another loop to finish processing upon encountering
9590                  * the first Latin1 code point.  We could do that in this loop
9591                  * as well, but the other way saves having to test if the swash
9592                  * has been loaded every time through the loop: extra space to
9593                  * save a test. */
9594                 while (hardcount < max && scan < loceol) {
9595                     if (UTF8_IS_INVARIANT(*scan)) {
9596                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9597                                                                    classnum))))
9598                         {
9599                             break;
9600                         }
9601                         scan++;
9602                     }
9603                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9604                         if (! (to_complement
9605                               ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9606                                                                      *(scan + 1)),
9607                                                     classnum))))
9608                         {
9609                             break;
9610                         }
9611                         scan += 2;
9612                     }
9613                     else {
9614                         goto found_above_latin1;
9615                     }
9616
9617                     hardcount++;
9618                 }
9619             }
9620             else {
9621                 /* For these character classes, the knowledge of how to handle
9622                  * every code point is compiled in to Perl via a macro.  This
9623                  * code is written for making the loops as tight as possible.
9624                  * It could be refactored to save space instead */
9625                 switch (classnum) {
9626                     case _CC_ENUM_SPACE:
9627                         while (hardcount < max
9628                                && scan < loceol
9629                                && (to_complement
9630                                    ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9631                         {
9632                             scan += UTF8SKIP(scan);
9633                             hardcount++;
9634                         }
9635                         break;
9636                     case _CC_ENUM_BLANK:
9637                         while (hardcount < max
9638                                && scan < loceol
9639                                && (to_complement
9640                                     ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9641                         {
9642                             scan += UTF8SKIP(scan);
9643                             hardcount++;
9644                         }
9645                         break;
9646                     case _CC_ENUM_XDIGIT:
9647                         while (hardcount < max
9648                                && scan < loceol
9649                                && (to_complement
9650                                    ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9651                         {
9652                             scan += UTF8SKIP(scan);
9653                             hardcount++;
9654                         }
9655                         break;
9656                     case _CC_ENUM_VERTSPACE:
9657                         while (hardcount < max
9658                                && scan < loceol
9659                                && (to_complement
9660                                    ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9661                         {
9662                             scan += UTF8SKIP(scan);
9663                             hardcount++;
9664                         }
9665                         break;
9666                     case _CC_ENUM_CNTRL:
9667                         while (hardcount < max
9668                                && scan < loceol
9669                                && (to_complement
9670                                    ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9671                         {
9672                             scan += UTF8SKIP(scan);
9673                             hardcount++;
9674                         }
9675                         break;
9676                     default:
9677                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9678                 }
9679             }
9680         }
9681         break;
9682
9683       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
9684
9685         /* Load the swash if not already present */
9686         if (! PL_utf8_swash_ptrs[classnum]) {
9687             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9688             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9689                                         "utf8",
9690                                         "",
9691                                         &PL_sv_undef, 1, 0,
9692                                         PL_XPosix_ptrs[classnum], &flags);
9693         }
9694
9695         while (hardcount < max && scan < loceol
9696                && to_complement ^ cBOOL(_generic_utf8_safe(
9697                                        classnum,
9698                                        scan,
9699                                        loceol,
9700                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
9701                                                    (U8 *) scan,
9702                                                    TRUE))))
9703         {
9704             scan += UTF8SKIP(scan);
9705             hardcount++;
9706         }
9707         break;
9708
9709     case LNBREAK:
9710         if (utf8_target) {
9711             while (hardcount < max && scan < loceol &&
9712                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9713                 scan += c;
9714                 hardcount++;
9715             }
9716         } else {
9717             /* LNBREAK can match one or two latin chars, which is ok, but we
9718              * have to use hardcount in this situation, and throw away the
9719              * adjustment to <loceol> done before the switch statement */
9720             loceol = reginfo->strend;
9721             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9722                 scan+=c;
9723                 hardcount++;
9724             }
9725         }
9726         break;
9727
9728     case BOUNDL:
9729     case NBOUNDL:
9730         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9731         /* FALLTHROUGH */
9732     case BOUND:
9733     case BOUNDA:
9734     case BOUNDU:
9735     case EOS:
9736     case GPOS:
9737     case KEEPS:
9738     case NBOUND:
9739     case NBOUNDA:
9740     case NBOUNDU:
9741     case OPFAIL:
9742     case SBOL:
9743     case SEOL:
9744         /* These are all 0 width, so match right here or not at all. */
9745         break;
9746
9747     default:
9748         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9749         NOT_REACHED; /* NOTREACHED */
9750
9751     }
9752
9753     if (hardcount)
9754         c = hardcount;
9755     else
9756         c = scan - *startposp;
9757     *startposp = scan;
9758
9759     DEBUG_r({
9760         GET_RE_DEBUG_FLAGS_DECL;
9761         DEBUG_EXECUTE_r({
9762             SV * const prop = sv_newmortal();
9763             regprop(prog, prop, p, reginfo, NULL);
9764             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9765                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9766         });
9767     });
9768
9769     return(c);
9770 }
9771
9772
9773 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9774 /*
9775 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9776 create a copy so that changes the caller makes won't change the shared one.
9777 If <altsvp> is non-null, will return NULL in it, for back-compat.
9778  */
9779 SV *
9780 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9781 {
9782     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9783
9784     if (altsvp) {
9785         *altsvp = NULL;
9786     }
9787
9788     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9789 }
9790
9791 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9792
9793 /*
9794  - reginclass - determine if a character falls into a character class
9795
9796   n is the ANYOF-type regnode
9797   p is the target string
9798   p_end points to one byte beyond the end of the target string
9799   utf8_target tells whether p is in UTF-8.
9800
9801   Returns true if matched; false otherwise.
9802
9803   Note that this can be a synthetic start class, a combination of various
9804   nodes, so things you think might be mutually exclusive, such as locale,
9805   aren't.  It can match both locale and non-locale
9806
9807  */
9808
9809 STATIC bool
9810 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9811 {
9812     dVAR;
9813     const char flags = ANYOF_FLAGS(n);
9814     bool match = FALSE;
9815     UV c = *p;
9816
9817     PERL_ARGS_ASSERT_REGINCLASS;
9818
9819     /* If c is not already the code point, get it.  Note that
9820      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9821     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9822         STRLEN c_len = 0;
9823         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9824         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9825         if (c_len == (STRLEN)-1) {
9826             _force_out_malformed_utf8_message(p, p_end,
9827                                               utf8n_flags,
9828                                               1 /* 1 means die */ );
9829             NOT_REACHED; /* NOTREACHED */
9830         }
9831         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9832             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9833         }
9834     }
9835
9836     /* If this character is potentially in the bitmap, check it */
9837     if (c < NUM_ANYOF_CODE_POINTS) {
9838         if (ANYOF_BITMAP_TEST(n, c))
9839             match = TRUE;
9840         else if ((flags
9841                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9842                   && OP(n) == ANYOFD
9843                   && ! utf8_target
9844                   && ! isASCII(c))
9845         {
9846             match = TRUE;
9847         }
9848         else if (flags & ANYOF_LOCALE_FLAGS) {
9849             if ((flags & ANYOFL_FOLD)
9850                 && c < 256
9851                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9852             {
9853                 match = TRUE;
9854             }
9855             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9856                      && c < 256
9857             ) {
9858
9859                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9860                  * if the class includes the Posix character class given by
9861                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9862                  * complemented Posix class given by int(bit/2).  So we loop
9863                  * through the bits, each time changing whether we complement
9864                  * the result or not.  Suppose for the sake of illustration
9865                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9866                  * is set, it means there is a match for this ANYOF node if the
9867                  * character is in the class given by the expression (0 / 2 = 0
9868                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9869                  * and since 'to_complement' is 0, the result will stay TRUE,
9870                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9871                  * bit 1 is 1.  That means there is a match if the character
9872                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9873                  * but will on bit 1.  On the second iteration 'to_complement'
9874                  * will be 1, so the exclusive or will reverse things, so we
9875                  * are testing for \W.  On the third iteration, 'to_complement'
9876                  * will be 0, and we would be testing for \s; the fourth
9877                  * iteration would test for \S, etc.
9878                  *
9879                  * Note that this code assumes that all the classes are closed
9880                  * under folding.  For example, if a character matches \w, then
9881                  * its fold does too; and vice versa.  This should be true for
9882                  * any well-behaved locale for all the currently defined Posix
9883                  * classes, except for :lower: and :upper:, which are handled
9884                  * by the pseudo-class :cased: which matches if either of the
9885                  * other two does.  To get rid of this assumption, an outer
9886                  * loop could be used below to iterate over both the source
9887                  * character, and its fold (if different) */
9888
9889                 int count = 0;
9890                 int to_complement = 0;
9891
9892                 while (count < ANYOF_MAX) {
9893                     if (ANYOF_POSIXL_TEST(n, count)
9894                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9895                     {
9896                         match = TRUE;
9897                         break;
9898                     }
9899                     count++;
9900                     to_complement ^= 1;
9901                 }
9902             }
9903         }
9904     }
9905
9906
9907     /* If the bitmap didn't (or couldn't) match, and something outside the
9908      * bitmap could match, try that. */
9909     if (!match) {
9910         if (c >= NUM_ANYOF_CODE_POINTS
9911             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9912         {
9913             match = TRUE;       /* Everything above the bitmap matches */
9914         }
9915             /* Here doesn't match everything above the bitmap.  If there is
9916              * some information available beyond the bitmap, we may find a
9917              * match in it.  If so, this is most likely because the code point
9918              * is outside the bitmap range.  But rarely, it could be because of
9919              * some other reason.  If so, various flags are set to indicate
9920              * this possibility.  On ANYOFD nodes, there may be matches that
9921              * happen only when the target string is UTF-8; or for other node
9922              * types, because runtime lookup is needed, regardless of the
9923              * UTF-8ness of the target string.  Finally, under /il, there may
9924              * be some matches only possible if the locale is a UTF-8 one. */
9925         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9926                  && (   c >= NUM_ANYOF_CODE_POINTS
9927                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9928                          && (   UNLIKELY(OP(n) != ANYOFD)
9929                              || (utf8_target && ! isASCII_uni(c)
9930 #                               if NUM_ANYOF_CODE_POINTS > 256
9931                                                                  && c < 256
9932 #                               endif
9933                                 )))
9934                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9935                          && IN_UTF8_CTYPE_LOCALE)))
9936         {
9937             SV* only_utf8_locale = NULL;
9938             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9939                                                        &only_utf8_locale, NULL);
9940             if (sw) {
9941                 U8 utf8_buffer[2];
9942                 U8 * utf8_p;
9943                 if (utf8_target) {
9944                     utf8_p = (U8 *) p;
9945                 } else { /* Convert to utf8 */
9946                     utf8_p = utf8_buffer;
9947                     append_utf8_from_native_byte(*p, &utf8_p);
9948                     utf8_p = utf8_buffer;
9949                 }
9950
9951                 if (swash_fetch(sw, utf8_p, TRUE)) {
9952                     match = TRUE;
9953                 }
9954             }
9955             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9956                 match = _invlist_contains_cp(only_utf8_locale, c);
9957             }
9958         }
9959
9960         if (UNICODE_IS_SUPER(c)
9961             && (flags
9962                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9963             && OP(n) != ANYOFD
9964             && ckWARN_d(WARN_NON_UNICODE))
9965         {
9966             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9967                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9968         }
9969     }
9970
9971 #if ANYOF_INVERT != 1
9972     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9973      * use it, don't */
9974 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9975 #endif
9976
9977     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9978     return (flags & ANYOF_INVERT) ^ match;
9979 }
9980
9981 STATIC U8 *
9982 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9983 {
9984     /* return the position 'off' UTF-8 characters away from 's', forward if
9985      * 'off' >= 0, backwards if negative.  But don't go outside of position
9986      * 'lim', which better be < s  if off < 0 */
9987
9988     PERL_ARGS_ASSERT_REGHOP3;
9989
9990     if (off >= 0) {
9991         while (off-- && s < lim) {
9992             /* XXX could check well-formedness here */
9993             U8 *new_s = s + UTF8SKIP(s);
9994             if (new_s > lim) /* lim may be in the middle of a long character */
9995                 return s;
9996             s = new_s;
9997         }
9998     }
9999     else {
10000         while (off++ && s > lim) {
10001             s--;
10002             if (UTF8_IS_CONTINUED(*s)) {
10003                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10004                     s--;
10005                 if (! UTF8_IS_START(*s)) {
10006                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10007                 }
10008             }
10009             /* XXX could check well-formedness here */
10010         }
10011     }
10012     return s;
10013 }
10014
10015 STATIC U8 *
10016 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10017 {
10018     PERL_ARGS_ASSERT_REGHOP4;
10019
10020     if (off >= 0) {
10021         while (off-- && s < rlim) {
10022             /* XXX could check well-formedness here */
10023             s += UTF8SKIP(s);
10024         }
10025     }
10026     else {
10027         while (off++ && s > llim) {
10028             s--;
10029             if (UTF8_IS_CONTINUED(*s)) {
10030                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10031                     s--;
10032                 if (! UTF8_IS_START(*s)) {
10033                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10034                 }
10035             }
10036             /* XXX could check well-formedness here */
10037         }
10038     }
10039     return s;
10040 }
10041
10042 /* like reghop3, but returns NULL on overrun, rather than returning last
10043  * char pos */
10044
10045 STATIC U8 *
10046 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10047 {
10048     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10049
10050     if (off >= 0) {
10051         while (off-- && s < lim) {
10052             /* XXX could check well-formedness here */
10053             s += UTF8SKIP(s);
10054         }
10055         if (off >= 0)
10056             return NULL;
10057     }
10058     else {
10059         while (off++ && s > lim) {
10060             s--;
10061             if (UTF8_IS_CONTINUED(*s)) {
10062                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10063                     s--;
10064                 if (! UTF8_IS_START(*s)) {
10065                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10066                 }
10067             }
10068             /* XXX could check well-formedness here */
10069         }
10070         if (off <= 0)
10071             return NULL;
10072     }
10073     return s;
10074 }
10075
10076
10077 /* when executing a regex that may have (?{}), extra stuff needs setting
10078    up that will be visible to the called code, even before the current
10079    match has finished. In particular:
10080
10081    * $_ is localised to the SV currently being matched;
10082    * pos($_) is created if necessary, ready to be updated on each call-out
10083      to code;
10084    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10085      isn't set until the current pattern is successfully finished), so that
10086      $1 etc of the match-so-far can be seen;
10087    * save the old values of subbeg etc of the current regex, and  set then
10088      to the current string (again, this is normally only done at the end
10089      of execution)
10090 */
10091
10092 static void
10093 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10094 {
10095     MAGIC *mg;
10096     regexp *const rex = ReANY(reginfo->prog);
10097     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10098
10099     eval_state->rex = rex;
10100
10101     if (reginfo->sv) {
10102         /* Make $_ available to executed code. */
10103         if (reginfo->sv != DEFSV) {
10104             SAVE_DEFSV;
10105             DEFSV_set(reginfo->sv);
10106         }
10107
10108         if (!(mg = mg_find_mglob(reginfo->sv))) {
10109             /* prepare for quick setting of pos */
10110             mg = sv_magicext_mglob(reginfo->sv);
10111             mg->mg_len = -1;
10112         }
10113         eval_state->pos_magic = mg;
10114         eval_state->pos       = mg->mg_len;
10115         eval_state->pos_flags = mg->mg_flags;
10116     }
10117     else
10118         eval_state->pos_magic = NULL;
10119
10120     if (!PL_reg_curpm) {
10121         /* PL_reg_curpm is a fake PMOP that we can attach the current
10122          * regex to and point PL_curpm at, so that $1 et al are visible
10123          * within a /(?{})/. It's just allocated once per interpreter the
10124          * first time its needed */
10125         Newxz(PL_reg_curpm, 1, PMOP);
10126 #ifdef USE_ITHREADS
10127         {
10128             SV* const repointer = &PL_sv_undef;
10129             /* this regexp is also owned by the new PL_reg_curpm, which
10130                will try to free it.  */
10131             av_push(PL_regex_padav, repointer);
10132             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10133             PL_regex_pad = AvARRAY(PL_regex_padav);
10134         }
10135 #endif
10136     }
10137     SET_reg_curpm(reginfo->prog);
10138     eval_state->curpm = PL_curpm;
10139     PL_curpm_under = PL_curpm;
10140     PL_curpm = PL_reg_curpm;
10141     if (RXp_MATCH_COPIED(rex)) {
10142         /*  Here is a serious problem: we cannot rewrite subbeg,
10143             since it may be needed if this match fails.  Thus
10144             $` inside (?{}) could fail... */
10145         eval_state->subbeg     = rex->subbeg;
10146         eval_state->sublen     = rex->sublen;
10147         eval_state->suboffset  = rex->suboffset;
10148         eval_state->subcoffset = rex->subcoffset;
10149 #ifdef PERL_ANY_COW
10150         eval_state->saved_copy = rex->saved_copy;
10151 #endif
10152         RXp_MATCH_COPIED_off(rex);
10153     }
10154     else
10155         eval_state->subbeg = NULL;
10156     rex->subbeg = (char *)reginfo->strbeg;
10157     rex->suboffset = 0;
10158     rex->subcoffset = 0;
10159     rex->sublen = reginfo->strend - reginfo->strbeg;
10160 }
10161
10162
10163 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10164
10165 static void
10166 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10167 {
10168     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10169     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10170     regmatch_slab *s;
10171
10172     Safefree(aux->poscache);
10173
10174     if (eval_state) {
10175
10176         /* undo the effects of S_setup_eval_state() */
10177
10178         if (eval_state->subbeg) {
10179             regexp * const rex = eval_state->rex;
10180             rex->subbeg     = eval_state->subbeg;
10181             rex->sublen     = eval_state->sublen;
10182             rex->suboffset  = eval_state->suboffset;
10183             rex->subcoffset = eval_state->subcoffset;
10184 #ifdef PERL_ANY_COW
10185             rex->saved_copy = eval_state->saved_copy;
10186 #endif
10187             RXp_MATCH_COPIED_on(rex);
10188         }
10189         if (eval_state->pos_magic)
10190         {
10191             eval_state->pos_magic->mg_len = eval_state->pos;
10192             eval_state->pos_magic->mg_flags =
10193                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10194                | (eval_state->pos_flags & MGf_BYTES);
10195         }
10196
10197         PL_curpm = eval_state->curpm;
10198     }
10199
10200     PL_regmatch_state = aux->old_regmatch_state;
10201     PL_regmatch_slab  = aux->old_regmatch_slab;
10202
10203     /* free all slabs above current one - this must be the last action
10204      * of this function, as aux and eval_state are allocated within
10205      * slabs and may be freed here */
10206
10207     s = PL_regmatch_slab->next;
10208     if (s) {
10209         PL_regmatch_slab->next = NULL;
10210         while (s) {
10211             regmatch_slab * const osl = s;
10212             s = s->next;
10213             Safefree(osl);
10214         }
10215     }
10216 }
10217
10218
10219 STATIC void
10220 S_to_utf8_substr(pTHX_ regexp *prog)
10221 {
10222     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10223      * on the converted value */
10224
10225     int i = 1;
10226
10227     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10228
10229     do {
10230         if (prog->substrs->data[i].substr
10231             && !prog->substrs->data[i].utf8_substr) {
10232             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10233             prog->substrs->data[i].utf8_substr = sv;
10234             sv_utf8_upgrade(sv);
10235             if (SvVALID(prog->substrs->data[i].substr)) {
10236                 if (SvTAIL(prog->substrs->data[i].substr)) {
10237                     /* Trim the trailing \n that fbm_compile added last
10238                        time.  */
10239                     SvCUR_set(sv, SvCUR(sv) - 1);
10240                     /* Whilst this makes the SV technically "invalid" (as its
10241                        buffer is no longer followed by "\0") when fbm_compile()
10242                        adds the "\n" back, a "\0" is restored.  */
10243                     fbm_compile(sv, FBMcf_TAIL);
10244                 } else
10245                     fbm_compile(sv, 0);
10246             }
10247             if (prog->substrs->data[i].substr == prog->check_substr)
10248                 prog->check_utf8 = sv;
10249         }
10250     } while (i--);
10251 }
10252
10253 STATIC bool
10254 S_to_byte_substr(pTHX_ regexp *prog)
10255 {
10256     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10257      * on the converted value; returns FALSE if can't be converted. */
10258
10259     int i = 1;
10260
10261     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10262
10263     do {
10264         if (prog->substrs->data[i].utf8_substr
10265             && !prog->substrs->data[i].substr) {
10266             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10267             if (! sv_utf8_downgrade(sv, TRUE)) {
10268                 return FALSE;
10269             }
10270             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10271                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10272                     /* Trim the trailing \n that fbm_compile added last
10273                         time.  */
10274                     SvCUR_set(sv, SvCUR(sv) - 1);
10275                     fbm_compile(sv, FBMcf_TAIL);
10276                 } else
10277                     fbm_compile(sv, 0);
10278             }
10279             prog->substrs->data[i].substr = sv;
10280             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10281                 prog->check_substr = sv;
10282         }
10283     } while (i--);
10284
10285     return TRUE;
10286 }
10287
10288 #ifndef PERL_IN_XSUB_RE
10289
10290 bool
10291 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10292 {
10293     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10294      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10295      * the larger string bounded by 'strbeg' and 'strend'.
10296      *
10297      * 'cp' needs to be assigned (if not a future version of the Unicode
10298      * Standard could make it something that combines with adjacent characters,
10299      * so code using it would then break), and there has to be a GCB break
10300      * before and after the character. */
10301
10302     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10303     const U8 * prev_cp_start;
10304
10305     PERL_ARGS_ASSERT__IS_GRAPHEME;
10306
10307     /* Unassigned code points are forbidden */
10308     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10309                                     _invlist_search(PL_Assigned_invlist, cp))))
10310     {
10311         return FALSE;
10312     }
10313
10314     cp_gcb_val = getGCB_VAL_CP(cp);
10315
10316     /* Find the GCB value of the previous code point in the input */
10317     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10318     if (UNLIKELY(prev_cp_start == s)) {
10319         prev_cp_gcb_val = GCB_EDGE;
10320     }
10321     else {
10322         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10323     }
10324
10325     /* And check that is a grapheme boundary */
10326     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10327                 TRUE /* is UTF-8 encoded */ ))
10328     {
10329         return FALSE;
10330     }
10331
10332     /* Similarly verify there is a break between the current character and the
10333      * following one */
10334     s += UTF8SKIP(s);
10335     if (s >= strend) {
10336         next_cp_gcb_val = GCB_EDGE;
10337     }
10338     else {
10339         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10340     }
10341
10342     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10343 }
10344
10345 /*
10346 =head1 Unicode Support
10347
10348 =for apidoc isSCRIPT_RUN
10349
10350 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10351 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10352 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10353 two degenerate cases given below, this function returns TRUE iff all code
10354 points in it come from any combination of three "scripts" given by the Unicode
10355 "Script Extensions" property: Common, Inherited, and possibly one other.
10356 Additionally all decimal digits must come from the same consecutive sequence of
10357 10.
10358
10359 For example, if all the characters in the sequence are Greek, or Common, or
10360 Inherited, this function will return TRUE, provided any decimal digits in it
10361 are the ASCII digits "0".."9".  For scripts (unlike Greek) that have their own
10362 digits defined this will accept either digits from that set or from 0..9, but
10363 not a combination of the two.  Some scripts, such as Arabic, have more than one
10364 set of digits.  All digits must come from the same set for this function to
10365 return TRUE.
10366
10367 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10368 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10369 C<SCX_INVALID> if the function returns FALSE.
10370
10371 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10372 will be C<SCX_INVALID>.
10373
10374 If the sequence contains a single code point which is unassigned to a character
10375 in the version of Unicode being used, the function will return TRUE, and the
10376 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10377 in the input sequence will result in the function treating the input as not
10378 being a script run.
10379
10380 The returned script will be C<SCX_Inherited> iff all the code points in it are
10381 from the Inherited script.
10382
10383 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10384 it are from the Inherited or Common scripts.
10385
10386 =cut
10387
10388 */
10389
10390 bool
10391 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10392 {
10393     /* Basically, it looks at each character in the sequence to see if the
10394      * above conditions are met; if not it fails.  It uses an inversion map to
10395      * find the enum corresponding to the script of each character.  But this
10396      * is complicated by the fact that a few code points can be in any of
10397      * several scripts.  The data has been constructed so that there are
10398      * additional enum values (all negative) for these situations.  The
10399      * absolute value of those is an index into another table which contains
10400      * pointers to auxiliary tables for each such situation.  Each aux array
10401      * lists all the scripts for the given situation.  There is another,
10402      * parallel, table that gives the number of entries in each aux table.
10403      * These are all defined in charclass_invlists.h */
10404
10405     /* XXX Here are the additional things UTS 39 says could be done:
10406      * Mark Chinese strings as “mixed script” if they contain both simplified
10407      * (S) and traditional (T) Chinese characters, using the Unihan data in the
10408      * Unicode Character Database [UCD].  The criterion can only be applied if
10409      * the language of the string is known to be Chinese. So, for example, the
10410      * string “写真だけの結婚式 ” is Japanese, and should not be marked as
10411      * mixed script because of a mixture of S and T characters.  Testing for
10412      * whether a character is S or T needs to be based not on whether the
10413      * character has a S or T variant , but whether the character is an S or T
10414      * variant. khw notes that the sample contains a Hiragana character, and it
10415      * is unclear if absence of any foreign script marks the script as
10416      * "Chinese"
10417      *
10418      * Forbid sequences of the same nonspacing mark
10419      *
10420      * Check to see that all the characters are in the sets of exemplar
10421      * characters for at least one language in the Unicode Common Locale Data
10422      * Repository [CLDR]. */
10423
10424
10425     /* Things that match /\d/u */
10426     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10427     UV * decimals_array = invlist_array(decimals_invlist);
10428
10429     /* What code point is the digit '0' of the script run? */
10430     UV zero_of_run = 0;
10431     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10432     SCX_enum script_of_char = SCX_INVALID;
10433
10434     /* If the script remains not fully determined from iteration to iteration,
10435      * this is the current intersection of the possiblities.  */
10436     SCX_enum * intersection = NULL;
10437     PERL_UINT_FAST8_T intersection_len = 0;
10438
10439     bool retval = TRUE;
10440
10441     /* This is supposed to be a return parameter, but currently unused */
10442     SCX_enum * ret_script = NULL;
10443
10444     assert(send >= s);
10445
10446     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10447
10448     /* All code points in 0..255 are either Common or Latin, so must be a
10449      * script run.  We can special case it */
10450     if (! utf8_target && LIKELY(send > s)) {
10451         if (ret_script == NULL) {
10452             return TRUE;
10453         }
10454
10455         /* If any character is Latin, the run is Latin */
10456         while (s < send) {
10457             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10458                 *ret_script = SCX_Latin;
10459                 return TRUE;
10460             }
10461         }
10462
10463         /* If all are Common ... */
10464         *ret_script = SCX_Common;
10465         return TRUE;
10466     }
10467
10468     /* Look at each character in the sequence */
10469     while (s < send) {
10470         UV cp;
10471
10472         /* The code allows all scripts to use the ASCII digits.  This is
10473          * because they are used in commerce even in scripts that have their
10474          * own set.  Hence any ASCII ones found are ok, unless a digit from
10475          * another set has already been encountered.  (The other digit ranges
10476          * in Common are not similarly blessed) */
10477         if (UNLIKELY(isDIGIT(*s))) {
10478             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10479                 retval = FALSE;
10480                 break;
10481             }
10482             if (zero_of_run > 0) {
10483                 if (zero_of_run != '0') {
10484                     retval = FALSE;
10485                     break;
10486                 }
10487             }
10488             else {
10489                 zero_of_run = '0';
10490             }
10491             s++;
10492             continue;
10493         }
10494
10495         /* Here, isn't an ASCII digit.  Find the code point of the character */
10496         if (! UTF8_IS_INVARIANT(*s)) {
10497             Size_t len;
10498             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10499             s += len;
10500         }
10501         else {
10502             cp = *(s++);
10503         }
10504
10505         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10506          * digit in that script.  We can skip the rest of this code for this
10507          * character. */
10508         if (UNLIKELY(   zero_of_run > 0
10509                      && cp >= zero_of_run
10510                      && cp - zero_of_run <= 9))
10511         {
10512             continue;
10513         }
10514
10515         /* Find the character's script.  The correct values are hard-coded here
10516          * for small-enough code points. */
10517         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10518                                unlikely to change */
10519             if (       cp > 255
10520                 || (   isALPHA_L1(cp)
10521                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10522             {
10523                 script_of_char = SCX_Latin;
10524             }
10525             else {
10526                 script_of_char = SCX_Common;
10527             }
10528         }
10529         else {
10530             script_of_char = _Perl_SCX_invmap[
10531                                        _invlist_search(PL_SCX_invlist, cp)];
10532         }
10533
10534         /* We arbitrarily accept a single unassigned character, but not in
10535          * combination with anything else, and not a run of them. */
10536         if (   UNLIKELY(script_of_run == SCX_Unknown)
10537             || UNLIKELY(   script_of_run != SCX_INVALID
10538                         && script_of_char == SCX_Unknown))
10539         {
10540             retval = FALSE;
10541             break;
10542         }
10543
10544         /* For the first character, or the run is inherited, the run's script
10545          * is set to the char's */
10546         if (   UNLIKELY(script_of_run == SCX_INVALID)
10547             || UNLIKELY(script_of_run == SCX_Inherited))
10548         {
10549             script_of_run = script_of_char;
10550         }
10551
10552         /* For the character's script to be Unknown, it must be the first
10553          * character in the sequence (for otherwise a test above would have
10554          * prevented us from reaching here), and we have set the run's script
10555          * to it.  Nothing further to be done for this character */
10556         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10557             continue;
10558         }
10559
10560         /* We accept 'inherited' script characters currently even at the
10561          * beginning.  (We know that no characters in Inherited are digits, or
10562          * we'd have to check for that) */
10563         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10564             continue;
10565         }
10566
10567         /* If the run so far is Common, and the new character isn't, change the
10568          * run's script to that of this character */
10569         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10570
10571             /* But Common contains several sets of digits.  Only the '0' set
10572              * can be part of another script. */
10573             if (zero_of_run > 0 && zero_of_run != '0') {
10574                 retval = FALSE;
10575                 break;
10576             }
10577
10578             script_of_run = script_of_char;
10579         }
10580
10581         /* All decimal digits must be from the same sequence of 10.  Above, we
10582          * handled any ASCII digits without descending to here.  We also
10583          * handled the case where we already knew what digit sequence is the
10584          * one to use, and the character is in that sequence.  Now that we know
10585          * the script, we can use script_zeros[] to directly find which
10586          * sequence the script uses, except in a few cases it returns 0 */
10587         if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
10588             zero_of_run = script_zeros[script_of_char];
10589         }
10590
10591         /* Now we can see if the script of the character is the same as that of
10592          * the run */
10593         if (LIKELY(script_of_char == script_of_run)) {
10594             /* By far the most common case */
10595             goto scripts_match;
10596         }
10597
10598
10599         /* Here, the script of the run isn't Common.  But characters in Common
10600          * match any script */
10601         if (script_of_char == SCX_Common) {
10602             goto scripts_match;
10603         }
10604
10605 #ifndef HAS_SCX_AUX_TABLES
10606
10607         /* Too early a Unicode version to have a code point belonging to more
10608          * than one script, so, if the scripts don't exactly match, fail */
10609         retval = FALSE;
10610         break;
10611
10612 #else
10613
10614         /* Here there is no exact match between the character's script and the
10615          * run's.  And we've handled the special cases of scripts Unknown,
10616          * Inherited, and Common.
10617          *
10618          * Negative script numbers signify that the value may be any of several
10619          * scripts, and we need to look at auxiliary information to make our
10620          * deterimination.  But if both are non-negative, we can fail now */
10621         if (LIKELY(script_of_char >= 0)) {
10622             const SCX_enum * search_in;
10623             PERL_UINT_FAST8_T search_in_len;
10624             PERL_UINT_FAST8_T i;
10625
10626             if (LIKELY(script_of_run >= 0)) {
10627                 retval = FALSE;
10628                 break;
10629             }
10630
10631             /* Use the previously constructed set of possible scripts, if any.
10632              * */
10633             if (intersection) {
10634                 search_in = intersection;
10635                 search_in_len = intersection_len;
10636             }
10637             else {
10638                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10639                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10640             }
10641
10642             for (i = 0; i < search_in_len; i++) {
10643                 if (search_in[i] == script_of_char) {
10644                     script_of_run = script_of_char;
10645                     goto scripts_match;
10646                 }
10647             }
10648
10649             retval = FALSE;
10650             break;
10651         }
10652         else if (LIKELY(script_of_run >= 0)) {
10653             /* script of character could be one of several, but run is a single
10654              * script */
10655             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10656             const PERL_UINT_FAST8_T search_in_len
10657                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10658             PERL_UINT_FAST8_T i;
10659
10660             for (i = 0; i < search_in_len; i++) {
10661                 if (search_in[i] == script_of_run) {
10662                     script_of_char = script_of_run;
10663                     goto scripts_match;
10664                 }
10665             }
10666
10667             retval = FALSE;
10668             break;
10669         }
10670         else {
10671             /* Both run and char could be in one of several scripts.  If the
10672              * intersection is empty, then this character isn't in this script
10673              * run.  Otherwise, we need to calculate the intersection to use
10674              * for future iterations of the loop, unless we are already at the
10675              * final character */
10676             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10677             const PERL_UINT_FAST8_T char_len
10678                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10679             const SCX_enum * search_run;
10680             PERL_UINT_FAST8_T run_len;
10681
10682             SCX_enum * new_overlap = NULL;
10683             PERL_UINT_FAST8_T i, j;
10684
10685             if (intersection) {
10686                 search_run = intersection;
10687                 run_len = intersection_len;
10688             }
10689             else {
10690                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10691                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10692             }
10693
10694             intersection_len = 0;
10695
10696             for (i = 0; i < run_len; i++) {
10697                 for (j = 0; j < char_len; j++) {
10698                     if (search_run[i] == search_char[j]) {
10699
10700                         /* Here, the script at i,j matches.  That means this
10701                          * character is in the run.  But continue on to find
10702                          * the complete intersection, for the next loop
10703                          * iteration, and for the digit check after it.
10704                          *
10705                          * On the first found common script, we malloc space
10706                          * for the intersection list for the worst case of the
10707                          * intersection, which is the minimum of the number of
10708                          * scripts remaining in each set. */
10709                         if (intersection_len == 0) {
10710                             Newx(new_overlap,
10711                                  MIN(run_len - i, char_len - j),
10712                                  SCX_enum);
10713                         }
10714                         new_overlap[intersection_len++] = search_run[i];
10715                     }
10716                 }
10717             }
10718
10719             /* Here we've looked through everything.  If they have no scripts
10720              * in common, not a run */
10721             if (intersection_len == 0) {
10722                 retval = FALSE;
10723                 break;
10724             }
10725
10726             /* If there is only a single script in common, set to that.
10727              * Otherwise, use the intersection going forward */
10728             Safefree(intersection);
10729             if (intersection_len == 1) {
10730                 script_of_run = script_of_char = new_overlap[0];
10731                 Safefree(new_overlap);
10732             }
10733             else {
10734                 intersection = new_overlap;
10735             }
10736         }
10737
10738 #endif
10739
10740   scripts_match:
10741
10742         /* Here, the script of the character is compatible with that of the
10743          * run.  Either they match exactly, or one or both can be any of
10744          * several scripts, and the intersection is not empty.  If the
10745          * character is not a decimal digit, we are done with it.  Otherwise,
10746          * it could still fail if it is from a different set of 10 than seen
10747          * already (or we may not have seen any, and we need to set the
10748          * sequence).  If we have determined a single script and that script
10749          * only has one set of digits (almost all scripts are like that), then
10750          * this isn't a problem, as any digit must come from the same sequence.
10751          * The only scripts that have multiple sequences have been constructed
10752          * to be 0 in 'script_zeros[]'.
10753          *
10754          * Here we check if it is a digit. */
10755         if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
10756             && (   (          zero_of_run == 0
10757                     || (  (   script_of_char >= 0
10758                            && script_zeros[script_of_char] == 0)
10759                         ||    intersection))))
10760         {
10761             SSize_t range_zero_index;
10762             range_zero_index = _invlist_search(decimals_invlist, cp);
10763             if (   LIKELY(range_zero_index >= 0)
10764                 && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
10765             {
10766                 UV range_zero = decimals_array[range_zero_index];
10767                 if (zero_of_run) {
10768                     if (zero_of_run != range_zero) {
10769                         retval = FALSE;
10770                         break;
10771                     }
10772                 }
10773                 else {
10774                     zero_of_run = range_zero;
10775                 }
10776             }
10777         }
10778     } /* end of looping through CLOSESR text */
10779
10780     Safefree(intersection);
10781
10782     if (ret_script != NULL) {
10783         if (retval) {
10784             *ret_script = script_of_run;
10785         }
10786         else {
10787             *ret_script = SCX_INVALID;
10788         }
10789     }
10790
10791     return retval;
10792 }
10793
10794 #endif /* ifndef PERL_IN_XSUB_RE */
10795
10796 /*
10797  * ex: set ts=8 sts=4 sw=4 et:
10798  */