regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 /* Returns a boolean as to whether the input unsigned number is a power of 2
 100  * (2**0, 2**1, etc).  In other words if it has just a single bit set.
 101  * If not, subtracting 1 would leave the uppermost bit set, so the & would
 102  * yield non-zero */
 103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
 104
 105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 106     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 107     goto target;                                                         \
 108 } STMT_END
 109
 110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 111
 112 #ifndef STATIC
 113 #define STATIC  static
 114 #endif
 115
 116 /* Valid only if 'c', the character being looke-up, is an invariant under
 117  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 118  * everything matchable is straight forward in the bitmap */
 119 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 120                                 ? reginclass(prog,p,c,c+1,u)                \
 121                                 : ANYOF_BITMAP_TEST(p,*(c)))
 122
 123 /*
 124  * Forwards.
 125  */
 126
 127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 128
 129 #define HOPc(pos,off) \
 130         (char *)(reginfo->is_utf8_target \
 131             ? reghop3((U8*)pos, off, \
 132                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 133             : (U8*)(pos + off))
 134
 135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 136 #define HOPBACK3(pos, off, lim) \
 137         (reginfo->is_utf8_target                          \
 138             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 139             : (pos - off >= lim)                                 \
 140                 ? (U8*)pos - off                                 \
 141                 : NULL)
 142
 143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 144
 145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 147
 148 /* lim must be +ve. Returns NULL on overshoot */
 149 #define HOPMAYBE3(pos,off,lim) \
 150         (reginfo->is_utf8_target                        \
 151             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 152             : ((U8*)pos + off <= lim)                   \
 153                 ? (U8*)pos + off                        \
 154                 : NULL)
 155
 156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 157  * off must be >=0; args should be vars rather than expressions */
 158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 159     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 160     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 162
 163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 164     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 165     : (U8*)(pos + off))
 166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 167
 168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 169 #define NEXTCHR_IS_EOS (nextchr < 0)
 170
 171 #define SET_nextchr \
 172     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 173
 174 #define SET_locinput(p) \
 175     locinput = (p);  \
 176     SET_nextchr
 177
 178 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 179 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 180
 181 /* for use after a quantifier and before an EXACT-like node -- japhy */
 182 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 183  *
 184  * NOTE that *nothing* that affects backtracking should be in here, specifically
 185  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 186  * node that is in between two EXACT like nodes when ascertaining what the required
 187  * "follow" character is. This should probably be moved to regex compile time
 188  * although it may be done at run time beause of the REF possibility - more
 189  * investigation required. -- demerphq
 190 */
 191 #define JUMPABLE(rn) (                                                             \
 192     OP(rn) == OPEN ||                                                              \
 193     (OP(rn) == CLOSE &&                                                            \
 194      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 195     OP(rn) == EVAL ||                                                              \
 196     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 197     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 198     OP(rn) == KEEPS ||                                                             \
 199     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 200 )
 201 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 202
 203 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 204
 205 #if 0
 206 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 207    we don't need this definition.  XXX These are now out-of-sync*/
 208 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 209 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFAA || OP(rn)==EXACTFAA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 210 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 211
 212 #else
 213 /* ... so we use this as its faster. */
 214 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 215 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFAA || OP(rn) == EXACTFAA_NO_TRIE)
 216 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 217 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 218
 219 #endif
 220
 221 /*
 222   Search for mandatory following text node; for lookahead, the text must
 223   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 224 */
 225 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 226     while (JUMPABLE(rn)) { \
 227         const OPCODE type = OP(rn); \
 228         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 229             rn = NEXTOPER(NEXTOPER(rn)); \
 230         else if (type == PLUS) \
 231             rn = NEXTOPER(rn); \
 232         else if (type == IFMATCH) \
 233             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 234         else rn += NEXT_OFF(rn); \
 235     } \
 236 } STMT_END
 237
 238 #define SLAB_FIRST(s) (&(s)->states[0])
 239 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 240
 241 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 242 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 243 static regmatch_state * S_push_slab(pTHX);
 244
 245 #define REGCP_PAREN_ELEMS 3
 246 #define REGCP_OTHER_ELEMS 3
 247 #define REGCP_FRAME_ELEMS 1
 248 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 249  * are needed for the regexp context stack bookkeeping. */
 250
 251 STATIC CHECKPOINT
 252 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 253 {
 254     const int retval = PL_savestack_ix;
 255     const int paren_elems_to_push =
 256                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 257     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 258     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 259     I32 p;
 260     GET_RE_DEBUG_FLAGS_DECL;
 261
 262     PERL_ARGS_ASSERT_REGCPPUSH;
 263
 264     if (paren_elems_to_push < 0)
 265         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 266                    (int)paren_elems_to_push, (int)maxopenparen,
 267                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 268
 269     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 270         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 271                    " out of range (%lu-%ld)",
 272                    total_elems,
 273                    (unsigned long)maxopenparen,
 274                    (long)parenfloor);
 275
 276     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 277
 278     DEBUG_BUFFERS_r(
 279         if ((int)maxopenparen > (int)parenfloor)
 280             Perl_re_exec_indentf( aTHX_
 281                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 282                 depth,
 283                 PTR2UV(rex),
 284                 PTR2UV(rex->offs)
 285             );
 286     );
 287     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 288 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 289         SSPUSHIV(rex->offs[p].end);
 290         SSPUSHIV(rex->offs[p].start);
 291         SSPUSHINT(rex->offs[p].start_tmp);
 292         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 293             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 294             depth,
 295             (UV)p,
 296             (IV)rex->offs[p].start,
 297             (IV)rex->offs[p].start_tmp,
 298             (IV)rex->offs[p].end
 299         ));
 300     }
 301 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 302     SSPUSHINT(maxopenparen);
 303     SSPUSHINT(rex->lastparen);
 304     SSPUSHINT(rex->lastcloseparen);
 305     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 306
 307     return retval;
 308 }
 309
 310 /* These are needed since we do not localize EVAL nodes: */
 311 #define REGCP_SET(cp)                                           \
 312     DEBUG_STATE_r(                                              \
 313         Perl_re_exec_indentf( aTHX_                             \
 314             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 315             depth, (IV)PL_savestack_ix                          \
 316         )                                                       \
 317     );                                                          \
 318     cp = PL_savestack_ix
 319
 320 #define REGCP_UNWIND(cp)                                        \
 321     DEBUG_STATE_r(                                              \
 322         if (cp != PL_savestack_ix)                              \
 323             Perl_re_exec_indentf( aTHX_                         \
 324                 "Clearing an EVAL scope, savestack=%"           \
 325                 IVdf "..%" IVdf "\n",                           \
 326                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 327             )                                                   \
 328     );                                                          \
 329     regcpblow(cp)
 330
 331 /* set the start and end positions of capture ix */
 332 #define CLOSE_CAPTURE(ix, s, e)                                            \
 333     rex->offs[ix].start = s;                                               \
 334     rex->offs[ix].end = e;                                                 \
 335     if (ix > rex->lastparen)                                               \
 336         rex->lastparen = ix;                                               \
 337     rex->lastcloseparen = ix;                                              \
 338     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 339         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 340         depth,                                                             \
 341         PTR2UV(rex),                                                       \
 342         PTR2UV(rex->offs),                                                 \
 343         (UV)ix,                                                            \
 344         (IV)rex->offs[ix].start,                                           \
 345         (IV)rex->offs[ix].end,                                             \
 346         (UV)rex->lastparen                                                 \
 347     ))
 348
 349 #define UNWIND_PAREN(lp, lcp)               \
 350     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 351         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 352         depth,                              \
 353         PTR2UV(rex),                        \
 354         PTR2UV(rex->offs),                  \
 355         (UV)(lp),                           \
 356         (UV)(rex->lastparen),               \
 357         (UV)(lcp)                           \
 358     ));                                     \
 359     for (n = rex->lastparen; n > lp; n--)   \
 360         rex->offs[n].end = -1;              \
 361     rex->lastparen = n;                     \
 362     rex->lastcloseparen = lcp;
 363
 364
 365 STATIC void
 366 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 367 {
 368     UV i;
 369     U32 paren;
 370     GET_RE_DEBUG_FLAGS_DECL;
 371
 372     PERL_ARGS_ASSERT_REGCPPOP;
 373
 374     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 375     i = SSPOPUV;
 376     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 377     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 378     rex->lastcloseparen = SSPOPINT;
 379     rex->lastparen = SSPOPINT;
 380     *maxopenparen_p = SSPOPINT;
 381
 382     i -= REGCP_OTHER_ELEMS;
 383     /* Now restore the parentheses context. */
 384     DEBUG_BUFFERS_r(
 385         if (i || rex->lastparen + 1 <= rex->nparens)
 386             Perl_re_exec_indentf( aTHX_
 387                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 388                 depth,
 389                 PTR2UV(rex),
 390                 PTR2UV(rex->offs)
 391             );
 392     );
 393     paren = *maxopenparen_p;
 394     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 395         SSize_t tmps;
 396         rex->offs[paren].start_tmp = SSPOPINT;
 397         rex->offs[paren].start = SSPOPIV;
 398         tmps = SSPOPIV;
 399         if (paren <= rex->lastparen)
 400             rex->offs[paren].end = tmps;
 401         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 402             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 403             depth,
 404             (UV)paren,
 405             (IV)rex->offs[paren].start,
 406             (IV)rex->offs[paren].start_tmp,
 407             (IV)rex->offs[paren].end,
 408             (paren > rex->lastparen ? "(skipped)" : ""));
 409         );
 410         paren--;
 411     }
 412 #if 1
 413     /* It would seem that the similar code in regtry()
 414      * already takes care of this, and in fact it is in
 415      * a better location to since this code can #if 0-ed out
 416      * but the code in regtry() is needed or otherwise tests
 417      * requiring null fields (pat.t#187 and split.t#{13,14}
 418      * (as of patchlevel 7877)  will fail.  Then again,
 419      * this code seems to be necessary or otherwise
 420      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 421      * --jhi updated by dapm */
 422     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 423         if (i > *maxopenparen_p)
 424             rex->offs[i].start = -1;
 425         rex->offs[i].end = -1;
 426         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 427             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 428             depth,
 429             (UV)i,
 430             (i > *maxopenparen_p) ? "-1" : "  "
 431         ));
 432     }
 433 #endif
 434 }
 435
 436 /* restore the parens and associated vars at savestack position ix,
 437  * but without popping the stack */
 438
 439 STATIC void
 440 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 441 {
 442     I32 tmpix = PL_savestack_ix;
 443     PERL_ARGS_ASSERT_REGCP_RESTORE;
 444
 445     PL_savestack_ix = ix;
 446     regcppop(rex, maxopenparen_p);
 447     PL_savestack_ix = tmpix;
 448 }
 449
 450 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 451
 452 #ifndef PERL_IN_XSUB_RE
 453
 454 bool
 455 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 456 {
 457     /* Returns a boolean as to whether or not 'character' is a member of the
 458      * Posix character class given by 'classnum' that should be equivalent to a
 459      * value in the typedef '_char_class_number'.
 460      *
 461      * Ideally this could be replaced by a just an array of function pointers
 462      * to the C library functions that implement the macros this calls.
 463      * However, to compile, the precise function signatures are required, and
 464      * these may vary from platform to to platform.  To avoid having to figure
 465      * out what those all are on each platform, I (khw) am using this method,
 466      * which adds an extra layer of function call overhead (unless the C
 467      * optimizer strips it away).  But we don't particularly care about
 468      * performance with locales anyway. */
 469
 470     switch ((_char_class_number) classnum) {
 471         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 472         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 473         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 474         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 475         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 476                                         || isUPPER_LC(character);
 477         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 478         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 479         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 480         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 481         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 482         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 483         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 484         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 485         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 486         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 487         default:    /* VERTSPACE should never occur in locales */
 488             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 489     }
 490
 491     NOT_REACHED; /* NOTREACHED */
 492     return FALSE;
 493 }
 494
 495 #endif
 496
 497 STATIC bool
 498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 499 {
 500     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 501      * 'character' is a member of the Posix character class given by 'classnum'
 502      * that should be equivalent to a value in the typedef
 503      * '_char_class_number'.
 504      *
 505      * This just calls isFOO_lc on the code point for the character if it is in
 506      * the range 0-255.  Outside that range, all characters use Unicode
 507      * rules, ignoring any locale.  So use the Unicode function if this class
 508      * requires a swash, and use the Unicode macro otherwise. */
 509
 510     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 511
 512     if (UTF8_IS_INVARIANT(*character)) {
 513         return isFOO_lc(classnum, *character);
 514     }
 515     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 516         return isFOO_lc(classnum,
 517                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 518     }
 519
 520     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 521
 522     switch ((_char_class_number) classnum) {
 523         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 524         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 525         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 526         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 527         default:
 528             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 529                                         utf8_to_uvchr_buf(character, e, NULL));
 530     }
 531
 532     return FALSE; /* Things like CNTRL are always below 256 */
 533 }
 534
 535 STATIC char *
 536 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 537 {
 538     /* Returns the position of the first ASCII byte in the sequence between 's'
 539      * and 'send-1' inclusive; returns 'send' if none found */
 540
 541     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 542
 543 #ifndef EBCDIC
 544
 545     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 546
 547                             /* This term is wordsize if subword; 0 if not */
 548                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 549
 550                             /* 'offset' */
 551                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 552     {
 553
 554         /* Process per-byte until reach word boundary.  XXX This loop could be
 555          * eliminated if we knew that this platform had fast unaligned reads */
 556         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 557             if (isASCII(*s)) {
 558                 return s;
 559             }
 560             s++;    /* khw didn't bother creating a separate loop for
 561                        utf8_target */
 562         }
 563
 564         /* Here, we know we have at least one full word to process.  Process
 565          * per-word as long as we have at least a full word left */
 566         do {
 567             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 568             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 569
 570 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 571      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 572
 573                 s += _variant_byte_number(complemented);
 574                 return s;
 575
 576 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 577            checks. */
 578
 579                 break;
 580 #  endif
 581             }
 582
 583             s += PERL_WORDSIZE;
 584
 585         } while (s + PERL_WORDSIZE <= send);
 586     }
 587
 588 #endif
 589
 590     /* Process per-character */
 591     if (utf8_target) {
 592         while (s < send) {
 593             if (isASCII(*s)) {
 594                 return s;
 595             }
 596             s += UTF8SKIP(s);
 597         }
 598     }
 599     else {
 600         while (s < send) {
 601             if (isASCII(*s)) {
 602                 return s;
 603             }
 604             s++;
 605         }
 606     }
 607
 608     return s;
 609 }
 610
 611 STATIC char *
 612 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 613 {
 614     /* Returns the position of the first non-ASCII byte in the sequence between
 615      * 's' and 'send-1' inclusive; returns 'send' if none found */
 616
 617 #ifdef EBCDIC
 618
 619     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 620
 621     if (utf8_target) {
 622         while (s < send) {
 623             if ( ! isASCII(*s)) {
 624                 return s;
 625             }
 626             s += UTF8SKIP(s);
 627         }
 628     }
 629     else {
 630         while (s < send) {
 631             if ( ! isASCII(*s)) {
 632                 return s;
 633             }
 634             s++;
 635         }
 636     }
 637
 638     return s;
 639
 640 #else
 641
 642     const U8 * next_non_ascii = NULL;
 643
 644     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 645     PERL_UNUSED_ARG(utf8_target);
 646
 647     /* On ASCII platforms invariants and ASCII are identical, so if the string
 648      * is entirely invariants, there is no non-ASCII character */
 649     return (is_utf8_invariant_string_loc((U8 *) s,
 650                                          (STRLEN) (send - s),
 651                                          &next_non_ascii))
 652             ? (char *) send
 653             : (char *) next_non_ascii;
 654
 655 #endif
 656
 657 }
 658
 659 STATIC U8 *
 660 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 661 {
 662     /* Returns the position of the first byte in the sequence between 's' and
 663      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 664      * */
 665
 666     PERL_ARGS_ASSERT_FIND_SPAN_END;
 667
 668     assert(send >= s);
 669
 670     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 671                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 672                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 673     {
 674         PERL_UINTMAX_T span_word;
 675
 676         /* Process per-byte until reach word boundary.  XXX This loop could be
 677          * eliminated if we knew that this platform had fast unaligned reads */
 678         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 679             if (*s != span_byte) {
 680                 return s;
 681             }
 682             s++;
 683         }
 684
 685         /* Create a word filled with the bytes we are spanning */
 686         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 687
 688         /* Process per-word as long as we have at least a full word left */
 689         do {
 690
 691             /* Keep going if the whole word is composed of 'span_byte's */
 692             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 693                 s += PERL_WORDSIZE;
 694                 continue;
 695             }
 696
 697             /* Here, at least one byte in the word isn't 'span_byte'. */
 698
 699 #ifdef EBCDIC
 700
 701             break;
 702
 703 #else
 704
 705             /* This xor leaves 1 bits only in those non-matching bytes */
 706             span_word ^= * (PERL_UINTMAX_T *) s;
 707
 708             /* Make sure the upper bit of each non-matching byte is set.  This
 709              * makes each such byte look like an ASCII platform variant byte */
 710             span_word |= span_word << 1;
 711             span_word |= span_word << 2;
 712             span_word |= span_word << 4;
 713
 714             /* That reduces the problem to what this function solves */
 715             return s + _variant_byte_number(span_word);
 716
 717 #endif
 718
 719         } while (s + PERL_WORDSIZE <= send);
 720     }
 721
 722     /* Process the straggler bytes beyond the final word boundary */
 723     while (s < send) {
 724         if (*s != span_byte) {
 725             return s;
 726         }
 727         s++;
 728     }
 729
 730     return s;
 731 }
 732
 733 STATIC U8 *
 734 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 735 {
 736     /* Returns the position of the first byte in the sequence between 's'
 737      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 738      * returns 'send' if none found.  It uses word-level operations instead of
 739      * byte to speed up the process */
 740
 741     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 742
 743     assert(send >= s);
 744     assert((byte & mask) == byte);
 745
 746 #ifndef EBCDIC
 747
 748     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 749                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 750                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 751     {
 752         PERL_UINTMAX_T word, mask_word;
 753
 754         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 755             if (((*s) & mask) == byte) {
 756                 return s;
 757             }
 758             s++;
 759         }
 760
 761         word      = PERL_COUNT_MULTIPLIER * byte;
 762         mask_word = PERL_COUNT_MULTIPLIER * mask;
 763
 764         do {
 765             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 766
 767             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 768              * it, xoring with 'word' will leave each of the 8 bits in such
 769              * bytes be 0, and no byte containing any other bit pattern will be
 770              * 0. */
 771             masked ^= word;
 772
 773             /* This causes the most significant bit to be set to 1 for any
 774              * bytes in the word that aren't completely 0 */
 775             masked |= masked << 1;
 776             masked |= masked << 2;
 777             masked |= masked << 4;
 778
 779             /* The msbits are the same as what marks a byte as variant, so we
 780              * can use this mask.  If all msbits are 1, the word doesn't
 781              * contain 'byte' */
 782             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 783                 s += PERL_WORDSIZE;
 784                 continue;
 785             }
 786
 787             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 788              * and any that are, are 0.  Complement and re-AND to swap that */
 789             masked = ~ masked;
 790             masked &= PERL_VARIANTS_WORD_MASK;
 791
 792             /* This reduces the problem to that solved by this function */
 793             s += _variant_byte_number(masked);
 794             return s;
 795
 796         } while (s + PERL_WORDSIZE <= send);
 797     }
 798
 799 #endif
 800
 801     while (s < send) {
 802         if (((*s) & mask) == byte) {
 803             return s;
 804         }
 805         s++;
 806     }
 807
 808     return s;
 809 }
 810
 811 STATIC U8 *
 812 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 813 {
 814     /* Returns the position of the first byte in the sequence between 's' and
 815      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 816      * 'span_byte' should have been ANDed with 'mask' in the call of this
 817      * function.  Returns 'send' if none found.  Works like find_span_end(),
 818      * except for the AND */
 819
 820     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 821
 822     assert(send >= s);
 823     assert((span_byte & mask) == span_byte);
 824
 825     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 826                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 827                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 828     {
 829         PERL_UINTMAX_T span_word, mask_word;
 830
 831         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 832             if (((*s) & mask) != span_byte) {
 833                 return s;
 834             }
 835             s++;
 836         }
 837
 838         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 839         mask_word = PERL_COUNT_MULTIPLIER * mask;
 840
 841         do {
 842             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 843
 844             if (masked == span_word) {
 845                 s += PERL_WORDSIZE;
 846                 continue;
 847             }
 848
 849 #ifdef EBCDIC
 850
 851             break;
 852
 853 #else
 854
 855             masked ^= span_word;
 856             masked |= masked << 1;
 857             masked |= masked << 2;
 858             masked |= masked << 4;
 859             return s + _variant_byte_number(masked);
 860
 861 #endif
 862
 863         } while (s + PERL_WORDSIZE <= send);
 864     }
 865
 866     while (s < send) {
 867         if (((*s) & mask) != span_byte) {
 868             return s;
 869         }
 870         s++;
 871     }
 872
 873     return s;
 874 }
 875
 876 /*
 877  * pregexec and friends
 878  */
 879
 880 #ifndef PERL_IN_XSUB_RE
 881 /*
 882  - pregexec - match a regexp against a string
 883  */
 884 I32
 885 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 886          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 887 /* stringarg: the point in the string at which to begin matching */
 888 /* strend:    pointer to null at end of string */
 889 /* strbeg:    real beginning of string */
 890 /* minend:    end of match must be >= minend bytes after stringarg. */
 891 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 892  *            itself is accessed via the pointers above */
 893 /* nosave:    For optimizations. */
 894 {
 895     PERL_ARGS_ASSERT_PREGEXEC;
 896
 897     return
 898         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 899                       nosave ? 0 : REXEC_COPY_STR);
 900 }
 901 #endif
 902
 903
 904
 905 /* re_intuit_start():
 906  *
 907  * Based on some optimiser hints, try to find the earliest position in the
 908  * string where the regex could match.
 909  *
 910  *   rx:     the regex to match against
 911  *   sv:     the SV being matched: only used for utf8 flag; the string
 912  *           itself is accessed via the pointers below. Note that on
 913  *           something like an overloaded SV, SvPOK(sv) may be false
 914  *           and the string pointers may point to something unrelated to
 915  *           the SV itself.
 916  *   strbeg: real beginning of string
 917  *   strpos: the point in the string at which to begin matching
 918  *   strend: pointer to the byte following the last char of the string
 919  *   flags   currently unused; set to 0
 920  *   data:   currently unused; set to NULL
 921  *
 922  * The basic idea of re_intuit_start() is to use some known information
 923  * about the pattern, namely:
 924  *
 925  *   a) the longest known anchored substring (i.e. one that's at a
 926  *      constant offset from the beginning of the pattern; but not
 927  *      necessarily at a fixed offset from the beginning of the
 928  *      string);
 929  *   b) the longest floating substring (i.e. one that's not at a constant
 930  *      offset from the beginning of the pattern);
 931  *   c) Whether the pattern is anchored to the string; either
 932  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 933  *      or anchored to pos(): /\G/;
 934  *   d) A start class: a real or synthetic character class which
 935  *      represents which characters are legal at the start of the pattern;
 936  *
 937  * to either quickly reject the match, or to find the earliest position
 938  * within the string at which the pattern might match, thus avoiding
 939  * running the full NFA engine at those earlier locations, only to
 940  * eventually fail and retry further along.
 941  *
 942  * Returns NULL if the pattern can't match, or returns the address within
 943  * the string which is the earliest place the match could occur.
 944  *
 945  * The longest of the anchored and floating substrings is called 'check'
 946  * and is checked first. The other is called 'other' and is checked
 947  * second. The 'other' substring may not be present.  For example,
 948  *
 949  *    /(abc|xyz)ABC\d{0,3}DEFG/
 950  *
 951  * will have
 952  *
 953  *   check substr (float)    = "DEFG", offset 6..9 chars
 954  *   other substr (anchored) = "ABC",  offset 3..3 chars
 955  *   stclass = [ax]
 956  *
 957  * Be aware that during the course of this function, sometimes 'anchored'
 958  * refers to a substring being anchored relative to the start of the
 959  * pattern, and sometimes to the pattern itself being anchored relative to
 960  * the string. For example:
 961  *
 962  *   /\dabc/:   "abc" is anchored to the pattern;
 963  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 964  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 965  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 966  *                    but the pattern is anchored to the string.
 967  */
 968
 969 char *
 970 Perl_re_intuit_start(pTHX_
 971                     REGEXP * const rx,
 972                     SV *sv,
 973                     const char * const strbeg,
 974                     char *strpos,
 975                     char *strend,
 976                     const U32 flags,
 977                     re_scream_pos_data *data)
 978 {
 979     struct regexp *const prog = ReANY(rx);
 980     SSize_t start_shift = prog->check_offset_min;
 981     /* Should be nonnegative! */
 982     SSize_t end_shift   = 0;
 983     /* current lowest pos in string where the regex can start matching */
 984     char *rx_origin = strpos;
 985     SV *check;
 986     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 987     U8   other_ix = 1 - prog->substrs->check_ix;
 988     bool ml_anch = 0;
 989     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 990     char *check_at = NULL;              /* check substr found at this pos */
 991     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 992     RXi_GET_DECL(prog,progi);
 993     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 994     regmatch_info *const reginfo = &reginfo_buf;
 995     GET_RE_DEBUG_FLAGS_DECL;
 996
 997     PERL_ARGS_ASSERT_RE_INTUIT_START;
 998     PERL_UNUSED_ARG(flags);
 999     PERL_UNUSED_ARG(data);
1000
1001     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1002                 "Intuit: trying to determine minimum start position...\n"));
1003
1004     /* for now, assume that all substr offsets are positive. If at some point
1005      * in the future someone wants to do clever things with lookbehind and
1006      * -ve offsets, they'll need to fix up any code in this function
1007      * which uses these offsets. See the thread beginning
1008      * <20140113145929.GF27210@iabyn.com>
1009      */
1010     assert(prog->substrs->data[0].min_offset >= 0);
1011     assert(prog->substrs->data[0].max_offset >= 0);
1012     assert(prog->substrs->data[1].min_offset >= 0);
1013     assert(prog->substrs->data[1].max_offset >= 0);
1014     assert(prog->substrs->data[2].min_offset >= 0);
1015     assert(prog->substrs->data[2].max_offset >= 0);
1016
1017     /* for now, assume that if both present, that the floating substring
1018      * doesn't start before the anchored substring.
1019      * If you break this assumption (e.g. doing better optimisations
1020      * with lookahead/behind), then you'll need to audit the code in this
1021      * function carefully first
1022      */
1023     assert(
1024             ! (  (prog->anchored_utf8 || prog->anchored_substr)
1025               && (prog->float_utf8    || prog->float_substr))
1026            || (prog->float_min_offset >= prog->anchored_offset));
1027
1028     /* byte rather than char calculation for efficiency. It fails
1029      * to quickly reject some cases that can't match, but will reject
1030      * them later after doing full char arithmetic */
1031     if (prog->minlen > strend - strpos) {
1032         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033                               "  String too short...\n"));
1034         goto fail;
1035     }
1036
1037     RXp_MATCH_UTF8_set(prog, utf8_target);
1038     reginfo->is_utf8_target = cBOOL(utf8_target);
1039     reginfo->info_aux = NULL;
1040     reginfo->strbeg = strbeg;
1041     reginfo->strend = strend;
1042     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1043     reginfo->intuit = 1;
1044     /* not actually used within intuit, but zero for safety anyway */
1045     reginfo->poscache_maxiter = 0;
1046
1047     if (utf8_target) {
1048         if ((!prog->anchored_utf8 && prog->anchored_substr)
1049                 || (!prog->float_utf8 && prog->float_substr))
1050             to_utf8_substr(prog);
1051         check = prog->check_utf8;
1052     } else {
1053         if (!prog->check_substr && prog->check_utf8) {
1054             if (! to_byte_substr(prog)) {
1055                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1056             }
1057         }
1058         check = prog->check_substr;
1059     }
1060
1061     /* dump the various substring data */
1062     DEBUG_OPTIMISE_MORE_r({
1063         int i;
1064         for (i=0; i<=2; i++) {
1065             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1066                                   : prog->substrs->data[i].substr);
1067             if (!sv)
1068                 continue;
1069
1070             Perl_re_printf( aTHX_
1071                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1072                 " useful=%" IVdf " utf8=%d [%s]\n",
1073                 i,
1074                 (IV)prog->substrs->data[i].min_offset,
1075                 (IV)prog->substrs->data[i].max_offset,
1076                 (IV)prog->substrs->data[i].end_shift,
1077                 BmUSEFUL(sv),
1078                 utf8_target ? 1 : 0,
1079                 SvPEEK(sv));
1080         }
1081     });
1082
1083     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1084
1085         /* ml_anch: check after \n?
1086          *
1087          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1088          * with /.*.../, these flags will have been added by the
1089          * compiler:
1090          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1091          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1092          */
1093         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1094                    && !(prog->intflags & PREGf_IMPLICIT);
1095
1096         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1097             /* we are only allowed to match at BOS or \G */
1098
1099             /* trivially reject if there's a BOS anchor and we're not at BOS.
1100              *
1101              * Note that we don't try to do a similar quick reject for
1102              * \G, since generally the caller will have calculated strpos
1103              * based on pos() and gofs, so the string is already correctly
1104              * anchored by definition; and handling the exceptions would
1105              * be too fiddly (e.g. REXEC_IGNOREPOS).
1106              */
1107             if (   strpos != strbeg
1108                 && (prog->intflags & PREGf_ANCH_SBOL))
1109             {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111                                 "  Not at start...\n"));
1112                 goto fail;
1113             }
1114
1115             /* in the presence of an anchor, the anchored (relative to the
1116              * start of the regex) substr must also be anchored relative
1117              * to strpos. So quickly reject if substr isn't found there.
1118              * This works for \G too, because the caller will already have
1119              * subtracted gofs from pos, and gofs is the offset from the
1120              * \G to the start of the regex. For example, in /.abc\Gdef/,
1121              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1122              * caller will have set strpos=pos()-4; we look for the substr
1123              * at position pos()-4+1, which lines up with the "a" */
1124
1125             if (prog->check_offset_min == prog->check_offset_max) {
1126                 /* Substring at constant offset from beg-of-str... */
1127                 SSize_t slen = SvCUR(check);
1128                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1129
1130                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1131                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1132                     (IV)prog->check_offset_min));
1133
1134                 if (SvTAIL(check)) {
1135                     /* In this case, the regex is anchored at the end too.
1136                      * Unless it's a multiline match, the lengths must match
1137                      * exactly, give or take a \n.  NB: slen >= 1 since
1138                      * the last char of check is \n */
1139                     if (!multiline
1140                         && (   strend - s > slen
1141                             || strend - s < slen - 1
1142                             || (strend - s == slen && strend[-1] != '\n')))
1143                     {
1144                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145                                             "  String too long...\n"));
1146                         goto fail_finish;
1147                     }
1148                     /* Now should match s[0..slen-2] */
1149                     slen--;
1150                 }
1151                 if (slen && (strend - s < slen
1152                     || *SvPVX_const(check) != *s
1153                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1154                 {
1155                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1156                                     "  String not equal...\n"));
1157                     goto fail_finish;
1158                 }
1159
1160                 check_at = s;
1161                 goto success_at_start;
1162             }
1163         }
1164     }
1165
1166     end_shift = prog->check_end_shift;
1167
1168 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1169     if (end_shift < 0)
1170         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1171                    (IV)end_shift, RX_PRECOMP(rx));
1172 #endif
1173
1174   restart:
1175
1176     /* This is the (re)entry point of the main loop in this function.
1177      * The goal of this loop is to:
1178      * 1) find the "check" substring in the region rx_origin..strend
1179      *    (adjusted by start_shift / end_shift). If not found, reject
1180      *    immediately.
1181      * 2) If it exists, look for the "other" substr too if defined; for
1182      *    example, if the check substr maps to the anchored substr, then
1183      *    check the floating substr, and vice-versa. If not found, go
1184      *    back to (1) with rx_origin suitably incremented.
1185      * 3) If we find an rx_origin position that doesn't contradict
1186      *    either of the substrings, then check the possible additional
1187      *    constraints on rx_origin of /^.../m or a known start class.
1188      *    If these fail, then depending on which constraints fail, jump
1189      *    back to here, or to various other re-entry points further along
1190      *    that skip some of the first steps.
1191      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1192      *    substring. If the start position was determined to be at the
1193      *    beginning of the string  - so, not rejected, but not optimised,
1194      *    since we have to run regmatch from position 0 - decrement the
1195      *    BmUSEFUL() count. Otherwise increment it.
1196      */
1197
1198
1199     /* first, look for the 'check' substring */
1200
1201     {
1202         U8* start_point;
1203         U8* end_point;
1204
1205         DEBUG_OPTIMISE_MORE_r({
1206             Perl_re_printf( aTHX_
1207                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1208                 " Start shift: %" IVdf " End shift %" IVdf
1209                 " Real end Shift: %" IVdf "\n",
1210                 (IV)(rx_origin - strbeg),
1211                 (IV)prog->check_offset_min,
1212                 (IV)start_shift,
1213                 (IV)end_shift,
1214                 (IV)prog->check_end_shift);
1215         });
1216
1217         end_point = HOPBACK3(strend, end_shift, rx_origin);
1218         if (!end_point)
1219             goto fail_finish;
1220         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1221         if (!start_point)
1222             goto fail_finish;
1223
1224
1225         /* If the regex is absolutely anchored to either the start of the
1226          * string (SBOL) or to pos() (ANCH_GPOS), then
1227          * check_offset_max represents an upper bound on the string where
1228          * the substr could start. For the ANCH_GPOS case, we assume that
1229          * the caller of intuit will have already set strpos to
1230          * pos()-gofs, so in this case strpos + offset_max will still be
1231          * an upper bound on the substr.
1232          */
1233         if (!ml_anch
1234             && prog->intflags & PREGf_ANCH
1235             && prog->check_offset_max != SSize_t_MAX)
1236         {
1237             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1238             const char * const anchor =
1239                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1240             SSize_t targ_len = (char*)end_point - anchor;
1241
1242             if (check_len > targ_len) {
1243                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1244                               "Target string too short to match required substring...\n"));
1245                 goto fail_finish;
1246             }
1247
1248             /* do a bytes rather than chars comparison. It's conservative;
1249              * so it skips doing the HOP if the result can't possibly end
1250              * up earlier than the old value of end_point.
1251              */
1252             assert(anchor + check_len <= (char *)end_point);
1253             if (prog->check_offset_max + check_len < targ_len) {
1254                 end_point = HOP3lim((U8*)anchor,
1255                                 prog->check_offset_max,
1256                                 end_point - check_len
1257                             )
1258                             + check_len;
1259                 if (end_point < start_point)
1260                     goto fail_finish;
1261             }
1262         }
1263
1264         check_at = fbm_instr( start_point, end_point,
1265                       check, multiline ? FBMrf_MULTILINE : 0);
1266
1267         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1268             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1269             (IV)((char*)start_point - strbeg),
1270             (IV)((char*)end_point   - strbeg),
1271             (IV)(check_at ? check_at - strbeg : -1)
1272         ));
1273
1274         /* Update the count-of-usability, remove useless subpatterns,
1275             unshift s.  */
1276
1277         DEBUG_EXECUTE_r({
1278             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1279                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1280             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1281                               (check_at ? "Found" : "Did not find"),
1282                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1283                     ? "anchored" : "floating"),
1284                 quoted,
1285                 RE_SV_TAIL(check),
1286                 (check_at ? " at offset " : "...\n") );
1287         });
1288
1289         if (!check_at)
1290             goto fail_finish;
1291         /* set rx_origin to the minimum position where the regex could start
1292          * matching, given the constraint of the just-matched check substring.
1293          * But don't set it lower than previously.
1294          */
1295
1296         if (check_at - rx_origin > prog->check_offset_max)
1297             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1298         /* Finish the diagnostic message */
1299         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1300             "%ld (rx_origin now %" IVdf ")...\n",
1301             (long)(check_at - strbeg),
1302             (IV)(rx_origin - strbeg)
1303         ));
1304     }
1305
1306
1307     /* now look for the 'other' substring if defined */
1308
1309     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1310                     : prog->substrs->data[other_ix].substr)
1311     {
1312         /* Take into account the "other" substring. */
1313         char *last, *last1;
1314         char *s;
1315         SV* must;
1316         struct reg_substr_datum *other;
1317
1318       do_other_substr:
1319         other = &prog->substrs->data[other_ix];
1320
1321         /* if "other" is anchored:
1322          * we've previously found a floating substr starting at check_at.
1323          * This means that the regex origin must lie somewhere
1324          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1325          * and max:                 HOP3(check_at, -check_offset_min)
1326          * (except that min will be >= strpos)
1327          * So the fixed  substr must lie somewhere between
1328          *  HOP3(min, anchored_offset)
1329          *  HOP3(max, anchored_offset) + SvCUR(substr)
1330          */
1331
1332         /* if "other" is floating
1333          * Calculate last1, the absolute latest point where the
1334          * floating substr could start in the string, ignoring any
1335          * constraints from the earlier fixed match. It is calculated
1336          * as follows:
1337          *
1338          * strend - prog->minlen (in chars) is the absolute latest
1339          * position within the string where the origin of the regex
1340          * could appear. The latest start point for the floating
1341          * substr is float_min_offset(*) on from the start of the
1342          * regex.  last1 simply combines thee two offsets.
1343          *
1344          * (*) You might think the latest start point should be
1345          * float_max_offset from the regex origin, and technically
1346          * you'd be correct. However, consider
1347          *    /a\d{2,4}bcd\w/
1348          * Here, float min, max are 3,5 and minlen is 7.
1349          * This can match either
1350          *    /a\d\dbcd\w/
1351          *    /a\d\d\dbcd\w/
1352          *    /a\d\d\d\dbcd\w/
1353          * In the first case, the regex matches minlen chars; in the
1354          * second, minlen+1, in the third, minlen+2.
1355          * In the first case, the floating offset is 3 (which equals
1356          * float_min), in the second, 4, and in the third, 5 (which
1357          * equals float_max). In all cases, the floating string bcd
1358          * can never start more than 4 chars from the end of the
1359          * string, which equals minlen - float_min. As the substring
1360          * starts to match more than float_min from the start of the
1361          * regex, it makes the regex match more than minlen chars,
1362          * and the two cancel each other out. So we can always use
1363          * float_min - minlen, rather than float_max - minlen for the
1364          * latest position in the string.
1365          *
1366          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1367          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1368          */
1369
1370         assert(prog->minlen >= other->min_offset);
1371         last1 = HOP3c(strend,
1372                         other->min_offset - prog->minlen, strbeg);
1373
1374         if (other_ix) {/* i.e. if (other-is-float) */
1375             /* last is the latest point where the floating substr could
1376              * start, *given* any constraints from the earlier fixed
1377              * match. This constraint is that the floating string starts
1378              * <= float_max_offset chars from the regex origin (rx_origin).
1379              * If this value is less than last1, use it instead.
1380              */
1381             assert(rx_origin <= last1);
1382             last =
1383                 /* this condition handles the offset==infinity case, and
1384                  * is a short-cut otherwise. Although it's comparing a
1385                  * byte offset to a char length, it does so in a safe way,
1386                  * since 1 char always occupies 1 or more bytes,
1387                  * so if a string range is  (last1 - rx_origin) bytes,
1388                  * it will be less than or equal to  (last1 - rx_origin)
1389                  * chars; meaning it errs towards doing the accurate HOP3
1390                  * rather than just using last1 as a short-cut */
1391                 (last1 - rx_origin) < other->max_offset
1392                     ? last1
1393                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1394         }
1395         else {
1396             assert(strpos + start_shift <= check_at);
1397             last = HOP4c(check_at, other->min_offset - start_shift,
1398                         strbeg, strend);
1399         }
1400
1401         s = HOP3c(rx_origin, other->min_offset, strend);
1402         if (s < other_last)     /* These positions already checked */
1403             s = other_last;
1404
1405         must = utf8_target ? other->utf8_substr : other->substr;
1406         assert(SvPOK(must));
1407         {
1408             char *from = s;
1409             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1410
1411             if (to > strend)
1412                 to = strend;
1413             if (from > to) {
1414                 s = NULL;
1415                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1416                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1417                     (IV)(from - strbeg),
1418                     (IV)(to   - strbeg)
1419                 ));
1420             }
1421             else {
1422                 s = fbm_instr(
1423                     (unsigned char*)from,
1424                     (unsigned char*)to,
1425                     must,
1426                     multiline ? FBMrf_MULTILINE : 0
1427                 );
1428                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1430                     (IV)(from - strbeg),
1431                     (IV)(to   - strbeg),
1432                     (IV)(s ? s - strbeg : -1)
1433                 ));
1434             }
1435         }
1436
1437         DEBUG_EXECUTE_r({
1438             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1439                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1440             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1441                 s ? "Found" : "Contradicts",
1442                 other_ix ? "floating" : "anchored",
1443                 quoted, RE_SV_TAIL(must));
1444         });
1445
1446
1447         if (!s) {
1448             /* last1 is latest possible substr location. If we didn't
1449              * find it before there, we never will */
1450             if (last >= last1) {
1451                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1452                                         "; giving up...\n"));
1453                 goto fail_finish;
1454             }
1455
1456             /* try to find the check substr again at a later
1457              * position. Maybe next time we'll find the "other" substr
1458              * in range too */
1459             other_last = HOP3c(last, 1, strend) /* highest failure */;
1460             rx_origin =
1461                 other_ix /* i.e. if other-is-float */
1462                     ? HOP3c(rx_origin, 1, strend)
1463                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1464             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1465                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1466                 (other_ix ? "floating" : "anchored"),
1467                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1468                 (IV)(rx_origin - strbeg)
1469             ));
1470             goto restart;
1471         }
1472         else {
1473             if (other_ix) { /* if (other-is-float) */
1474                 /* other_last is set to s, not s+1, since its possible for
1475                  * a floating substr to fail first time, then succeed
1476                  * second time at the same floating position; e.g.:
1477                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1478                  * The first time round, anchored and float match at
1479                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1480                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1481                  */
1482                 other_last = s;
1483             }
1484             else {
1485                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1486                 other_last = HOP3c(s, 1, strend);
1487             }
1488             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1489                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1490                   (long)(s - strbeg),
1491                 (IV)(rx_origin - strbeg)
1492               ));
1493
1494         }
1495     }
1496     else {
1497         DEBUG_OPTIMISE_MORE_r(
1498             Perl_re_printf( aTHX_
1499                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1500                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1501                 " strend:%" IVdf "\n",
1502                 (IV)prog->check_offset_min,
1503                 (IV)prog->check_offset_max,
1504                 (IV)(check_at-strbeg),
1505                 (IV)(rx_origin-strbeg),
1506                 (IV)(rx_origin-check_at),
1507                 (IV)(strend-strbeg)
1508             )
1509         );
1510     }
1511
1512   postprocess_substr_matches:
1513
1514     /* handle the extra constraint of /^.../m if present */
1515
1516     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1517         char *s;
1518
1519         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1520                         "  looking for /^/m anchor"));
1521
1522         /* we have failed the constraint of a \n before rx_origin.
1523          * Find the next \n, if any, even if it's beyond the current
1524          * anchored and/or floating substrings. Whether we should be
1525          * scanning ahead for the next \n or the next substr is debatable.
1526          * On the one hand you'd expect rare substrings to appear less
1527          * often than \n's. On the other hand, searching for \n means
1528          * we're effectively flipping between check_substr and "\n" on each
1529          * iteration as the current "rarest" string candidate, which
1530          * means for example that we'll quickly reject the whole string if
1531          * hasn't got a \n, rather than trying every substr position
1532          * first
1533          */
1534
1535         s = HOP3c(strend, - prog->minlen, strpos);
1536         if (s <= rx_origin ||
1537             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1538         {
1539             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1540                             "  Did not find /%s^%s/m...\n",
1541                             PL_colors[0], PL_colors[1]));
1542             goto fail_finish;
1543         }
1544
1545         /* earliest possible origin is 1 char after the \n.
1546          * (since *rx_origin == '\n', it's safe to ++ here rather than
1547          * HOP(rx_origin, 1)) */
1548         rx_origin++;
1549
1550         if (prog->substrs->check_ix == 0  /* check is anchored */
1551             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1552         {
1553             /* Position contradicts check-string; either because
1554              * check was anchored (and thus has no wiggle room),
1555              * or check was float and rx_origin is above the float range */
1556             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1557                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1558                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1559             goto restart;
1560         }
1561
1562         /* if we get here, the check substr must have been float,
1563          * is in range, and we may or may not have had an anchored
1564          * "other" substr which still contradicts */
1565         assert(prog->substrs->check_ix); /* check is float */
1566
1567         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1568             /* whoops, the anchored "other" substr exists, so we still
1569              * contradict. On the other hand, the float "check" substr
1570              * didn't contradict, so just retry the anchored "other"
1571              * substr */
1572             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1573                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1574                 PL_colors[0], PL_colors[1],
1575                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1576                 (IV)(rx_origin - strbeg)
1577             ));
1578             goto do_other_substr;
1579         }
1580
1581         /* success: we don't contradict the found floating substring
1582          * (and there's no anchored substr). */
1583         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1584             "  Found /%s^%s/m with rx_origin %ld...\n",
1585             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1586     }
1587     else {
1588         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1589             "  (multiline anchor test skipped)\n"));
1590     }
1591
1592   success_at_start:
1593
1594
1595     /* if we have a starting character class, then test that extra constraint.
1596      * (trie stclasses are too expensive to use here, we are better off to
1597      * leave it to regmatch itself) */
1598
1599     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1600         const U8* const str = (U8*)STRING(progi->regstclass);
1601
1602         /* XXX this value could be pre-computed */
1603         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1604                     ?  (reginfo->is_utf8_pat
1605                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1606                         : STR_LEN(progi->regstclass))
1607                     : 1);
1608         char * endpos;
1609         char *s;
1610         /* latest pos that a matching float substr constrains rx start to */
1611         char *rx_max_float = NULL;
1612
1613         /* if the current rx_origin is anchored, either by satisfying an
1614          * anchored substring constraint, or a /^.../m constraint, then we
1615          * can reject the current origin if the start class isn't found
1616          * at the current position. If we have a float-only match, then
1617          * rx_origin is constrained to a range; so look for the start class
1618          * in that range. if neither, then look for the start class in the
1619          * whole rest of the string */
1620
1621         /* XXX DAPM it's not clear what the minlen test is for, and why
1622          * it's not used in the floating case. Nothing in the test suite
1623          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1624          * Here are some old comments, which may or may not be correct:
1625          *
1626          *   minlen == 0 is possible if regstclass is \b or \B,
1627          *   and the fixed substr is ''$.
1628          *   Since minlen is already taken into account, rx_origin+1 is
1629          *   before strend; accidentally, minlen >= 1 guaranties no false
1630          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1631          *   0) below assumes that regstclass does not come from lookahead...
1632          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1633          *   This leaves EXACTF-ish only, which are dealt with in
1634          *   find_byclass().
1635          */
1636
1637         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1638             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1639         else if (prog->float_substr || prog->float_utf8) {
1640             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1641             endpos = HOP3clim(rx_max_float, cl_l, strend);
1642         }
1643         else
1644             endpos= strend;
1645
1646         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1647             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1648             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1649               (IV)start_shift, (IV)(check_at - strbeg),
1650               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1651
1652         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1653                             reginfo);
1654         if (!s) {
1655             if (endpos == strend) {
1656                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1657                                 "  Could not match STCLASS...\n") );
1658                 goto fail;
1659             }
1660             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1661                                "  This position contradicts STCLASS...\n") );
1662             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1663                         && !(prog->intflags & PREGf_IMPLICIT))
1664                 goto fail;
1665
1666             /* Contradict one of substrings */
1667             if (prog->anchored_substr || prog->anchored_utf8) {
1668                 if (prog->substrs->check_ix == 1) { /* check is float */
1669                     /* Have both, check_string is floating */
1670                     assert(rx_origin + start_shift <= check_at);
1671                     if (rx_origin + start_shift != check_at) {
1672                         /* not at latest position float substr could match:
1673                          * Recheck anchored substring, but not floating.
1674                          * The condition above is in bytes rather than
1675                          * chars for efficiency. It's conservative, in
1676                          * that it errs on the side of doing 'goto
1677                          * do_other_substr'. In this case, at worst,
1678                          * an extra anchored search may get done, but in
1679                          * practice the extra fbm_instr() is likely to
1680                          * get skipped anyway. */
1681                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1682                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1683                             (long)(other_last - strbeg),
1684                             (IV)(rx_origin - strbeg)
1685                         ));
1686                         goto do_other_substr;
1687                     }
1688                 }
1689             }
1690             else {
1691                 /* float-only */
1692
1693                 if (ml_anch) {
1694                     /* In the presence of ml_anch, we might be able to
1695                      * find another \n without breaking the current float
1696                      * constraint. */
1697
1698                     /* strictly speaking this should be HOP3c(..., 1, ...),
1699                      * but since we goto a block of code that's going to
1700                      * search for the next \n if any, its safe here */
1701                     rx_origin++;
1702                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1703                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1704                               PL_colors[0], PL_colors[1],
1705                               (long)(rx_origin - strbeg)) );
1706                     goto postprocess_substr_matches;
1707                 }
1708
1709                 /* strictly speaking this can never be true; but might
1710                  * be if we ever allow intuit without substrings */
1711                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1712                     goto fail;
1713
1714                 rx_origin = rx_max_float;
1715             }
1716
1717             /* at this point, any matching substrings have been
1718              * contradicted. Start again... */
1719
1720             rx_origin = HOP3c(rx_origin, 1, strend);
1721
1722             /* uses bytes rather than char calculations for efficiency.
1723              * It's conservative: it errs on the side of doing 'goto restart',
1724              * where there is code that does a proper char-based test */
1725             if (rx_origin + start_shift + end_shift > strend) {
1726                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1727                                        "  Could not match STCLASS...\n") );
1728                 goto fail;
1729             }
1730             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1731                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1732                 (prog->substrs->check_ix ? "floating" : "anchored"),
1733                 (long)(rx_origin + start_shift - strbeg),
1734                 (IV)(rx_origin - strbeg)
1735             ));
1736             goto restart;
1737         }
1738
1739         /* Success !!! */
1740
1741         if (rx_origin != s) {
1742             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1743                         "  By STCLASS: moving %ld --> %ld\n",
1744                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1745                    );
1746         }
1747         else {
1748             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1749                                   "  Does not contradict STCLASS...\n");
1750                    );
1751         }
1752     }
1753
1754     /* Decide whether using the substrings helped */
1755
1756     if (rx_origin != strpos) {
1757         /* Fixed substring is found far enough so that the match
1758            cannot start at strpos. */
1759
1760         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1761         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1762     }
1763     else {
1764         /* The found rx_origin position does not prohibit matching at
1765          * strpos, so calling intuit didn't gain us anything. Decrement
1766          * the BmUSEFUL() count on the check substring, and if we reach
1767          * zero, free it.  */
1768         if (!(prog->intflags & PREGf_NAUGHTY)
1769             && (utf8_target ? (
1770                 prog->check_utf8                /* Could be deleted already */
1771                 && --BmUSEFUL(prog->check_utf8) < 0
1772                 && (prog->check_utf8 == prog->float_utf8)
1773             ) : (
1774                 prog->check_substr              /* Could be deleted already */
1775                 && --BmUSEFUL(prog->check_substr) < 0
1776                 && (prog->check_substr == prog->float_substr)
1777             )))
1778         {
1779             /* If flags & SOMETHING - do not do it many times on the same match */
1780             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1781             /* XXX Does the destruction order has to change with utf8_target? */
1782             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1783             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1784             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1785             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1786             check = NULL;                       /* abort */
1787             /* XXXX This is a remnant of the old implementation.  It
1788                     looks wasteful, since now INTUIT can use many
1789                     other heuristics. */
1790             prog->extflags &= ~RXf_USE_INTUIT;
1791         }
1792     }
1793
1794     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1795             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1796              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1797
1798     return rx_origin;
1799
1800   fail_finish:                          /* Substring not found */
1801     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1802         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1803   fail:
1804     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1805                           PL_colors[4], PL_colors[5]));
1806     return NULL;
1807 }
1808
1809
1810 #define DECL_TRIE_TYPE(scan) \
1811     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1812                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1813                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1814                     trie_type = ((scan->flags == EXACT)                             \
1815                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1816                                  : (scan->flags == EXACTL)                          \
1817                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1818                                     : (scan->flags == EXACTFAA)                     \
1819                                       ? (utf8_target                                \
1820                                          ? trie_utf8_exactfa_fold                   \
1821                                          : trie_latin_utf8_exactfa_fold)            \
1822                                       : (scan->flags == EXACTFLU8                   \
1823                                          ? (utf8_target                             \
1824                                            ? trie_flu8                              \
1825                                            : trie_flu8_latin)                       \
1826                                          : (utf8_target                             \
1827                                            ? trie_utf8_fold                         \
1828                                            : trie_latin_utf8_fold)))
1829
1830 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1831  * 'foldbuf+sizeof(foldbuf)' */
1832 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1833 STMT_START {                                                                        \
1834     STRLEN skiplen;                                                                 \
1835     U8 flags = FOLD_FLAGS_FULL;                                                     \
1836     switch (trie_type) {                                                            \
1837     case trie_flu8:                                                                 \
1838         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1839         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1840             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1841         }                                                                           \
1842         goto do_trie_utf8_fold;                                                     \
1843     case trie_utf8_exactfa_fold:                                                    \
1844         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1845         /* FALLTHROUGH */                                                           \
1846     case trie_utf8_fold:                                                            \
1847       do_trie_utf8_fold:                                                            \
1848         if ( foldlen>0 ) {                                                          \
1849             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1850             foldlen -= len;                                                         \
1851             uscan += len;                                                           \
1852             len=0;                                                                  \
1853         } else {                                                                    \
1854             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1855                                                                             flags); \
1856             len = UTF8SKIP(uc);                                                     \
1857             skiplen = UVCHR_SKIP( uvc );                                            \
1858             foldlen -= skiplen;                                                     \
1859             uscan = foldbuf + skiplen;                                              \
1860         }                                                                           \
1861         break;                                                                      \
1862     case trie_flu8_latin:                                                           \
1863         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1864         goto do_trie_latin_utf8_fold;                                               \
1865     case trie_latin_utf8_exactfa_fold:                                              \
1866         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1867         /* FALLTHROUGH */                                                           \
1868     case trie_latin_utf8_fold:                                                      \
1869       do_trie_latin_utf8_fold:                                                      \
1870         if ( foldlen>0 ) {                                                          \
1871             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1872             foldlen -= len;                                                         \
1873             uscan += len;                                                           \
1874             len=0;                                                                  \
1875         } else {                                                                    \
1876             len = 1;                                                                \
1877             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1878             skiplen = UVCHR_SKIP( uvc );                                            \
1879             foldlen -= skiplen;                                                     \
1880             uscan = foldbuf + skiplen;                                              \
1881         }                                                                           \
1882         break;                                                                      \
1883     case trie_utf8l:                                                                \
1884         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1885         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1886             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1887         }                                                                           \
1888         /* FALLTHROUGH */                                                           \
1889     case trie_utf8:                                                                 \
1890         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1891         break;                                                                      \
1892     case trie_plain:                                                                \
1893         uvc = (UV)*uc;                                                              \
1894         len = 1;                                                                    \
1895     }                                                                               \
1896     if (uvc < 256) {                                                                \
1897         charid = trie->charmap[ uvc ];                                              \
1898     }                                                                               \
1899     else {                                                                          \
1900         charid = 0;                                                                 \
1901         if (widecharmap) {                                                          \
1902             SV** const svpp = hv_fetch(widecharmap,                                 \
1903                         (char*)&uvc, sizeof(UV), 0);                                \
1904             if (svpp)                                                               \
1905                 charid = (U16)SvIV(*svpp);                                          \
1906         }                                                                           \
1907     }                                                                               \
1908 } STMT_END
1909
1910 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1911     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1912                 startpos, doutf8, depth)
1913
1914 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1915     STMT_START {                                            \
1916         while (s < strend) {                                \
1917             CODE                                            \
1918             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1919         }                                                   \
1920     } STMT_END
1921
1922 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1923     STMT_START {                                            \
1924         while (s < strend) {                                \
1925             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1926         }                                                   \
1927     } STMT_END
1928
1929 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1930     if (COND) {                                                \
1931         FBC_CHECK_AND_TRY                                      \
1932         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1933         previous_occurrence_end = s;                           \
1934     }                                                          \
1935     else {                                                     \
1936         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1937     }
1938
1939 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1940     if (utf8_target) {                                         \
1941         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1942     }                                                          \
1943     else {                                                     \
1944         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1945     }
1946
1947 /* We keep track of where the next character should start after an occurrence
1948  * of the one we're looking for.  Knowing that, we can see right away if the
1949  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1950  * don't accept the 2nd and succeeding adjacent occurrences */
1951 #define FBC_CHECK_AND_TRY                                      \
1952         if (   (   doevery                                     \
1953                 || s != previous_occurrence_end)               \
1954             && (reginfo->intuit || regtry(reginfo, &s)))       \
1955         {                                                      \
1956             goto got_it;                                       \
1957         }
1958
1959
1960 /* This differs from the above macros in that it calls a function which returns
1961  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1962  * there is no such occurrence. */
1963 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1964     while (s < strend) {                                    \
1965         s = (f);                                            \
1966         if (s >= strend) {                                  \
1967             break;                                          \
1968         }                                                   \
1969                                                             \
1970         FBC_CHECK_AND_TRY                                   \
1971         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1972         previous_occurrence_end = s;                        \
1973     }
1974
1975 /* The three macros below are slightly different versions of the same logic.
1976  *
1977  * The first is for /a and /aa when the target string is UTF-8.  This can only
1978  * match ascii, but it must advance based on UTF-8.   The other two handle the
1979  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1980  * for the boundary (or non-boundary) between a word and non-word character.
1981  * The utf8 and non-utf8 cases have the same logic, but the details must be
1982  * different.  Find the "wordness" of the character just prior to this one, and
1983  * compare it with the wordness of this one.  If they differ, we have a
1984  * boundary.  At the beginning of the string, pretend that the previous
1985  * character was a new-line.
1986  *
1987  * All these macros uncleanly have side-effects with each other and outside
1988  * variables.  So far it's been too much trouble to clean-up
1989  *
1990  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1991  *               a word character or not.
1992  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1993  *               word/non-word
1994  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1995  *
1996  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1997  * are looking for a boundary or for a non-boundary.  If we are looking for a
1998  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1999  * see if this tentative match actually works, and if so, to quit the loop
2000  * here.  And vice-versa if we are looking for a non-boundary.
2001  *
2002  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
2003  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
2004  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
2005  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2006  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2007  * complement.  But in that branch we complement tmp, meaning that at the
2008  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2009  * which means at the top of the loop in the next iteration, it is
2010  * TEST_NON_UTF8(s-1) */
2011 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
2012     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
2013     tmp = TEST_NON_UTF8(tmp);                                                  \
2014     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2015         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
2016             tmp = !tmp;                                                        \
2017             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
2018         }                                                                      \
2019         else {                                                                 \
2020             IF_FAIL;                                                           \
2021         }                                                                      \
2022     );                                                                         \
2023
2024 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2025  * TEST_UTF8 is a macro that for the same input code points returns identically
2026  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
2027 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
2028     if (s == reginfo->strbeg) {                                                \
2029         tmp = '\n';                                                            \
2030     }                                                                          \
2031     else { /* Back-up to the start of the previous character */                \
2032         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2033         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2034                                                        0, UTF8_ALLOW_DEFAULT); \
2035     }                                                                          \
2036     tmp = TEST_UV(tmp);                                                        \
2037     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2038         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2039             tmp = !tmp;                                                        \
2040             IF_SUCCESS;                                                        \
2041         }                                                                      \
2042         else {                                                                 \
2043             IF_FAIL;                                                           \
2044         }                                                                      \
2045     );
2046
2047 /* Like the above two macros.  UTF8_CODE is the complete code for handling
2048  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2049  * macros below */
2050 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
2051     if (utf8_target) {                                                         \
2052         UTF8_CODE                                                              \
2053     }                                                                          \
2054     else {  /* Not utf8 */                                                     \
2055         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
2056         tmp = TEST_NON_UTF8(tmp);                                              \
2057         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
2058             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
2059                 IF_SUCCESS;                                                    \
2060                 tmp = !tmp;                                                    \
2061             }                                                                  \
2062             else {                                                             \
2063                 IF_FAIL;                                                       \
2064             }                                                                  \
2065         );                                                                     \
2066     }                                                                          \
2067     /* Here, things have been set up by the previous code so that tmp is the   \
2068      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
2069      * utf8ness of the target).  We also have to check if this matches against \
2070      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
2071      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
2072      * string */                                                               \
2073     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2074         IF_SUCCESS;                                                            \
2075     }                                                                          \
2076     else {                                                                     \
2077         IF_FAIL;                                                               \
2078     }
2079
2080 /* This is the macro to use when we want to see if something that looks like it
2081  * could match, actually does, and if so exits the loop */
2082 #define REXEC_FBC_TRYIT                            \
2083     if ((reginfo->intuit || regtry(reginfo, &s)))  \
2084         goto got_it
2085
2086 /* The only difference between the BOUND and NBOUND cases is that
2087  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2088  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2089  * with the other one being empty (PLACEHOLDER is defined as empty).
2090  *
2091  * The TEST_FOO parameters are for operating on different forms of input, but
2092  * all should be ones that return identically for the same underlying code
2093  * points */
2094 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
2095     FBC_BOUND_COMMON(                                                          \
2096           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
2097           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2098
2099 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2100     FBC_BOUND_COMMON(                                                          \
2101             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2102             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2103
2104 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2105     FBC_BOUND_COMMON(                                                          \
2106           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2107           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2108
2109 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2110     FBC_BOUND_COMMON(                                                          \
2111             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2112             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2113
2114 #ifdef DEBUGGING
2115 static IV
2116 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2117   IV cp_out = _invlist_search(invlist, cp_in);
2118   assert(cp_out >= 0);
2119   return cp_out;
2120 }
2121 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2122         invmap[S_get_break_val_cp_checked(invlist, cp)]
2123 #else
2124 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2125         invmap[_invlist_search(invlist, cp)]
2126 #endif
2127
2128 /* Takes a pointer to an inversion list, a pointer to its corresponding
2129  * inversion map, and a code point, and returns the code point's value
2130  * according to the two arrays.  It assumes that all code points have a value.
2131  * This is used as the base macro for macros for particular properties */
2132 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2133         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2134
2135 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2136  * of a code point, returning the value for the first code point in the string.
2137  * And it takes the particular macro name that finds the desired value given a
2138  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2139 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2140              (__ASSERT_(pos < strend)                                          \
2141                  /* Note assumes is valid UTF-8 */                             \
2142              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2143
2144 /* Returns the GCB value for the input code point */
2145 #define getGCB_VAL_CP(cp)                                                      \
2146           _generic_GET_BREAK_VAL_CP(                                           \
2147                                     PL_GCB_invlist,                            \
2148                                     _Perl_GCB_invmap,                          \
2149                                     (cp))
2150
2151 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2152  * bounded by pos and strend */
2153 #define getGCB_VAL_UTF8(pos, strend)                                           \
2154     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2155
2156 /* Returns the LB value for the input code point */
2157 #define getLB_VAL_CP(cp)                                                       \
2158           _generic_GET_BREAK_VAL_CP(                                           \
2159                                     PL_LB_invlist,                             \
2160                                     _Perl_LB_invmap,                           \
2161                                     (cp))
2162
2163 /* Returns the LB value for the first code point in the UTF-8 encoded string
2164  * bounded by pos and strend */
2165 #define getLB_VAL_UTF8(pos, strend)                                            \
2166     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2167
2168
2169 /* Returns the SB value for the input code point */
2170 #define getSB_VAL_CP(cp)                                                       \
2171           _generic_GET_BREAK_VAL_CP(                                           \
2172                                     PL_SB_invlist,                             \
2173                                     _Perl_SB_invmap,                     \
2174                                     (cp))
2175
2176 /* Returns the SB value for the first code point in the UTF-8 encoded string
2177  * bounded by pos and strend */
2178 #define getSB_VAL_UTF8(pos, strend)                                            \
2179     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2180
2181 /* Returns the WB value for the input code point */
2182 #define getWB_VAL_CP(cp)                                                       \
2183           _generic_GET_BREAK_VAL_CP(                                           \
2184                                     PL_WB_invlist,                             \
2185                                     _Perl_WB_invmap,                         \
2186                                     (cp))
2187
2188 /* Returns the WB value for the first code point in the UTF-8 encoded string
2189  * bounded by pos and strend */
2190 #define getWB_VAL_UTF8(pos, strend)                                            \
2191     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2192
2193 /* We know what class REx starts with.  Try to find this position... */
2194 /* if reginfo->intuit, its a dryrun */
2195 /* annoyingly all the vars in this routine have different names from their counterparts
2196    in regmatch. /grrr */
2197 STATIC char *
2198 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2199     const char *strend, regmatch_info *reginfo)
2200 {
2201     dVAR;
2202
2203     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2204     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2205
2206     char *pat_string;   /* The pattern's exactish string */
2207     char *pat_end;          /* ptr to end char of pat_string */
2208     re_fold_t folder;   /* Function for computing non-utf8 folds */
2209     const U8 *fold_array;   /* array for folding ords < 256 */
2210     STRLEN ln;
2211     STRLEN lnc;
2212     U8 c1;
2213     U8 c2;
2214     char *e;
2215
2216     /* In some cases we accept only the first occurence of 'x' in a sequence of
2217      * them.  This variable points to just beyond the end of the previous
2218      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2219      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2220      * hop back.) */
2221     char * previous_occurrence_end = 0;
2222
2223     I32 tmp;            /* Scratch variable */
2224     const bool utf8_target = reginfo->is_utf8_target;
2225     UV utf8_fold_flags = 0;
2226     const bool is_utf8_pat = reginfo->is_utf8_pat;
2227     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2228                                    with a result inverts that result, as 0^1 =
2229                                    1 and 1^1 = 0 */
2230     _char_class_number classnum;
2231
2232     RXi_GET_DECL(prog,progi);
2233
2234     PERL_ARGS_ASSERT_FIND_BYCLASS;
2235
2236     /* We know what class it must start with. */
2237     switch (OP(c)) {
2238     case ANYOFPOSIXL:
2239     case ANYOFL:
2240         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2241
2242         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2243             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2244         }
2245
2246         /* FALLTHROUGH */
2247     case ANYOFD:
2248     case ANYOF:
2249         if (utf8_target) {
2250             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2251                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2252         }
2253         else if (ANYOF_FLAGS(c)) {
2254             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2255         }
2256         else {
2257             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2258         }
2259         break;
2260
2261     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2262         /* UTF-8ness doesn't matter, so use 0 */
2263         REXEC_FBC_FIND_NEXT_SCAN(0,
2264          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2265                                    (U8) ARG(c), FLAGS(c)));
2266         break;
2267
2268     case NANYOFM:
2269         REXEC_FBC_FIND_NEXT_SCAN(0,
2270          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2271                                    (U8) ARG(c), FLAGS(c)));
2272         break;
2273
2274     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2275         assert(! is_utf8_pat);
2276         /* FALLTHROUGH */
2277     case EXACTFAA:
2278         if (is_utf8_pat || utf8_target) {
2279             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2280             goto do_exactf_utf8;
2281         }
2282         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2283         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2284         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2285
2286     case EXACTF:   /* This node only generated for non-utf8 patterns */
2287         assert(! is_utf8_pat);
2288         if (utf8_target) {
2289             utf8_fold_flags = 0;
2290             goto do_exactf_utf8;
2291         }
2292         fold_array = PL_fold;
2293         folder = foldEQ;
2294         goto do_exactf_non_utf8;
2295
2296     case EXACTFL:
2297         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2298         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2299             utf8_fold_flags = FOLDEQ_LOCALE;
2300             goto do_exactf_utf8;
2301         }
2302         fold_array = PL_fold_locale;
2303         folder = foldEQ_locale;
2304         goto do_exactf_non_utf8;
2305
2306     case EXACTFU_SS:
2307         if (is_utf8_pat) {
2308             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2309         }
2310         goto do_exactf_utf8;
2311
2312     case EXACTFLU8:
2313             if (! utf8_target) {    /* All code points in this node require
2314                                        UTF-8 to express.  */
2315                 break;
2316             }
2317             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2318                                              | FOLDEQ_S2_FOLDS_SANE;
2319             goto do_exactf_utf8;
2320
2321     case EXACTFU:
2322         if (is_utf8_pat || utf8_target) {
2323             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2324             goto do_exactf_utf8;
2325         }
2326
2327         /* Any 'ss' in the pattern should have been replaced by regcomp,
2328          * so we don't have to worry here about this single special case
2329          * in the Latin1 range */
2330         fold_array = PL_fold_latin1;
2331         folder = foldEQ_latin1;
2332
2333         /* FALLTHROUGH */
2334
2335       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2336                            are no glitches with fold-length differences
2337                            between the target string and pattern */
2338
2339         /* The idea in the non-utf8 EXACTF* cases is to first find the
2340          * first character of the EXACTF* node and then, if necessary,
2341          * case-insensitively compare the full text of the node.  c1 is the
2342          * first character.  c2 is its fold.  This logic will not work for
2343          * Unicode semantics and the german sharp ss, which hence should
2344          * not be compiled into a node that gets here. */
2345         pat_string = STRING(c);
2346         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2347
2348         /* We know that we have to match at least 'ln' bytes (which is the
2349          * same as characters, since not utf8).  If we have to match 3
2350          * characters, and there are only 2 availabe, we know without
2351          * trying that it will fail; so don't start a match past the
2352          * required minimum number from the far end */
2353         e = HOP3c(strend, -((SSize_t)ln), s);
2354         if (e < s)
2355             break;
2356
2357         c1 = *pat_string;
2358         c2 = fold_array[c1];
2359         if (c1 == c2) { /* If char and fold are the same */
2360             while (s <= e) {
2361                 s = (char *) memchr(s, c1, e + 1 - s);
2362                 if (s == NULL) {
2363                     break;
2364                 }
2365
2366                 /* Check that the rest of the node matches */
2367                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2368                     && (reginfo->intuit || regtry(reginfo, &s)) )
2369                 {
2370                     goto got_it;
2371                 }
2372                 s++;
2373             }
2374         }
2375         else {
2376             U8 bits_differing = c1 ^ c2;
2377
2378             /* If the folds differ in one bit position only, we can mask to
2379              * match either of them, and can use this faster find method.  Both
2380              * ASCII and EBCDIC tend to have their case folds differ in only
2381              * one position, so this is very likely */
2382             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2383                 bits_differing = ~ bits_differing;
2384                 while (s <= e) {
2385                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2386                                         (c1 & bits_differing), bits_differing);
2387                     if (s > e) {
2388                         break;
2389                     }
2390
2391                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2392                         && (reginfo->intuit || regtry(reginfo, &s)) )
2393                     {
2394                         goto got_it;
2395                     }
2396                     s++;
2397                 }
2398             }
2399             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2400                        should actually happen only in EXACTFL nodes */
2401                 while (s <= e) {
2402                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2403                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2404                         && (reginfo->intuit || regtry(reginfo, &s)) )
2405                     {
2406                         goto got_it;
2407                     }
2408                     s++;
2409                 }
2410             }
2411         }
2412         break;
2413
2414       do_exactf_utf8:
2415       {
2416         unsigned expansion;
2417
2418         /* If one of the operands is in utf8, we can't use the simpler folding
2419          * above, due to the fact that many different characters can have the
2420          * same fold, or portion of a fold, or different- length fold */
2421         pat_string = STRING(c);
2422         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2423         pat_end = pat_string + ln;
2424         lnc = is_utf8_pat       /* length to match in characters */
2425                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2426                 : ln;
2427
2428         /* We have 'lnc' characters to match in the pattern, but because of
2429          * multi-character folding, each character in the target can match
2430          * up to 3 characters (Unicode guarantees it will never exceed
2431          * this) if it is utf8-encoded; and up to 2 if not (based on the
2432          * fact that the Latin 1 folds are already determined, and the
2433          * only multi-char fold in that range is the sharp-s folding to
2434          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2435          * string character.  Adjust lnc accordingly, rounding up, so that
2436          * if we need to match at least 4+1/3 chars, that really is 5. */
2437         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2438         lnc = (lnc + expansion - 1) / expansion;
2439
2440         /* As in the non-UTF8 case, if we have to match 3 characters, and
2441          * only 2 are left, it's guaranteed to fail, so don't start a
2442          * match that would require us to go beyond the end of the string
2443          */
2444         e = HOP3c(strend, -((SSize_t)lnc), s);
2445
2446         /* XXX Note that we could recalculate e to stop the loop earlier,
2447          * as the worst case expansion above will rarely be met, and as we
2448          * go along we would usually find that e moves further to the left.
2449          * This would happen only after we reached the point in the loop
2450          * where if there were no expansion we should fail.  Unclear if
2451          * worth the expense */
2452
2453         while (s <= e) {
2454             char *my_strend= (char *)strend;
2455             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2456                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2457                 && (reginfo->intuit || regtry(reginfo, &s)) )
2458             {
2459                 goto got_it;
2460             }
2461             s += (utf8_target) ? UTF8SKIP(s) : 1;
2462         }
2463         break;
2464     }
2465
2466     case BOUNDL:
2467         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2468         if (FLAGS(c) != TRADITIONAL_BOUND) {
2469             if (! IN_UTF8_CTYPE_LOCALE) {
2470                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2471                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2472             }
2473             goto do_boundu;
2474         }
2475
2476         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2477         break;
2478
2479     case NBOUNDL:
2480         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2481         if (FLAGS(c) != TRADITIONAL_BOUND) {
2482             if (! IN_UTF8_CTYPE_LOCALE) {
2483                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2484                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2485             }
2486             goto do_nboundu;
2487         }
2488
2489         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2490         break;
2491
2492     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2493                    meaning */
2494         assert(FLAGS(c) == TRADITIONAL_BOUND);
2495
2496         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2497         break;
2498
2499     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2500                    meaning */
2501         assert(FLAGS(c) == TRADITIONAL_BOUND);
2502
2503         FBC_BOUND_A(isWORDCHAR_A);
2504         break;
2505
2506     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2507                    meaning */
2508         assert(FLAGS(c) == TRADITIONAL_BOUND);
2509
2510         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2511         break;
2512
2513     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2514                    meaning */
2515         assert(FLAGS(c) == TRADITIONAL_BOUND);
2516
2517         FBC_NBOUND_A(isWORDCHAR_A);
2518         break;
2519
2520     case NBOUNDU:
2521         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2522             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2523             break;
2524         }
2525
2526       do_nboundu:
2527
2528         to_complement = 1;
2529         /* FALLTHROUGH */
2530
2531     case BOUNDU:
2532       do_boundu:
2533         switch((bound_type) FLAGS(c)) {
2534             case TRADITIONAL_BOUND:
2535                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2536                 break;
2537             case GCB_BOUND:
2538                 if (s == reginfo->strbeg) {
2539                     if (reginfo->intuit || regtry(reginfo, &s))
2540                     {
2541                         goto got_it;
2542                     }
2543
2544                     /* Didn't match.  Try at the next position (if there is one) */
2545                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2546                     if (UNLIKELY(s >= reginfo->strend)) {
2547                         break;
2548                     }
2549                 }
2550
2551                 if (utf8_target) {
2552                     GCB_enum before = getGCB_VAL_UTF8(
2553                                                reghop3((U8*)s, -1,
2554                                                        (U8*)(reginfo->strbeg)),
2555                                                (U8*) reginfo->strend);
2556                     while (s < strend) {
2557                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2558                                                         (U8*) reginfo->strend);
2559                         if (   (to_complement ^ isGCB(before,
2560                                                       after,
2561                                                       (U8*) reginfo->strbeg,
2562                                                       (U8*) s,
2563                                                       utf8_target))
2564                             && (reginfo->intuit || regtry(reginfo, &s)))
2565                         {
2566                             goto got_it;
2567                         }
2568                         before = after;
2569                         s += UTF8SKIP(s);
2570                     }
2571                 }
2572                 else {  /* Not utf8.  Everything is a GCB except between CR and
2573                            LF */
2574                     while (s < strend) {
2575                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2576                                               || UCHARAT(s) != '\n'))
2577                             && (reginfo->intuit || regtry(reginfo, &s)))
2578                         {
2579                             goto got_it;
2580                         }
2581                         s++;
2582                     }
2583                 }
2584
2585                 /* And, since this is a bound, it can match after the final
2586                  * character in the string */
2587                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2588                     goto got_it;
2589                 }
2590                 break;
2591
2592             case LB_BOUND:
2593                 if (s == reginfo->strbeg) {
2594                     if (reginfo->intuit || regtry(reginfo, &s)) {
2595                         goto got_it;
2596                     }
2597                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2598                     if (UNLIKELY(s >= reginfo->strend)) {
2599                         break;
2600                     }
2601                 }
2602
2603                 if (utf8_target) {
2604                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2605                                                                -1,
2606                                                                (U8*)(reginfo->strbeg)),
2607                                                        (U8*) reginfo->strend);
2608                     while (s < strend) {
2609                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2610                         if (to_complement ^ isLB(before,
2611                                                  after,
2612                                                  (U8*) reginfo->strbeg,
2613                                                  (U8*) s,
2614                                                  (U8*) reginfo->strend,
2615                                                  utf8_target)
2616                             && (reginfo->intuit || regtry(reginfo, &s)))
2617                         {
2618                             goto got_it;
2619                         }
2620                         before = after;
2621                         s += UTF8SKIP(s);
2622                     }
2623                 }
2624                 else {  /* Not utf8. */
2625                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2626                     while (s < strend) {
2627                         LB_enum after = getLB_VAL_CP((U8) *s);
2628                         if (to_complement ^ isLB(before,
2629                                                  after,
2630                                                  (U8*) reginfo->strbeg,
2631                                                  (U8*) s,
2632                                                  (U8*) reginfo->strend,
2633                                                  utf8_target)
2634                             && (reginfo->intuit || regtry(reginfo, &s)))
2635                         {
2636                             goto got_it;
2637                         }
2638                         before = after;
2639                         s++;
2640                     }
2641                 }
2642
2643                 if (reginfo->intuit || regtry(reginfo, &s)) {
2644                     goto got_it;
2645                 }
2646
2647                 break;
2648
2649             case SB_BOUND:
2650                 if (s == reginfo->strbeg) {
2651                     if (reginfo->intuit || regtry(reginfo, &s)) {
2652                         goto got_it;
2653                     }
2654                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2655                     if (UNLIKELY(s >= reginfo->strend)) {
2656                         break;
2657                     }
2658                 }
2659
2660                 if (utf8_target) {
2661                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2662                                                         -1,
2663                                                         (U8*)(reginfo->strbeg)),
2664                                                       (U8*) reginfo->strend);
2665                     while (s < strend) {
2666                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2667                                                          (U8*) reginfo->strend);
2668                         if ((to_complement ^ isSB(before,
2669                                                   after,
2670                                                   (U8*) reginfo->strbeg,
2671                                                   (U8*) s,
2672                                                   (U8*) reginfo->strend,
2673                                                   utf8_target))
2674                             && (reginfo->intuit || regtry(reginfo, &s)))
2675                         {
2676                             goto got_it;
2677                         }
2678                         before = after;
2679                         s += UTF8SKIP(s);
2680                     }
2681                 }
2682                 else {  /* Not utf8. */
2683                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2684                     while (s < strend) {
2685                         SB_enum after = getSB_VAL_CP((U8) *s);
2686                         if ((to_complement ^ isSB(before,
2687                                                   after,
2688                                                   (U8*) reginfo->strbeg,
2689                                                   (U8*) s,
2690                                                   (U8*) reginfo->strend,
2691                                                   utf8_target))
2692                             && (reginfo->intuit || regtry(reginfo, &s)))
2693                         {
2694                             goto got_it;
2695                         }
2696                         before = after;
2697                         s++;
2698                     }
2699                 }
2700
2701                 /* Here are at the final position in the target string.  The SB
2702                  * value is always true here, so matches, depending on other
2703                  * constraints */
2704                 if (reginfo->intuit || regtry(reginfo, &s)) {
2705                     goto got_it;
2706                 }
2707
2708                 break;
2709
2710             case WB_BOUND:
2711                 if (s == reginfo->strbeg) {
2712                     if (reginfo->intuit || regtry(reginfo, &s)) {
2713                         goto got_it;
2714                     }
2715                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2716                     if (UNLIKELY(s >= reginfo->strend)) {
2717                         break;
2718                     }
2719                 }
2720
2721                 if (utf8_target) {
2722                     /* We are at a boundary between char_sub_0 and char_sub_1.
2723                      * We also keep track of the value for char_sub_-1 as we
2724                      * loop through the line.   Context may be needed to make a
2725                      * determination, and if so, this can save having to
2726                      * recalculate it */
2727                     WB_enum previous = WB_UNKNOWN;
2728                     WB_enum before = getWB_VAL_UTF8(
2729                                               reghop3((U8*)s,
2730                                                       -1,
2731                                                       (U8*)(reginfo->strbeg)),
2732                                               (U8*) reginfo->strend);
2733                     while (s < strend) {
2734                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2735                                                         (U8*) reginfo->strend);
2736                         if ((to_complement ^ isWB(previous,
2737                                                   before,
2738                                                   after,
2739                                                   (U8*) reginfo->strbeg,
2740                                                   (U8*) s,
2741                                                   (U8*) reginfo->strend,
2742                                                   utf8_target))
2743                             && (reginfo->intuit || regtry(reginfo, &s)))
2744                         {
2745                             goto got_it;
2746                         }
2747                         previous = before;
2748                         before = after;
2749                         s += UTF8SKIP(s);
2750                     }
2751                 }
2752                 else {  /* Not utf8. */
2753                     WB_enum previous = WB_UNKNOWN;
2754                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2755                     while (s < strend) {
2756                         WB_enum after = getWB_VAL_CP((U8) *s);
2757                         if ((to_complement ^ isWB(previous,
2758                                                   before,
2759                                                   after,
2760                                                   (U8*) reginfo->strbeg,
2761                                                   (U8*) s,
2762                                                   (U8*) reginfo->strend,
2763                                                   utf8_target))
2764                             && (reginfo->intuit || regtry(reginfo, &s)))
2765                         {
2766                             goto got_it;
2767                         }
2768                         previous = before;
2769                         before = after;
2770                         s++;
2771                     }
2772                 }
2773
2774                 if (reginfo->intuit || regtry(reginfo, &s)) {
2775                     goto got_it;
2776                 }
2777         }
2778         break;
2779
2780     case LNBREAK:
2781         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2782                         is_LNBREAK_latin1_safe(s, strend)
2783         );
2784         break;
2785
2786     case ASCII:
2787         REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2788         break;
2789
2790     case NASCII:
2791         if (utf8_target) {
2792             REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2793                                                             utf8_target));
2794         }
2795         else {
2796             REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2797                                                             utf8_target));
2798         }
2799
2800         break;
2801
2802     /* The argument to all the POSIX node types is the class number to pass to
2803      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2804
2805     case NPOSIXL:
2806         to_complement = 1;
2807         /* FALLTHROUGH */
2808
2809     case POSIXL:
2810         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2811         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2812                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2813         break;
2814
2815     case NPOSIXD:
2816         to_complement = 1;
2817         /* FALLTHROUGH */
2818
2819     case POSIXD:
2820         if (utf8_target) {
2821             goto posix_utf8;
2822         }
2823         goto posixa;
2824
2825     case NPOSIXA:
2826         if (utf8_target) {
2827             /* The complement of something that matches only ASCII matches all
2828              * non-ASCII, plus everything in ASCII that isn't in the class. */
2829             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2830                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2831             break;
2832         }
2833
2834         to_complement = 1;
2835         goto posixa;
2836
2837     case POSIXA:
2838         /* Don't need to worry about utf8, as it can match only a single
2839          * byte invariant character.  But we do anyway for performance reasons,
2840          * as otherwise we would have to examine all the continuation
2841          * characters */
2842         if (utf8_target) {
2843             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2844             break;
2845         }
2846
2847       posixa:
2848         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2849                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2850         break;
2851
2852     case NPOSIXU:
2853         to_complement = 1;
2854         /* FALLTHROUGH */
2855
2856     case POSIXU:
2857         if (! utf8_target) {
2858             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2859                                  to_complement ^ cBOOL(_generic_isCC(*s,
2860                                                                     FLAGS(c))));
2861         }
2862         else {
2863
2864           posix_utf8:
2865             classnum = (_char_class_number) FLAGS(c);
2866             switch (classnum) {
2867                 default:
2868                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2869                         to_complement ^ cBOOL(_invlist_contains_cp(
2870                                               PL_XPosix_ptrs[classnum],
2871                                               utf8_to_uvchr_buf((U8 *) s,
2872                                                                 (U8 *) strend,
2873                                                                 NULL))));
2874                     break;
2875                 case _CC_ENUM_SPACE:
2876                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2877                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2878                     break;
2879
2880                 case _CC_ENUM_BLANK:
2881                     REXEC_FBC_CLASS_SCAN(1,
2882                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2883                     break;
2884
2885                 case _CC_ENUM_XDIGIT:
2886                     REXEC_FBC_CLASS_SCAN(1,
2887                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2888                     break;
2889
2890                 case _CC_ENUM_VERTSPACE:
2891                     REXEC_FBC_CLASS_SCAN(1,
2892                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2893                     break;
2894
2895                 case _CC_ENUM_CNTRL:
2896                     REXEC_FBC_CLASS_SCAN(1,
2897                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2898                     break;
2899             }
2900         }
2901         break;
2902
2903     case AHOCORASICKC:
2904     case AHOCORASICK:
2905         {
2906             DECL_TRIE_TYPE(c);
2907             /* what trie are we using right now */
2908             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2909             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2910             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2911
2912             const char *last_start = strend - trie->minlen;
2913 #ifdef DEBUGGING
2914             const char *real_start = s;
2915 #endif
2916             STRLEN maxlen = trie->maxlen;
2917             SV *sv_points;
2918             U8 **points; /* map of where we were in the input string
2919                             when reading a given char. For ASCII this
2920                             is unnecessary overhead as the relationship
2921                             is always 1:1, but for Unicode, especially
2922                             case folded Unicode this is not true. */
2923             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2924             U8 *bitmap=NULL;
2925
2926
2927             GET_RE_DEBUG_FLAGS_DECL;
2928
2929             /* We can't just allocate points here. We need to wrap it in
2930              * an SV so it gets freed properly if there is a croak while
2931              * running the match */
2932             ENTER;
2933             SAVETMPS;
2934             sv_points=newSV(maxlen * sizeof(U8 *));
2935             SvCUR_set(sv_points,
2936                 maxlen * sizeof(U8 *));
2937             SvPOK_on(sv_points);
2938             sv_2mortal(sv_points);
2939             points=(U8**)SvPV_nolen(sv_points );
2940             if ( trie_type != trie_utf8_fold
2941                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2942             {
2943                 if (trie->bitmap)
2944                     bitmap=(U8*)trie->bitmap;
2945                 else
2946                     bitmap=(U8*)ANYOF_BITMAP(c);
2947             }
2948             /* this is the Aho-Corasick algorithm modified a touch
2949                to include special handling for long "unknown char" sequences.
2950                The basic idea being that we use AC as long as we are dealing
2951                with a possible matching char, when we encounter an unknown char
2952                (and we have not encountered an accepting state) we scan forward
2953                until we find a legal starting char.
2954                AC matching is basically that of trie matching, except that when
2955                we encounter a failing transition, we fall back to the current
2956                states "fail state", and try the current char again, a process
2957                we repeat until we reach the root state, state 1, or a legal
2958                transition. If we fail on the root state then we can either
2959                terminate if we have reached an accepting state previously, or
2960                restart the entire process from the beginning if we have not.
2961
2962              */
2963             while (s <= last_start) {
2964                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2965                 U8 *uc = (U8*)s;
2966                 U16 charid = 0;
2967                 U32 base = 1;
2968                 U32 state = 1;
2969                 UV uvc = 0;
2970                 STRLEN len = 0;
2971                 STRLEN foldlen = 0;
2972                 U8 *uscan = (U8*)NULL;
2973                 U8 *leftmost = NULL;
2974 #ifdef DEBUGGING
2975                 U32 accepted_word= 0;
2976 #endif
2977                 U32 pointpos = 0;
2978
2979                 while ( state && uc <= (U8*)strend ) {
2980                     int failed=0;
2981                     U32 word = aho->states[ state ].wordnum;
2982
2983                     if( state==1 ) {
2984                         if ( bitmap ) {
2985                             DEBUG_TRIE_EXECUTE_r(
2986                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2987                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2988                                         (char *)uc, utf8_target, 0 );
2989                                     Perl_re_printf( aTHX_
2990                                         " Scanning for legal start char...\n");
2991                                 }
2992                             );
2993                             if (utf8_target) {
2994                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2995                                     uc += UTF8SKIP(uc);
2996                                 }
2997                             } else {
2998                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2999                                     uc++;
3000                                 }
3001                             }
3002                             s= (char *)uc;
3003                         }
3004                         if (uc >(U8*)last_start) break;
3005                     }
3006
3007                     if ( word ) {
3008                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
3009                         if (!leftmost || lpos < leftmost) {
3010                             DEBUG_r(accepted_word=word);
3011                             leftmost= lpos;
3012                         }
3013                         if (base==0) break;
3014
3015                     }
3016                     points[pointpos++ % maxlen]= uc;
3017                     if (foldlen || uc < (U8*)strend) {
3018                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3019                                              (U8 *) strend, uscan, len, uvc,
3020                                              charid, foldlen, foldbuf,
3021                                              uniflags);
3022                         DEBUG_TRIE_EXECUTE_r({
3023                             dump_exec_pos( (char *)uc, c, strend,
3024                                         real_start, s, utf8_target, 0);
3025                             Perl_re_printf( aTHX_
3026                                 " Charid:%3u CP:%4" UVxf " ",
3027                                  charid, uvc);
3028                         });
3029                     }
3030                     else {
3031                         len = 0;
3032                         charid = 0;
3033                     }
3034
3035
3036                     do {
3037 #ifdef DEBUGGING
3038                         word = aho->states[ state ].wordnum;
3039 #endif
3040                         base = aho->states[ state ].trans.base;
3041
3042                         DEBUG_TRIE_EXECUTE_r({
3043                             if (failed)
3044                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3045                                     s,   utf8_target, 0 );
3046                             Perl_re_printf( aTHX_
3047                                 "%sState: %4" UVxf ", word=%" UVxf,
3048                                 failed ? " Fail transition to " : "",
3049                                 (UV)state, (UV)word);
3050                         });
3051                         if ( base ) {
3052                             U32 tmp;
3053                             I32 offset;
3054                             if (charid &&
3055                                  ( ((offset = base + charid
3056                                     - 1 - trie->uniquecharcount)) >= 0)
3057                                  && ((U32)offset < trie->lasttrans)
3058                                  && trie->trans[offset].check == state
3059                                  && (tmp=trie->trans[offset].next))
3060                             {
3061                                 DEBUG_TRIE_EXECUTE_r(
3062                                     Perl_re_printf( aTHX_ " - legal\n"));
3063                                 state = tmp;
3064                                 break;
3065                             }
3066                             else {
3067                                 DEBUG_TRIE_EXECUTE_r(
3068                                     Perl_re_printf( aTHX_ " - fail\n"));
3069                                 failed = 1;
3070                                 state = aho->fail[state];
3071                             }
3072                         }
3073                         else {
3074                             /* we must be accepting here */
3075                             DEBUG_TRIE_EXECUTE_r(
3076                                     Perl_re_printf( aTHX_ " - accepting\n"));
3077                             failed = 1;
3078                             break;
3079                         }
3080                     } while(state);
3081                     uc += len;
3082                     if (failed) {
3083                         if (leftmost)
3084                             break;
3085                         if (!state) state = 1;
3086                     }
3087                 }
3088                 if ( aho->states[ state ].wordnum ) {
3089                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3090                     if (!leftmost || lpos < leftmost) {
3091                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3092                         leftmost = lpos;
3093                     }
3094                 }
3095                 if (leftmost) {
3096                     s = (char*)leftmost;
3097                     DEBUG_TRIE_EXECUTE_r({
3098                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3099                             (UV)accepted_word, (IV)(s - real_start)
3100                         );
3101                     });
3102                     if (reginfo->intuit || regtry(reginfo, &s)) {
3103                         FREETMPS;
3104                         LEAVE;
3105                         goto got_it;
3106                     }
3107                     s = HOPc(s,1);
3108                     DEBUG_TRIE_EXECUTE_r({
3109                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3110                     });
3111                 } else {
3112                     DEBUG_TRIE_EXECUTE_r(
3113                         Perl_re_printf( aTHX_ "No match.\n"));
3114                     break;
3115                 }
3116             }
3117             FREETMPS;
3118             LEAVE;
3119         }
3120         break;
3121     default:
3122         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3123     }
3124     return 0;
3125   got_it:
3126     return s;
3127 }
3128
3129 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3130  * flags have same meanings as with regexec_flags() */
3131
3132 static void
3133 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3134                             char *strbeg,
3135                             char *strend,
3136                             SV *sv,
3137                             U32 flags,
3138                             bool utf8_target)
3139 {
3140     struct regexp *const prog = ReANY(rx);
3141
3142     if (flags & REXEC_COPY_STR) {
3143 #ifdef PERL_ANY_COW
3144         if (SvCANCOW(sv)) {
3145             DEBUG_C(Perl_re_printf( aTHX_
3146                               "Copy on write: regexp capture, type %d\n",
3147                                     (int) SvTYPE(sv)));
3148             /* Create a new COW SV to share the match string and store
3149              * in saved_copy, unless the current COW SV in saved_copy
3150              * is valid and suitable for our purpose */
3151             if ((   prog->saved_copy
3152                  && SvIsCOW(prog->saved_copy)
3153                  && SvPOKp(prog->saved_copy)
3154                  && SvIsCOW(sv)
3155                  && SvPOKp(sv)
3156                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3157             {
3158                 /* just reuse saved_copy SV */
3159                 if (RXp_MATCH_COPIED(prog)) {
3160                     Safefree(prog->subbeg);
3161                     RXp_MATCH_COPIED_off(prog);
3162                 }
3163             }
3164             else {
3165                 /* create new COW SV to share string */
3166                 RXp_MATCH_COPY_FREE(prog);
3167                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3168             }
3169             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3170             assert (SvPOKp(prog->saved_copy));
3171             prog->sublen  = strend - strbeg;
3172             prog->suboffset = 0;
3173             prog->subcoffset = 0;
3174         } else
3175 #endif
3176         {
3177             SSize_t min = 0;
3178             SSize_t max = strend - strbeg;
3179             SSize_t sublen;
3180
3181             if (    (flags & REXEC_COPY_SKIP_POST)
3182                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3183                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3184             ) { /* don't copy $' part of string */
3185                 U32 n = 0;
3186                 max = -1;
3187                 /* calculate the right-most part of the string covered
3188                  * by a capture. Due to lookahead, this may be to
3189                  * the right of $&, so we have to scan all captures */
3190                 while (n <= prog->lastparen) {
3191                     if (prog->offs[n].end > max)
3192                         max = prog->offs[n].end;
3193                     n++;
3194                 }
3195                 if (max == -1)
3196                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3197                             ? prog->offs[0].start
3198                             : 0;
3199                 assert(max >= 0 && max <= strend - strbeg);
3200             }
3201
3202             if (    (flags & REXEC_COPY_SKIP_PRE)
3203                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3204                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3205             ) { /* don't copy $` part of string */
3206                 U32 n = 0;
3207                 min = max;
3208                 /* calculate the left-most part of the string covered
3209                  * by a capture. Due to lookbehind, this may be to
3210                  * the left of $&, so we have to scan all captures */
3211                 while (min && n <= prog->lastparen) {
3212                     if (   prog->offs[n].start != -1
3213                         && prog->offs[n].start < min)
3214                     {
3215                         min = prog->offs[n].start;
3216                     }
3217                     n++;
3218                 }
3219                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3220                     && min >  prog->offs[0].end
3221                 )
3222                     min = prog->offs[0].end;
3223
3224             }
3225
3226             assert(min >= 0 && min <= max && min <= strend - strbeg);
3227             sublen = max - min;
3228
3229             if (RXp_MATCH_COPIED(prog)) {
3230                 if (sublen > prog->sublen)
3231                     prog->subbeg =
3232                             (char*)saferealloc(prog->subbeg, sublen+1);
3233             }
3234             else
3235                 prog->subbeg = (char*)safemalloc(sublen+1);
3236             Copy(strbeg + min, prog->subbeg, sublen, char);
3237             prog->subbeg[sublen] = '\0';
3238             prog->suboffset = min;
3239             prog->sublen = sublen;
3240             RXp_MATCH_COPIED_on(prog);
3241         }
3242         prog->subcoffset = prog->suboffset;
3243         if (prog->suboffset && utf8_target) {
3244             /* Convert byte offset to chars.
3245              * XXX ideally should only compute this if @-/@+
3246              * has been seen, a la PL_sawampersand ??? */
3247
3248             /* If there's a direct correspondence between the
3249              * string which we're matching and the original SV,
3250              * then we can use the utf8 len cache associated with
3251              * the SV. In particular, it means that under //g,
3252              * sv_pos_b2u() will use the previously cached
3253              * position to speed up working out the new length of
3254              * subcoffset, rather than counting from the start of
3255              * the string each time. This stops
3256              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3257              * from going quadratic */
3258             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3259                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3260                                                 SV_GMAGIC|SV_CONST_RETURN);
3261             else
3262                 prog->subcoffset = utf8_length((U8*)strbeg,
3263                                     (U8*)(strbeg+prog->suboffset));
3264         }
3265     }
3266     else {
3267         RXp_MATCH_COPY_FREE(prog);
3268         prog->subbeg = strbeg;
3269         prog->suboffset = 0;
3270         prog->subcoffset = 0;
3271         prog->sublen = strend - strbeg;
3272     }
3273 }
3274
3275
3276
3277
3278 /*
3279  - regexec_flags - match a regexp against a string
3280  */
3281 I32
3282 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3283               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3284 /* stringarg: the point in the string at which to begin matching */
3285 /* strend:    pointer to null at end of string */
3286 /* strbeg:    real beginning of string */
3287 /* minend:    end of match must be >= minend bytes after stringarg. */
3288 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3289  *            itself is accessed via the pointers above */
3290 /* data:      May be used for some additional optimizations.
3291               Currently unused. */
3292 /* flags:     For optimizations. See REXEC_* in regexp.h */
3293
3294 {
3295     struct regexp *const prog = ReANY(rx);
3296     char *s;
3297     regnode *c;
3298     char *startpos;
3299     SSize_t minlen;             /* must match at least this many chars */
3300     SSize_t dontbother = 0;     /* how many characters not to try at end */
3301     const bool utf8_target = cBOOL(DO_UTF8(sv));
3302     I32 multiline;
3303     RXi_GET_DECL(prog,progi);
3304     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3305     regmatch_info *const reginfo = &reginfo_buf;
3306     regexp_paren_pair *swap = NULL;
3307     I32 oldsave;
3308     GET_RE_DEBUG_FLAGS_DECL;
3309
3310     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3311     PERL_UNUSED_ARG(data);
3312
3313     /* Be paranoid... */
3314     if (prog == NULL) {
3315         Perl_croak(aTHX_ "NULL regexp parameter");
3316     }
3317
3318     DEBUG_EXECUTE_r(
3319         debug_start_match(rx, utf8_target, stringarg, strend,
3320         "Matching");
3321     );
3322
3323     startpos = stringarg;
3324
3325     /* set these early as they may be used by the HOP macros below */
3326     reginfo->strbeg = strbeg;
3327     reginfo->strend = strend;
3328     reginfo->is_utf8_target = cBOOL(utf8_target);
3329
3330     if (prog->intflags & PREGf_GPOS_SEEN) {
3331         MAGIC *mg;
3332
3333         /* set reginfo->ganch, the position where \G can match */
3334
3335         reginfo->ganch =
3336             (flags & REXEC_IGNOREPOS)
3337             ? stringarg /* use start pos rather than pos() */
3338             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3339               /* Defined pos(): */
3340             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3341             : strbeg; /* pos() not defined; use start of string */
3342
3343         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3344             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3345
3346         /* in the presence of \G, we may need to start looking earlier in
3347          * the string than the suggested start point of stringarg:
3348          * if prog->gofs is set, then that's a known, fixed minimum
3349          * offset, such as
3350          * /..\G/:   gofs = 2
3351          * /ab|c\G/: gofs = 1
3352          * or if the minimum offset isn't known, then we have to go back
3353          * to the start of the string, e.g. /w+\G/
3354          */
3355
3356         if (prog->intflags & PREGf_ANCH_GPOS) {
3357             if (prog->gofs) {
3358                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3359                 if (!startpos ||
3360                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3361                 {
3362                     DEBUG_r(Perl_re_printf( aTHX_
3363                             "fail: ganch-gofs before earliest possible start\n"));
3364                     return 0;
3365                 }
3366             }
3367             else
3368                 startpos = reginfo->ganch;
3369         }
3370         else if (prog->gofs) {
3371             startpos = HOPBACKc(startpos, prog->gofs);
3372             if (!startpos)
3373                 startpos = strbeg;
3374         }
3375         else if (prog->intflags & PREGf_GPOS_FLOAT)
3376             startpos = strbeg;
3377     }
3378
3379     minlen = prog->minlen;
3380     if ((startpos + minlen) > strend || startpos < strbeg) {
3381         DEBUG_r(Perl_re_printf( aTHX_
3382                     "Regex match can't succeed, so not even tried\n"));
3383         return 0;
3384     }
3385
3386     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3387      * which will call destuctors to reset PL_regmatch_state, free higher
3388      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3389      * regmatch_info_aux_eval */
3390
3391     oldsave = PL_savestack_ix;
3392
3393     s = startpos;
3394
3395     if ((prog->extflags & RXf_USE_INTUIT)
3396         && !(flags & REXEC_CHECKED))
3397     {
3398         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3399                                     flags, NULL);
3400         if (!s)
3401             return 0;
3402
3403         if (prog->extflags & RXf_CHECK_ALL) {
3404             /* we can match based purely on the result of INTUIT.
3405              * Set up captures etc just for $& and $-[0]
3406              * (an intuit-only match wont have $1,$2,..) */
3407             assert(!prog->nparens);
3408
3409             /* s/// doesn't like it if $& is earlier than where we asked it to
3410              * start searching (which can happen on something like /.\G/) */
3411             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3412                     && (s < stringarg))
3413             {
3414                 /* this should only be possible under \G */
3415                 assert(prog->intflags & PREGf_GPOS_SEEN);
3416                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3417                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3418                 goto phooey;
3419             }
3420
3421             /* match via INTUIT shouldn't have any captures.
3422              * Let @-, @+, $^N know */
3423             prog->lastparen = prog->lastcloseparen = 0;
3424             RXp_MATCH_UTF8_set(prog, utf8_target);
3425             prog->offs[0].start = s - strbeg;
3426             prog->offs[0].end = utf8_target
3427                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3428                 : s - strbeg + prog->minlenret;
3429             if ( !(flags & REXEC_NOT_FIRST) )
3430                 S_reg_set_capture_string(aTHX_ rx,
3431                                         strbeg, strend,
3432                                         sv, flags, utf8_target);
3433
3434             return 1;
3435         }
3436     }
3437
3438     multiline = prog->extflags & RXf_PMf_MULTILINE;
3439
3440     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3441         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3442                               "String too short [regexec_flags]...\n"));
3443         goto phooey;
3444     }
3445
3446     /* Check validity of program. */
3447     if (UCHARAT(progi->program) != REG_MAGIC) {
3448         Perl_croak(aTHX_ "corrupted regexp program");
3449     }
3450
3451     RXp_MATCH_TAINTED_off(prog);
3452     RXp_MATCH_UTF8_set(prog, utf8_target);
3453
3454     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3455     reginfo->intuit = 0;
3456     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3457     reginfo->warned = FALSE;
3458     reginfo->sv = sv;
3459     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3460     /* see how far we have to get to not match where we matched before */
3461     reginfo->till = stringarg + minend;
3462
3463     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3464         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3465            S_cleanup_regmatch_info_aux has executed (registered by
3466            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3467            magic belonging to this SV.
3468            Not newSVsv, either, as it does not COW.
3469         */
3470         reginfo->sv = newSV(0);
3471         SvSetSV_nosteal(reginfo->sv, sv);
3472         SAVEFREESV(reginfo->sv);
3473     }
3474
3475     /* reserve next 2 or 3 slots in PL_regmatch_state:
3476      * slot N+0: may currently be in use: skip it
3477      * slot N+1: use for regmatch_info_aux struct
3478      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3479      * slot N+3: ready for use by regmatch()
3480      */
3481
3482     {
3483         regmatch_state *old_regmatch_state;
3484         regmatch_slab  *old_regmatch_slab;
3485         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3486
3487         /* on first ever match, allocate first slab */
3488         if (!PL_regmatch_slab) {
3489             Newx(PL_regmatch_slab, 1, regmatch_slab);
3490             PL_regmatch_slab->prev = NULL;
3491             PL_regmatch_slab->next = NULL;
3492             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3493         }
3494
3495         old_regmatch_state = PL_regmatch_state;
3496         old_regmatch_slab  = PL_regmatch_slab;
3497
3498         for (i=0; i <= max; i++) {
3499             if (i == 1)
3500                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3501             else if (i ==2)
3502                 reginfo->info_aux_eval =
3503                 reginfo->info_aux->info_aux_eval =
3504                             &(PL_regmatch_state->u.info_aux_eval);
3505
3506             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3507                 PL_regmatch_state = S_push_slab(aTHX);
3508         }
3509
3510         /* note initial PL_regmatch_state position; at end of match we'll
3511          * pop back to there and free any higher slabs */
3512
3513         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3514         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3515         reginfo->info_aux->poscache = NULL;
3516
3517         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3518
3519         if ((prog->extflags & RXf_EVAL_SEEN))
3520             S_setup_eval_state(aTHX_ reginfo);
3521         else
3522             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3523     }
3524
3525     /* If there is a "must appear" string, look for it. */
3526
3527     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3528         /* We have to be careful. If the previous successful match
3529            was from this regex we don't want a subsequent partially
3530            successful match to clobber the old results.
3531            So when we detect this possibility we add a swap buffer
3532            to the re, and switch the buffer each match. If we fail,
3533            we switch it back; otherwise we leave it swapped.
3534         */
3535         swap = prog->offs;
3536         /* do we need a save destructor here for eval dies? */
3537         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3538         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3539             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3540             0,
3541             PTR2UV(prog),
3542             PTR2UV(swap),
3543             PTR2UV(prog->offs)
3544         ));
3545     }
3546
3547     if (prog->recurse_locinput)
3548         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3549
3550     /* Simplest case: anchored match need be tried only once, or with
3551      * MBOL, only at the beginning of each line.
3552      *
3553      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3554      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3555      * match at the start of the string then it won't match anywhere else
3556      * either; while with /.*.../, if it doesn't match at the beginning,
3557      * the earliest it could match is at the start of the next line */
3558
3559     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3560         char *end;
3561
3562         if (regtry(reginfo, &s))
3563             goto got_it;
3564
3565         if (!(prog->intflags & PREGf_ANCH_MBOL))
3566             goto phooey;
3567
3568         /* didn't match at start, try at other newline positions */
3569
3570         if (minlen)
3571             dontbother = minlen - 1;
3572         end = HOP3c(strend, -dontbother, strbeg) - 1;
3573
3574         /* skip to next newline */
3575
3576         while (s <= end) { /* note it could be possible to match at the end of the string */
3577             /* NB: newlines are the same in unicode as they are in latin */
3578             if (*s++ != '\n')
3579                 continue;
3580             if (prog->check_substr || prog->check_utf8) {
3581             /* note that with PREGf_IMPLICIT, intuit can only fail
3582              * or return the start position, so it's of limited utility.
3583              * Nevertheless, I made the decision that the potential for
3584              * quick fail was still worth it - DAPM */
3585                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3586                 if (!s)
3587                     goto phooey;
3588             }
3589             if (regtry(reginfo, &s))
3590                 goto got_it;
3591         }
3592         goto phooey;
3593     } /* end anchored search */
3594
3595     if (prog->intflags & PREGf_ANCH_GPOS)
3596     {
3597         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3598         assert(prog->intflags & PREGf_GPOS_SEEN);
3599         /* For anchored \G, the only position it can match from is
3600          * (ganch-gofs); we already set startpos to this above; if intuit
3601          * moved us on from there, we can't possibly succeed */
3602         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3603         if (s == startpos && regtry(reginfo, &s))
3604             goto got_it;
3605         goto phooey;
3606     }
3607
3608     /* Messy cases:  unanchored match. */
3609     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3610         /* we have /x+whatever/ */
3611         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3612         char ch;
3613 #ifdef DEBUGGING
3614         int did_match = 0;
3615 #endif
3616         if (utf8_target) {
3617             if (! prog->anchored_utf8) {
3618                 to_utf8_substr(prog);
3619             }
3620             ch = SvPVX_const(prog->anchored_utf8)[0];
3621             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3622                 if (*s == ch) {
3623                     DEBUG_EXECUTE_r( did_match = 1 );
3624                     if (regtry(reginfo, &s)) goto got_it;
3625                     s += UTF8SKIP(s);
3626                     while (s < strend && *s == ch)
3627                         s += UTF8SKIP(s);
3628                 }
3629             );
3630
3631         }
3632         else {
3633             if (! prog->anchored_substr) {
3634                 if (! to_byte_substr(prog)) {
3635                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3636                 }
3637             }
3638             ch = SvPVX_const(prog->anchored_substr)[0];
3639             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3640                 if (*s == ch) {
3641                     DEBUG_EXECUTE_r( did_match = 1 );
3642                     if (regtry(reginfo, &s)) goto got_it;
3643                     s++;
3644                     while (s < strend && *s == ch)
3645                         s++;
3646                 }
3647             );
3648         }
3649         DEBUG_EXECUTE_r(if (!did_match)
3650                 Perl_re_printf( aTHX_
3651                                   "Did not find anchored character...\n")
3652                );
3653     }
3654     else if (prog->anchored_substr != NULL
3655               || prog->anchored_utf8 != NULL
3656               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3657                   && prog->float_max_offset < strend - s)) {
3658         SV *must;
3659         SSize_t back_max;
3660         SSize_t back_min;
3661         char *last;
3662         char *last1;            /* Last position checked before */
3663 #ifdef DEBUGGING
3664         int did_match = 0;
3665 #endif
3666         if (prog->anchored_substr || prog->anchored_utf8) {
3667             if (utf8_target) {
3668                 if (! prog->anchored_utf8) {
3669                     to_utf8_substr(prog);
3670                 }
3671                 must = prog->anchored_utf8;
3672             }
3673             else {
3674                 if (! prog->anchored_substr) {
3675                     if (! to_byte_substr(prog)) {
3676                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3677                     }
3678                 }
3679                 must = prog->anchored_substr;
3680             }
3681             back_max = back_min = prog->anchored_offset;
3682         } else {
3683             if (utf8_target) {
3684                 if (! prog->float_utf8) {
3685                     to_utf8_substr(prog);
3686                 }
3687                 must = prog->float_utf8;
3688             }
3689             else {
3690                 if (! prog->float_substr) {
3691                     if (! to_byte_substr(prog)) {
3692                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3693                     }
3694                 }
3695                 must = prog->float_substr;
3696             }
3697             back_max = prog->float_max_offset;
3698             back_min = prog->float_min_offset;
3699         }
3700
3701         if (back_min<0) {
3702             last = strend;
3703         } else {
3704             last = HOP3c(strend,        /* Cannot start after this */
3705                   -(SSize_t)(CHR_SVLEN(must)
3706                          - (SvTAIL(must) != 0) + back_min), strbeg);
3707         }
3708         if (s > reginfo->strbeg)
3709             last1 = HOPc(s, -1);
3710         else
3711             last1 = s - 1;      /* bogus */
3712
3713         /* XXXX check_substr already used to find "s", can optimize if
3714            check_substr==must. */
3715         dontbother = 0;
3716         strend = HOPc(strend, -dontbother);
3717         while ( (s <= last) &&
3718                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3719                                   (unsigned char*)strend, must,
3720                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3721             DEBUG_EXECUTE_r( did_match = 1 );
3722             if (HOPc(s, -back_max) > last1) {
3723                 last1 = HOPc(s, -back_min);
3724                 s = HOPc(s, -back_max);
3725             }
3726             else {
3727                 char * const t = (last1 >= reginfo->strbeg)
3728                                     ? HOPc(last1, 1) : last1 + 1;
3729
3730                 last1 = HOPc(s, -back_min);
3731                 s = t;
3732             }
3733             if (utf8_target) {
3734                 while (s <= last1) {
3735                     if (regtry(reginfo, &s))
3736                         goto got_it;
3737                     if (s >= last1) {
3738                         s++; /* to break out of outer loop */
3739                         break;
3740                     }
3741                     s += UTF8SKIP(s);
3742                 }
3743             }
3744             else {
3745                 while (s <= last1) {
3746                     if (regtry(reginfo, &s))
3747                         goto got_it;
3748                     s++;
3749                 }
3750             }
3751         }
3752         DEBUG_EXECUTE_r(if (!did_match) {
3753             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3754                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3755             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3756                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3757                                ? "anchored" : "floating"),
3758                 quoted, RE_SV_TAIL(must));
3759         });
3760         goto phooey;
3761     }
3762     else if ( (c = progi->regstclass) ) {
3763         if (minlen) {
3764             const OPCODE op = OP(progi->regstclass);
3765             /* don't bother with what can't match */
3766             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3767                 strend = HOPc(strend, -(minlen - 1));
3768         }
3769         DEBUG_EXECUTE_r({
3770             SV * const prop = sv_newmortal();
3771             regprop(prog, prop, c, reginfo, NULL);
3772             {
3773                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3774                     s,strend-s,PL_dump_re_max_len);
3775                 Perl_re_printf( aTHX_
3776                     "Matching stclass %.*s against %s (%d bytes)\n",
3777                     (int)SvCUR(prop), SvPVX_const(prop),
3778                      quoted, (int)(strend - s));
3779             }
3780         });
3781         if (find_byclass(prog, c, s, strend, reginfo))
3782             goto got_it;
3783         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3784     }
3785     else {
3786         dontbother = 0;
3787         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3788             /* Trim the end. */
3789             char *last= NULL;
3790             SV* float_real;
3791             STRLEN len;
3792             const char *little;
3793
3794             if (utf8_target) {
3795                 if (! prog->float_utf8) {
3796                     to_utf8_substr(prog);
3797                 }
3798                 float_real = prog->float_utf8;
3799             }
3800             else {
3801                 if (! prog->float_substr) {
3802                     if (! to_byte_substr(prog)) {
3803                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3804                     }
3805                 }
3806                 float_real = prog->float_substr;
3807             }
3808
3809             little = SvPV_const(float_real, len);
3810             if (SvTAIL(float_real)) {
3811                     /* This means that float_real contains an artificial \n on
3812                      * the end due to the presence of something like this:
3813                      * /foo$/ where we can match both "foo" and "foo\n" at the
3814                      * end of the string.  So we have to compare the end of the
3815                      * string first against the float_real without the \n and
3816                      * then against the full float_real with the string.  We
3817                      * have to watch out for cases where the string might be
3818                      * smaller than the float_real or the float_real without
3819                      * the \n. */
3820                     char *checkpos= strend - len;
3821                     DEBUG_OPTIMISE_r(
3822                         Perl_re_printf( aTHX_
3823                             "%sChecking for float_real.%s\n",
3824                             PL_colors[4], PL_colors[5]));
3825                     if (checkpos + 1 < strbeg) {
3826                         /* can't match, even if we remove the trailing \n
3827                          * string is too short to match */
3828                         DEBUG_EXECUTE_r(
3829                             Perl_re_printf( aTHX_
3830                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3831                                 PL_colors[4], PL_colors[5]));
3832                         goto phooey;
3833                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3834                         /* can match, the end of the string matches without the
3835                          * "\n" */
3836                         last = checkpos + 1;
3837                     } else if (checkpos < strbeg) {
3838                         /* cant match, string is too short when the "\n" is
3839                          * included */
3840                         DEBUG_EXECUTE_r(
3841                             Perl_re_printf( aTHX_
3842                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3843                                 PL_colors[4], PL_colors[5]));
3844                         goto phooey;
3845                     } else if (!multiline) {
3846                         /* non multiline match, so compare with the "\n" at the
3847                          * end of the string */
3848                         if (memEQ(checkpos, little, len)) {
3849                             last= checkpos;
3850                         } else {
3851                             DEBUG_EXECUTE_r(
3852                                 Perl_re_printf( aTHX_
3853                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3854                                     PL_colors[4], PL_colors[5]));
3855                             goto phooey;
3856                         }
3857                     } else {
3858                         /* multiline match, so we have to search for a place
3859                          * where the full string is located */
3860                         goto find_last;
3861                     }
3862             } else {
3863                   find_last:
3864                     if (len)
3865                         last = rninstr(s, strend, little, little + len);
3866                     else
3867                         last = strend;  /* matching "$" */
3868             }
3869             if (!last) {
3870                 /* at one point this block contained a comment which was
3871                  * probably incorrect, which said that this was a "should not
3872                  * happen" case.  Even if it was true when it was written I am
3873                  * pretty sure it is not anymore, so I have removed the comment
3874                  * and replaced it with this one. Yves */
3875                 DEBUG_EXECUTE_r(
3876                     Perl_re_printf( aTHX_
3877                         "%sString does not contain required substring, cannot match.%s\n",
3878                         PL_colors[4], PL_colors[5]
3879                     ));
3880                 goto phooey;
3881             }
3882             dontbother = strend - last + prog->float_min_offset;
3883         }
3884         if (minlen && (dontbother < minlen))
3885             dontbother = minlen - 1;
3886         strend -= dontbother;              /* this one's always in bytes! */
3887         /* We don't know much -- general case. */
3888         if (utf8_target) {
3889             for (;;) {
3890                 if (regtry(reginfo, &s))
3891                     goto got_it;
3892                 if (s >= strend)
3893                     break;
3894                 s += UTF8SKIP(s);
3895             };
3896         }
3897         else {
3898             do {
3899                 if (regtry(reginfo, &s))
3900                     goto got_it;
3901             } while (s++ < strend);
3902         }
3903     }
3904
3905     /* Failure. */
3906     goto phooey;
3907
3908   got_it:
3909     /* s/// doesn't like it if $& is earlier than where we asked it to
3910      * start searching (which can happen on something like /.\G/) */
3911     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3912             && (prog->offs[0].start < stringarg - strbeg))
3913     {
3914         /* this should only be possible under \G */
3915         assert(prog->intflags & PREGf_GPOS_SEEN);
3916         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3917             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3918         goto phooey;
3919     }
3920
3921     DEBUG_BUFFERS_r(
3922         if (swap)
3923             Perl_re_exec_indentf( aTHX_
3924                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3925                 0,
3926                 PTR2UV(prog),
3927                 PTR2UV(swap)
3928             );
3929     );
3930     Safefree(swap);
3931
3932     /* clean up; this will trigger destructors that will free all slabs
3933      * above the current one, and cleanup the regmatch_info_aux
3934      * and regmatch_info_aux_eval sructs */
3935
3936     LEAVE_SCOPE(oldsave);
3937
3938     if (RXp_PAREN_NAMES(prog))
3939         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3940
3941     /* make sure $`, $&, $', and $digit will work later */
3942     if ( !(flags & REXEC_NOT_FIRST) )
3943         S_reg_set_capture_string(aTHX_ rx,
3944                                     strbeg, reginfo->strend,
3945                                     sv, flags, utf8_target);
3946
3947     return 1;
3948
3949   phooey:
3950     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3951                           PL_colors[4], PL_colors[5]));
3952
3953     /* clean up; this will trigger destructors that will free all slabs
3954      * above the current one, and cleanup the regmatch_info_aux
3955      * and regmatch_info_aux_eval sructs */
3956
3957     LEAVE_SCOPE(oldsave);
3958
3959     if (swap) {
3960         /* we failed :-( roll it back */
3961         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3962             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3963             0,
3964             PTR2UV(prog),
3965             PTR2UV(prog->offs),
3966             PTR2UV(swap)
3967         ));
3968         Safefree(prog->offs);
3969         prog->offs = swap;
3970     }
3971     return 0;
3972 }
3973
3974
3975 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3976  * Do inc before dec, in case old and new rex are the same */
3977 #define SET_reg_curpm(Re2)                          \
3978     if (reginfo->info_aux_eval) {                   \
3979         (void)ReREFCNT_inc(Re2);                    \
3980         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3981         PM_SETRE((PL_reg_curpm), (Re2));            \
3982     }
3983
3984
3985 /*
3986  - regtry - try match at specific point
3987  */
3988 STATIC bool                     /* 0 failure, 1 success */
3989 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3990 {
3991     CHECKPOINT lastcp;
3992     REGEXP *const rx = reginfo->prog;
3993     regexp *const prog = ReANY(rx);
3994     SSize_t result;
3995 #ifdef DEBUGGING
3996     U32 depth = 0; /* used by REGCP_SET */
3997 #endif
3998     RXi_GET_DECL(prog,progi);
3999     GET_RE_DEBUG_FLAGS_DECL;
4000
4001     PERL_ARGS_ASSERT_REGTRY;
4002
4003     reginfo->cutpoint=NULL;
4004
4005     prog->offs[0].start = *startposp - reginfo->strbeg;
4006     prog->lastparen = 0;
4007     prog->lastcloseparen = 0;
4008
4009     /* XXXX What this code is doing here?!!!  There should be no need
4010        to do this again and again, prog->lastparen should take care of
4011        this!  --ilya*/
4012
4013     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4014      * Actually, the code in regcppop() (which Ilya may be meaning by
4015      * prog->lastparen), is not needed at all by the test suite
4016      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4017      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4018      * Meanwhile, this code *is* needed for the
4019      * above-mentioned test suite tests to succeed.  The common theme
4020      * on those tests seems to be returning null fields from matches.
4021      * --jhi updated by dapm */
4022
4023     /* After encountering a variant of the issue mentioned above I think
4024      * the point Ilya was making is that if we properly unwind whenever
4025      * we set lastparen to a smaller value then we should not need to do
4026      * this every time, only when needed. So if we have tests that fail if
4027      * we remove this, then it suggests somewhere else we are improperly
4028      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4029      * places it is called, and related regcp() routines. - Yves */
4030 #if 1
4031     if (prog->nparens) {
4032         regexp_paren_pair *pp = prog->offs;
4033         I32 i;
4034         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4035             ++pp;
4036             pp->start = -1;
4037             pp->end = -1;
4038         }
4039     }
4040 #endif
4041     REGCP_SET(lastcp);
4042     result = regmatch(reginfo, *startposp, progi->program + 1);
4043     if (result != -1) {
4044         prog->offs[0].end = result;
4045         return 1;
4046     }
4047     if (reginfo->cutpoint)
4048         *startposp= reginfo->cutpoint;
4049     REGCP_UNWIND(lastcp);
4050     return 0;
4051 }
4052
4053
4054 #define sayYES goto yes
4055 #define sayNO goto no
4056 #define sayNO_SILENT goto no_silent
4057
4058 /* we dont use STMT_START/END here because it leads to
4059    "unreachable code" warnings, which are bogus, but distracting. */
4060 #define CACHEsayNO \
4061     if (ST.cache_mask) \
4062        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4063     sayNO
4064
4065 /* this is used to determine how far from the left messages like
4066    'failed...' are printed in regexec.c. It should be set such that
4067    messages are inline with the regop output that created them.
4068 */
4069 #define REPORT_CODE_OFF 29
4070 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4071 #ifdef DEBUGGING
4072 int
4073 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4074 {
4075     va_list ap;
4076     int result;
4077     PerlIO *f= Perl_debug_log;
4078     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4079     va_start(ap, depth);
4080     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4081     result = PerlIO_vprintf(f, fmt, ap);
4082     va_end(ap);
4083     return result;
4084 }
4085 #endif /* DEBUGGING */
4086
4087
4088 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4089 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4090 #define CHRTEST_NOT_A_CP_1 -999
4091 #define CHRTEST_NOT_A_CP_2 -998
4092
4093 /* grab a new slab and return the first slot in it */
4094
4095 STATIC regmatch_state *
4096 S_push_slab(pTHX)
4097 {
4098     regmatch_slab *s = PL_regmatch_slab->next;
4099     if (!s) {
4100         Newx(s, 1, regmatch_slab);
4101         s->prev = PL_regmatch_slab;
4102         s->next = NULL;
4103         PL_regmatch_slab->next = s;
4104     }
4105     PL_regmatch_slab = s;
4106     return SLAB_FIRST(s);
4107 }
4108
4109
4110 /* push a new state then goto it */
4111
4112 #define PUSH_STATE_GOTO(state, node, input) \
4113     pushinput = input; \
4114     scan = node; \
4115     st->resume_state = state; \
4116     goto push_state;
4117
4118 /* push a new state with success backtracking, then goto it */
4119
4120 #define PUSH_YES_STATE_GOTO(state, node, input) \
4121     pushinput = input; \
4122     scan = node; \
4123     st->resume_state = state; \
4124     goto push_yes_state;
4125
4126
4127
4128
4129 /*
4130
4131 regmatch() - main matching routine
4132
4133 This is basically one big switch statement in a loop. We execute an op,
4134 set 'next' to point the next op, and continue. If we come to a point which
4135 we may need to backtrack to on failure such as (A|B|C), we push a
4136 backtrack state onto the backtrack stack. On failure, we pop the top
4137 state, and re-enter the loop at the state indicated. If there are no more
4138 states to pop, we return failure.
4139
4140 Sometimes we also need to backtrack on success; for example /A+/, where
4141 after successfully matching one A, we need to go back and try to
4142 match another one; similarly for lookahead assertions: if the assertion
4143 completes successfully, we backtrack to the state just before the assertion
4144 and then carry on.  In these cases, the pushed state is marked as
4145 'backtrack on success too'. This marking is in fact done by a chain of
4146 pointers, each pointing to the previous 'yes' state. On success, we pop to
4147 the nearest yes state, discarding any intermediate failure-only states.
4148 Sometimes a yes state is pushed just to force some cleanup code to be
4149 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4150 it to free the inner regex.
4151
4152 Note that failure backtracking rewinds the cursor position, while
4153 success backtracking leaves it alone.
4154
4155 A pattern is complete when the END op is executed, while a subpattern
4156 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4157 ops trigger the "pop to last yes state if any, otherwise return true"
4158 behaviour.
4159
4160 A common convention in this function is to use A and B to refer to the two
4161 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4162 the subpattern to be matched possibly multiple times, while B is the entire
4163 rest of the pattern. Variable and state names reflect this convention.
4164
4165 The states in the main switch are the union of ops and failure/success of
4166 substates associated with with that op.  For example, IFMATCH is the op
4167 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4168 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4169 successfully matched A and IFMATCH_A_fail is a state saying that we have
4170 just failed to match A. Resume states always come in pairs. The backtrack
4171 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4172 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4173 on success or failure.
4174
4175 The struct that holds a backtracking state is actually a big union, with
4176 one variant for each major type of op. The variable st points to the
4177 top-most backtrack struct. To make the code clearer, within each
4178 block of code we #define ST to alias the relevant union.
4179
4180 Here's a concrete example of a (vastly oversimplified) IFMATCH
4181 implementation:
4182
4183     switch (state) {
4184     ....
4185
4186 #define ST st->u.ifmatch
4187
4188     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4189         ST.foo = ...; // some state we wish to save
4190         ...
4191         // push a yes backtrack state with a resume value of
4192         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4193         // first node of A:
4194         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4195         // NOTREACHED
4196
4197     case IFMATCH_A: // we have successfully executed A; now continue with B
4198         next = B;
4199         bar = ST.foo; // do something with the preserved value
4200         break;
4201
4202     case IFMATCH_A_fail: // A failed, so the assertion failed
4203         ...;   // do some housekeeping, then ...
4204         sayNO; // propagate the failure
4205
4206 #undef ST
4207
4208     ...
4209     }
4210
4211 For any old-timers reading this who are familiar with the old recursive
4212 approach, the code above is equivalent to:
4213
4214     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4215     {
4216         int foo = ...
4217         ...
4218         if (regmatch(A)) {
4219             next = B;
4220             bar = foo;
4221             break;
4222         }
4223         ...;   // do some housekeeping, then ...
4224         sayNO; // propagate the failure
4225     }
4226
4227 The topmost backtrack state, pointed to by st, is usually free. If you
4228 want to claim it, populate any ST.foo fields in it with values you wish to
4229 save, then do one of
4230
4231         PUSH_STATE_GOTO(resume_state, node, newinput);
4232         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4233
4234 which sets that backtrack state's resume value to 'resume_state', pushes a
4235 new free entry to the top of the backtrack stack, then goes to 'node'.
4236 On backtracking, the free slot is popped, and the saved state becomes the
4237 new free state. An ST.foo field in this new top state can be temporarily
4238 accessed to retrieve values, but once the main loop is re-entered, it
4239 becomes available for reuse.
4240
4241 Note that the depth of the backtrack stack constantly increases during the
4242 left-to-right execution of the pattern, rather than going up and down with
4243 the pattern nesting. For example the stack is at its maximum at Z at the
4244 end of the pattern, rather than at X in the following:
4245
4246     /(((X)+)+)+....(Y)+....Z/
4247
4248 The only exceptions to this are lookahead/behind assertions and the cut,
4249 (?>A), which pop all the backtrack states associated with A before
4250 continuing.
4251
4252 Backtrack state structs are allocated in slabs of about 4K in size.
4253 PL_regmatch_state and st always point to the currently active state,
4254 and PL_regmatch_slab points to the slab currently containing
4255 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4256 allocated, and is never freed until interpreter destruction. When the slab
4257 is full, a new one is allocated and chained to the end. At exit from
4258 regmatch(), slabs allocated since entry are freed.
4259
4260 */
4261
4262
4263 #define DEBUG_STATE_pp(pp)                                  \
4264     DEBUG_STATE_r({                                         \
4265         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4266         Perl_re_printf( aTHX_                                           \
4267             "%*s" pp " %s%s%s%s%s\n",                       \
4268             INDENT_CHARS(depth), "",                        \
4269             PL_reg_name[st->resume_state],                  \
4270             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4271             ((st==yes_state) ? "Y" : ""),                   \
4272             ((st==mark_state) ? "M" : ""),                  \
4273             ((st==yes_state||st==mark_state) ? "]" : "")    \
4274         );                                                  \
4275     });
4276
4277
4278 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4279
4280 #ifdef DEBUGGING
4281
4282 STATIC void
4283 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4284     const char *start, const char *end, const char *blurb)
4285 {
4286     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4287
4288     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4289
4290     if (!PL_colorset)
4291             reginitcolors();
4292     {
4293         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4294             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4295
4296         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4297             start, end - start, PL_dump_re_max_len);
4298
4299         Perl_re_printf( aTHX_
4300             "%s%s REx%s %s against %s\n",
4301                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4302
4303         if (utf8_target||utf8_pat)
4304             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4305                 utf8_pat ? "pattern" : "",
4306                 utf8_pat && utf8_target ? " and " : "",
4307                 utf8_target ? "string" : ""
4308             );
4309     }
4310 }
4311
4312 STATIC void
4313 S_dump_exec_pos(pTHX_ const char *locinput,
4314                       const regnode *scan,
4315                       const char *loc_regeol,
4316                       const char *loc_bostr,
4317                       const char *loc_reg_starttry,
4318                       const bool utf8_target,
4319                       const U32 depth
4320                 )
4321 {
4322     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4323     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4324     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4325     /* The part of the string before starttry has one color
4326        (pref0_len chars), between starttry and current
4327        position another one (pref_len - pref0_len chars),
4328        after the current position the third one.
4329        We assume that pref0_len <= pref_len, otherwise we
4330        decrease pref0_len.  */
4331     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4332         ? (5 + taill) - l : locinput - loc_bostr;
4333     int pref0_len;
4334
4335     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4336
4337     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4338         pref_len++;
4339     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4340     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4341         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4342               ? (5 + taill) - pref_len : loc_regeol - locinput);
4343     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4344         l--;
4345     if (pref0_len < 0)
4346         pref0_len = 0;
4347     if (pref0_len > pref_len)
4348         pref0_len = pref_len;
4349     {
4350         const int is_uni = utf8_target ? 1 : 0;
4351
4352         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4353             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4354
4355         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4356                     (locinput - pref_len + pref0_len),
4357                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4358
4359         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4360                     locinput, loc_regeol - locinput, 10, 0, 1);
4361
4362         const STRLEN tlen=len0+len1+len2;
4363         Perl_re_printf( aTHX_
4364                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4365                     (IV)(locinput - loc_bostr),
4366                     len0, s0,
4367                     len1, s1,
4368                     (docolor ? "" : "> <"),
4369                     len2, s2,
4370                     (int)(tlen > 19 ? 0 :  19 - tlen),
4371                     "",
4372                     depth);
4373     }
4374 }
4375
4376 #endif
4377
4378 /* reg_check_named_buff_matched()
4379  * Checks to see if a named buffer has matched. The data array of
4380  * buffer numbers corresponding to the buffer is expected to reside
4381  * in the regexp->data->data array in the slot stored in the ARG() of
4382  * node involved. Note that this routine doesn't actually care about the
4383  * name, that information is not preserved from compilation to execution.
4384  * Returns the index of the leftmost defined buffer with the given name
4385  * or 0 if non of the buffers matched.
4386  */
4387 STATIC I32
4388 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4389 {
4390     I32 n;
4391     RXi_GET_DECL(rex,rexi);
4392     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4393     I32 *nums=(I32*)SvPVX(sv_dat);
4394
4395     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4396
4397     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4398         if ((I32)rex->lastparen >= nums[n] &&
4399             rex->offs[nums[n]].end != -1)
4400         {
4401             return nums[n];
4402         }
4403     }
4404     return 0;
4405 }
4406
4407
4408 static bool
4409 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4410         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4411 {
4412     /* This function determines if there are one or two characters that match
4413      * the first character of the passed-in EXACTish node <text_node>, and if
4414      * so, returns them in the passed-in pointers.
4415      *
4416      * If it determines that no possible character in the target string can
4417      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4418      * the first character in <text_node> requires UTF-8 to represent, and the
4419      * target string isn't in UTF-8.)
4420      *
4421      * If there are more than two characters that could match the beginning of
4422      * <text_node>, or if more context is required to determine a match or not,
4423      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4424      *
4425      * The motiviation behind this function is to allow the caller to set up
4426      * tight loops for matching.  If <text_node> is of type EXACT, there is
4427      * only one possible character that can match its first character, and so
4428      * the situation is quite simple.  But things get much more complicated if
4429      * folding is involved.  It may be that the first character of an EXACTFish
4430      * node doesn't participate in any possible fold, e.g., punctuation, so it
4431      * can be matched only by itself.  The vast majority of characters that are
4432      * in folds match just two things, their lower and upper-case equivalents.
4433      * But not all are like that; some have multiple possible matches, or match
4434      * sequences of more than one character.  This function sorts all that out.
4435      *
4436      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4437      * loop of trying to match A*, we know we can't exit where the thing
4438      * following it isn't a B.  And something can't be a B unless it is the
4439      * beginning of B.  By putting a quick test for that beginning in a tight
4440      * loop, we can rule out things that can't possibly be B without having to
4441      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4442      * character, we can make a tight loop matching A*, using the outputs of
4443      * this function.
4444      *
4445      * If the target string to match isn't in UTF-8, and there aren't
4446      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4447      * the one or two possible octets (which are characters in this situation)
4448      * that can match.  In all cases, if there is only one character that can
4449      * match, *<c1p> and *<c2p> will be identical.
4450      *
4451      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4452      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4453      * can match the beginning of <text_node>.  They should be declared with at
4454      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4455      * undefined what these contain.)  If one or both of the buffers are
4456      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4457      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4458      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4459      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4460      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4461
4462     const bool utf8_target = reginfo->is_utf8_target;
4463
4464     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4465     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4466     bool use_chrtest_void = FALSE;
4467     const bool is_utf8_pat = reginfo->is_utf8_pat;
4468
4469     /* Used when we have both utf8 input and utf8 output, to avoid converting
4470      * to/from code points */
4471     bool utf8_has_been_setup = FALSE;
4472
4473     dVAR;
4474
4475     U8 *pat = (U8*)STRING(text_node);
4476     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4477
4478     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4479
4480         /* In an exact node, only one thing can be matched, that first
4481          * character.  If both the pat and the target are UTF-8, we can just
4482          * copy the input to the output, avoiding finding the code point of
4483          * that character */
4484         if (!is_utf8_pat) {
4485             c2 = c1 = *pat;
4486         }
4487         else if (utf8_target) {
4488             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4489             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4490             utf8_has_been_setup = TRUE;
4491         }
4492         else {
4493             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4494         }
4495     }
4496     else { /* an EXACTFish node */
4497         U8 *pat_end = pat + STR_LEN(text_node);
4498
4499         /* An EXACTFL node has at least some characters unfolded, because what
4500          * they match is not known until now.  So, now is the time to fold
4501          * the first few of them, as many as are needed to determine 'c1' and
4502          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4503          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4504          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4505          * need to fold as many characters as a single character can fold to,
4506          * so that later we can check if the first ones are such a multi-char
4507          * fold.  But, in such a pattern only locale-problematic characters
4508          * aren't folded, so we can skip this completely if the first character
4509          * in the node isn't one of the tricky ones */
4510         if (OP(text_node) == EXACTFL) {
4511
4512             if (! is_utf8_pat) {
4513                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4514                 {
4515                     folded[0] = folded[1] = 's';
4516                     pat = folded;
4517                     pat_end = folded + 2;
4518                 }
4519             }
4520             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4521                 U8 *s = pat;
4522                 U8 *d = folded;
4523                 int i;
4524
4525                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4526                     if (isASCII(*s)) {
4527                         *(d++) = (U8) toFOLD_LC(*s);
4528                         s++;
4529                     }
4530                     else {
4531                         STRLEN len;
4532                         _toFOLD_utf8_flags(s,
4533                                            pat_end,
4534                                            d,
4535                                            &len,
4536                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4537                         d += len;
4538                         s += UTF8SKIP(s);
4539                     }
4540                 }
4541
4542                 pat = folded;
4543                 pat_end = d;
4544             }
4545         }
4546
4547         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4548              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4549         {
4550             /* Multi-character folds require more context to sort out.  Also
4551              * PL_utf8_foldclosures used below doesn't handle them, so have to
4552              * be handled outside this routine */
4553             use_chrtest_void = TRUE;
4554         }
4555         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4556             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4557             if (c1 > 255) {
4558                 const unsigned int * remaining_folds_to_list;
4559                 unsigned int first_folds_to;
4560
4561                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4562                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4563                 Size_t folds_to_count = _inverse_folds(c1,
4564                                                      &first_folds_to,
4565                                                      &remaining_folds_to_list);
4566                 if (folds_to_count == 0) {
4567                     c2 = c1;    /* there is only a single character that could
4568                                    match */
4569                 }
4570                 else if (folds_to_count != 1) {
4571                     /* If there aren't exactly two folds to this (itself and
4572                      * another), it is outside the scope of this function */
4573                     use_chrtest_void = TRUE;
4574                 }
4575                 else {  /* There are two.  We already have one, get the other */
4576                     c2 = first_folds_to;
4577
4578                     /* Folds that cross the 255/256 boundary are forbidden if
4579                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4580                      * ASCIII.  The only other match to c1 is c2, and since c1
4581                      * is above 255, c2 better be as well under these
4582                      * circumstances.  If it isn't, it means the only legal
4583                      * match of c1 is itself. */
4584                     if (    c2 < 256
4585                         && (   (   OP(text_node) == EXACTFL
4586                                 && ! IN_UTF8_CTYPE_LOCALE)
4587                             || ((     OP(text_node) == EXACTFAA
4588                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4589                                 && (isASCII(c1) || isASCII(c2)))))
4590                     {
4591                         c2 = c1;
4592                     }
4593                 }
4594             }
4595             else /* Here, c1 is <= 255 */
4596                 if (utf8_target
4597                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4598                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4599                     && ((OP(text_node) != EXACTFAA
4600                         && OP(text_node) != EXACTFAA_NO_TRIE)
4601                         || ! isASCII(c1)))
4602             {
4603                 /* Here, there could be something above Latin1 in the target
4604                  * which folds to this character in the pattern.  All such
4605                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4606                  * than two characters involved in their folds, so are outside
4607                  * the scope of this function */
4608                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4609                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4610                 }
4611                 else {
4612                     use_chrtest_void = TRUE;
4613                 }
4614             }
4615             else { /* Here nothing above Latin1 can fold to the pattern
4616                       character */
4617                 switch (OP(text_node)) {
4618
4619                     case EXACTFL:   /* /l rules */
4620                         c2 = PL_fold_locale[c1];
4621                         break;
4622
4623                     case EXACTF:   /* This node only generated for non-utf8
4624                                     patterns */
4625                         assert(! is_utf8_pat);
4626                         if (! utf8_target) {    /* /d rules */
4627                             c2 = PL_fold[c1];
4628                             break;
4629                         }
4630                         /* FALLTHROUGH */
4631                         /* /u rules for all these.  This happens to work for
4632                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4633                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4634                                                 non-utf8 patterns */
4635                         assert(! is_utf8_pat);
4636                         /* FALLTHROUGH */
4637                     case EXACTFAA:
4638                     case EXACTFU_SS:
4639                     case EXACTFU:
4640                         c2 = PL_fold_latin1[c1];
4641                         break;
4642
4643                     default:
4644                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4645                         NOT_REACHED; /* NOTREACHED */
4646                 }
4647             }
4648         }
4649     }
4650
4651     /* Here have figured things out.  Set up the returns */
4652     if (use_chrtest_void) {
4653         *c2p = *c1p = CHRTEST_VOID;
4654     }
4655     else if (utf8_target) {
4656         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4657             uvchr_to_utf8(c1_utf8, c1);
4658             uvchr_to_utf8(c2_utf8, c2);
4659         }
4660
4661         /* Invariants are stored in both the utf8 and byte outputs; Use
4662          * negative numbers otherwise for the byte ones.  Make sure that the
4663          * byte ones are the same iff the utf8 ones are the same */
4664         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4665         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4666                 ? *c2_utf8
4667                 : (c1 == c2)
4668                   ? CHRTEST_NOT_A_CP_1
4669                   : CHRTEST_NOT_A_CP_2;
4670     }
4671     else if (c1 > 255) {
4672        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4673                            can represent */
4674            return FALSE;
4675        }
4676
4677        *c1p = *c2p = c2;    /* c2 is the only representable value */
4678     }
4679     else {  /* c1 is representable; see about c2 */
4680        *c1p = c1;
4681        *c2p = (c2 < 256) ? c2 : c1;
4682     }
4683
4684     return TRUE;
4685 }
4686
4687 STATIC bool
4688 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4689 {
4690     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4691      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4692
4693     PERL_ARGS_ASSERT_ISGCB;
4694
4695     switch (GCB_table[before][after]) {
4696         case GCB_BREAKABLE:
4697             return TRUE;
4698
4699         case GCB_NOBREAK:
4700             return FALSE;
4701
4702         case GCB_RI_then_RI:
4703             {
4704                 int RI_count = 1;
4705                 U8 * temp_pos = (U8 *) curpos;
4706
4707                 /* Do not break within emoji flag sequences. That is, do not
4708                  * break between regional indicator (RI) symbols if there is an
4709                  * odd number of RI characters before the break point.
4710                  *  GB12   sot (RI RI)* RI × RI
4711                  *  GB13 [^RI] (RI RI)* RI × RI */
4712
4713                 while (backup_one_GCB(strbeg,
4714                                     &temp_pos,
4715                                     utf8_target) == GCB_Regional_Indicator)
4716                 {
4717                     RI_count++;
4718                 }
4719
4720                 return RI_count % 2 != 1;
4721             }
4722
4723         case GCB_EX_then_EM:
4724
4725             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4726             {
4727                 U8 * temp_pos = (U8 *) curpos;
4728                 GCB_enum prev;
4729
4730                 do {
4731                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4732                 }
4733                 while (prev == GCB_Extend);
4734
4735                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4736             }
4737
4738         case GCB_Maybe_Emoji_NonBreak:
4739
4740             {
4741
4742             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4743               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4744               */
4745                 U8 * temp_pos = (U8 *) curpos;
4746                 GCB_enum prev;
4747
4748                 do {
4749                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4750                 }
4751                 while (prev == GCB_Extend);
4752
4753                 return prev != GCB_XPG_XX;
4754             }
4755
4756         default:
4757             break;
4758     }
4759
4760 #ifdef DEBUGGING
4761     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4762                                   before, after, GCB_table[before][after]);
4763     assert(0);
4764 #endif
4765     return TRUE;
4766 }
4767
4768 STATIC GCB_enum
4769 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4770 {
4771     GCB_enum gcb;
4772
4773     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4774
4775     if (*curpos < strbeg) {
4776         return GCB_EDGE;
4777     }
4778
4779     if (utf8_target) {
4780         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4781         U8 * prev_prev_char_pos;
4782
4783         if (! prev_char_pos) {
4784             return GCB_EDGE;
4785         }
4786
4787         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4788             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4789             *curpos = prev_char_pos;
4790             prev_char_pos = prev_prev_char_pos;
4791         }
4792         else {
4793             *curpos = (U8 *) strbeg;
4794             return GCB_EDGE;
4795         }
4796     }
4797     else {
4798         if (*curpos - 2 < strbeg) {
4799             *curpos = (U8 *) strbeg;
4800             return GCB_EDGE;
4801         }
4802         (*curpos)--;
4803         gcb = getGCB_VAL_CP(*(*curpos - 1));
4804     }
4805
4806     return gcb;
4807 }
4808
4809 /* Combining marks attach to most classes that precede them, but this defines
4810  * the exceptions (from TR14) */
4811 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4812                                      || prev == LB_Mandatory_Break      \
4813                                      || prev == LB_Carriage_Return      \
4814                                      || prev == LB_Line_Feed            \
4815                                      || prev == LB_Next_Line            \
4816                                      || prev == LB_Space                \
4817                                      || prev == LB_ZWSpace))
4818
4819 STATIC bool
4820 S_isLB(pTHX_ LB_enum before,
4821              LB_enum after,
4822              const U8 * const strbeg,
4823              const U8 * const curpos,
4824              const U8 * const strend,
4825              const bool utf8_target)
4826 {
4827     U8 * temp_pos = (U8 *) curpos;
4828     LB_enum prev = before;
4829
4830     /* Is the boundary between 'before' and 'after' line-breakable?
4831      * Most of this is just a table lookup of a generated table from Unicode
4832      * rules.  But some rules require context to decide, and so have to be
4833      * implemented in code */
4834
4835     PERL_ARGS_ASSERT_ISLB;
4836
4837     /* Rule numbers in the comments below are as of Unicode 9.0 */
4838
4839   redo:
4840     before = prev;
4841     switch (LB_table[before][after]) {
4842         case LB_BREAKABLE:
4843             return TRUE;
4844
4845         case LB_NOBREAK:
4846         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4847             return FALSE;
4848
4849         case LB_SP_foo + LB_BREAKABLE:
4850         case LB_SP_foo + LB_NOBREAK:
4851         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4852
4853             /* When we have something following a SP, we have to look at the
4854              * context in order to know what to do.
4855              *
4856              * SP SP should not reach here because LB7: Do not break before
4857              * spaces.  (For two spaces in a row there is nothing that
4858              * overrides that) */
4859             assert(after != LB_Space);
4860
4861             /* Here we have a space followed by a non-space.  Mostly this is a
4862              * case of LB18: "Break after spaces".  But there are complications
4863              * as the handling of spaces is somewhat tricky.  They are in a
4864              * number of rules, which have to be applied in priority order, but
4865              * something earlier in the string can cause a rule to be skipped
4866              * and a lower priority rule invoked.  A prime example is LB7 which
4867              * says don't break before a space.  But rule LB8 (lower priority)
4868              * says that the first break opportunity after a ZW is after any
4869              * span of spaces immediately after it.  If a ZW comes before a SP
4870              * in the input, rule LB8 applies, and not LB7.  Other such rules
4871              * involve combining marks which are rules 9 and 10, but they may
4872              * override higher priority rules if they come earlier in the
4873              * string.  Since we're doing random access into the middle of the
4874              * string, we have to look for rules that should get applied based
4875              * on both string position and priority.  Combining marks do not
4876              * attach to either ZW nor SP, so we don't have to consider them
4877              * until later.
4878              *
4879              * To check for LB8, we have to find the first non-space character
4880              * before this span of spaces */
4881             do {
4882                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4883             }
4884             while (prev == LB_Space);
4885
4886             /* LB8 Break before any character following a zero-width space,
4887              * even if one or more spaces intervene.
4888              *      ZW SP* ÷
4889              * So if we have a ZW just before this span, and to get here this
4890              * is the final space in the span. */
4891             if (prev == LB_ZWSpace) {
4892                 return TRUE;
4893             }
4894
4895             /* Here, not ZW SP+.  There are several rules that have higher
4896              * priority than LB18 and can be resolved now, as they don't depend
4897              * on anything earlier in the string (except ZW, which we have
4898              * already handled).  One of these rules is LB11 Do not break
4899              * before Word joiner, but we have specially encoded that in the
4900              * lookup table so it is caught by the single test below which
4901              * catches the other ones. */
4902             if (LB_table[LB_Space][after] - LB_SP_foo
4903                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4904             {
4905                 return FALSE;
4906             }
4907
4908             /* If we get here, we have to XXX consider combining marks. */
4909             if (prev == LB_Combining_Mark) {
4910
4911                 /* What happens with these depends on the character they
4912                  * follow.  */
4913                 do {
4914                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4915                 }
4916                 while (prev == LB_Combining_Mark);
4917
4918                 /* Most times these attach to and inherit the characteristics
4919                  * of that character, but not always, and when not, they are to
4920                  * be treated as AL by rule LB10. */
4921                 if (! LB_CM_ATTACHES_TO(prev)) {
4922                     prev = LB_Alphabetic;
4923                 }
4924             }
4925
4926             /* Here, we have the character preceding the span of spaces all set
4927              * up.  We follow LB18: "Break after spaces" unless the table shows
4928              * that is overriden */
4929             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4930
4931         case LB_CM_ZWJ_foo:
4932
4933             /* We don't know how to treat the CM except by looking at the first
4934              * non-CM character preceding it.  ZWJ is treated as CM */
4935             do {
4936                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4937             }
4938             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4939
4940             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4941              * attatches to it, then it inherits the behavior of 'prev'.  If it
4942              * doesn't attach, it is to be treated as an AL */
4943             if (! LB_CM_ATTACHES_TO(prev)) {
4944                 prev = LB_Alphabetic;
4945             }
4946
4947             goto redo;
4948
4949         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4950         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4951
4952             /* LB21a Don't break after Hebrew + Hyphen.
4953              * HL (HY | BA) × */
4954
4955             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4956                                                           == LB_Hebrew_Letter)
4957             {
4958                 return FALSE;
4959             }
4960
4961             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4962
4963         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4964         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4965
4966             /* LB25a (PR | PO) × ( OP | HY )? NU */
4967             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4968                 return FALSE;
4969             }
4970
4971             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4972                                                                 == LB_BREAKABLE;
4973
4974         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4975         case LB_SY_or_IS_then_various + LB_NOBREAK:
4976         {
4977             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4978
4979             LB_enum temp = prev;
4980             do {
4981                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4982             }
4983             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4984             if (temp == LB_Numeric) {
4985                 return FALSE;
4986             }
4987
4988             return LB_table[prev][after] - LB_SY_or_IS_then_various
4989                                                                == LB_BREAKABLE;
4990         }
4991
4992         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4993         case LB_various_then_PO_or_PR + LB_NOBREAK:
4994         {
4995             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4996
4997             LB_enum temp = prev;
4998             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4999             {
5000                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5001             }
5002             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5003                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5004             }
5005             if (temp == LB_Numeric) {
5006                 return FALSE;
5007             }
5008             return LB_various_then_PO_or_PR;
5009         }
5010
5011         case LB_RI_then_RI + LB_NOBREAK:
5012         case LB_RI_then_RI + LB_BREAKABLE:
5013             {
5014                 int RI_count = 1;
5015
5016                 /* LB30a Break between two regional indicator symbols if and
5017                  * only if there are an even number of regional indicators
5018                  * preceding the position of the break.
5019                  *
5020                  *    sot (RI RI)* RI × RI
5021                  *  [^RI] (RI RI)* RI × RI */
5022
5023                 while (backup_one_LB(strbeg,
5024                                      &temp_pos,
5025                                      utf8_target) == LB_Regional_Indicator)
5026                 {
5027                     RI_count++;
5028                 }
5029
5030                 return RI_count % 2 == 0;
5031             }
5032
5033         default:
5034             break;
5035     }
5036
5037 #ifdef DEBUGGING
5038     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5039                                   before, after, LB_table[before][after]);
5040     assert(0);
5041 #endif
5042     return TRUE;
5043 }
5044
5045 STATIC LB_enum
5046 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5047 {
5048     LB_enum lb;
5049
5050     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5051
5052     if (*curpos >= strend) {
5053         return LB_EDGE;
5054     }
5055
5056     if (utf8_target) {
5057         *curpos += UTF8SKIP(*curpos);
5058         if (*curpos >= strend) {
5059             return LB_EDGE;
5060         }
5061         lb = getLB_VAL_UTF8(*curpos, strend);
5062     }
5063     else {
5064         (*curpos)++;
5065         if (*curpos >= strend) {
5066             return LB_EDGE;
5067         }
5068         lb = getLB_VAL_CP(**curpos);
5069     }
5070
5071     return lb;
5072 }
5073
5074 STATIC LB_enum
5075 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5076 {
5077     LB_enum lb;
5078
5079     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5080
5081     if (*curpos < strbeg) {
5082         return LB_EDGE;
5083     }
5084
5085     if (utf8_target) {
5086         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5087         U8 * prev_prev_char_pos;
5088
5089         if (! prev_char_pos) {
5090             return LB_EDGE;
5091         }
5092
5093         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5094             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5095             *curpos = prev_char_pos;
5096             prev_char_pos = prev_prev_char_pos;
5097         }
5098         else {
5099             *curpos = (U8 *) strbeg;
5100             return LB_EDGE;
5101         }
5102     }
5103     else {
5104         if (*curpos - 2 < strbeg) {
5105             *curpos = (U8 *) strbeg;
5106             return LB_EDGE;
5107         }
5108         (*curpos)--;
5109         lb = getLB_VAL_CP(*(*curpos - 1));
5110     }
5111
5112     return lb;
5113 }
5114
5115 STATIC bool
5116 S_isSB(pTHX_ SB_enum before,
5117              SB_enum after,
5118              const U8 * const strbeg,
5119              const U8 * const curpos,
5120              const U8 * const strend,
5121              const bool utf8_target)
5122 {
5123     /* returns a boolean indicating if there is a Sentence Boundary Break
5124      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
5125
5126     U8 * lpos = (U8 *) curpos;
5127     bool has_para_sep = FALSE;
5128     bool has_sp = FALSE;
5129
5130     PERL_ARGS_ASSERT_ISSB;
5131
5132     /* Break at the start and end of text.
5133         SB1.  sot  ÷
5134         SB2.  ÷  eot
5135       But unstated in Unicode is don't break if the text is empty */
5136     if (before == SB_EDGE || after == SB_EDGE) {
5137         return before != after;
5138     }
5139
5140     /* SB 3: Do not break within CRLF. */
5141     if (before == SB_CR && after == SB_LF) {
5142         return FALSE;
5143     }
5144
5145     /* Break after paragraph separators.  CR and LF are considered
5146      * so because Unicode views text as like word processing text where there
5147      * are no newlines except between paragraphs, and the word processor takes
5148      * care of wrapping without there being hard line-breaks in the text *./
5149        SB4.  Sep | CR | LF  ÷ */
5150     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5151         return TRUE;
5152     }
5153
5154     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5155      * (See Section 6.2, Replacing Ignore Rules.)
5156         SB5.  X (Extend | Format)*  →  X */
5157     if (after == SB_Extend || after == SB_Format) {
5158
5159         /* Implied is that the these characters attach to everything
5160          * immediately prior to them except for those separator-type
5161          * characters.  And the rules earlier have already handled the case
5162          * when one of those immediately precedes the extend char */
5163         return FALSE;
5164     }
5165
5166     if (before == SB_Extend || before == SB_Format) {
5167         U8 * temp_pos = lpos;
5168         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5169         if (   backup != SB_EDGE
5170             && backup != SB_Sep
5171             && backup != SB_CR
5172             && backup != SB_LF)
5173         {
5174             before = backup;
5175             lpos = temp_pos;
5176         }
5177
5178         /* Here, both 'before' and 'backup' are these types; implied is that we
5179          * don't break between them */
5180         if (backup == SB_Extend || backup == SB_Format) {
5181             return FALSE;
5182         }
5183     }
5184
5185     /* Do not break after ambiguous terminators like period, if they are
5186      * immediately followed by a number or lowercase letter, if they are
5187      * between uppercase letters, if the first following letter (optionally
5188      * after certain punctuation) is lowercase, or if they are followed by
5189      * "continuation" punctuation such as comma, colon, or semicolon. For
5190      * example, a period may be an abbreviation or numeric period, and thus may
5191      * not mark the end of a sentence.
5192
5193      * SB6. ATerm  ×  Numeric */
5194     if (before == SB_ATerm && after == SB_Numeric) {
5195         return FALSE;
5196     }
5197
5198     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5199     if (before == SB_ATerm && after == SB_Upper) {
5200         U8 * temp_pos = lpos;
5201         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5202         if (backup == SB_Upper || backup == SB_Lower) {
5203             return FALSE;
5204         }
5205     }
5206
5207     /* The remaining rules that aren't the final one, all require an STerm or
5208      * an ATerm after having backed up over some Close* Sp*, and in one case an
5209      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5210      * So do that backup now, setting flags if either Sp or a paragraph
5211      * separator are found */
5212
5213     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5214         has_para_sep = TRUE;
5215         before = backup_one_SB(strbeg, &lpos, utf8_target);
5216     }
5217
5218     if (before == SB_Sp) {
5219         has_sp = TRUE;
5220         do {
5221             before = backup_one_SB(strbeg, &lpos, utf8_target);
5222         }
5223         while (before == SB_Sp);
5224     }
5225
5226     while (before == SB_Close) {
5227         before = backup_one_SB(strbeg, &lpos, utf8_target);
5228     }
5229
5230     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5231      * most cases an STerm */
5232     if (before == SB_STerm || before == SB_ATerm) {
5233
5234         /* So, here the lhs matches
5235          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5236          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5237          * The rules that apply here are:
5238          *
5239          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5240                                            | LF | STerm | ATerm) )* Lower
5241            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5242            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5243            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5244            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5245          */
5246
5247         /* And all but SB11 forbid having seen a paragraph separator */
5248         if (! has_para_sep) {
5249             if (before == SB_ATerm) {          /* SB8 */
5250                 U8 * rpos = (U8 *) curpos;
5251                 SB_enum later = after;
5252
5253                 while (    later != SB_OLetter
5254                         && later != SB_Upper
5255                         && later != SB_Lower
5256                         && later != SB_Sep
5257                         && later != SB_CR
5258                         && later != SB_LF
5259                         && later != SB_STerm
5260                         && later != SB_ATerm
5261                         && later != SB_EDGE)
5262                 {
5263                     later = advance_one_SB(&rpos, strend, utf8_target);
5264                 }
5265                 if (later == SB_Lower) {
5266                     return FALSE;
5267                 }
5268             }
5269
5270             if (   after == SB_SContinue    /* SB8a */
5271                 || after == SB_STerm
5272                 || after == SB_ATerm)
5273             {
5274                 return FALSE;
5275             }
5276
5277             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5278                 if (   after == SB_Close
5279                     || after == SB_Sp
5280                     || after == SB_Sep
5281                     || after == SB_CR
5282                     || after == SB_LF)
5283                 {
5284                     return FALSE;
5285                 }
5286             }
5287
5288             /* SB10.  This and SB9 could probably be combined some way, but khw
5289              * has decided to follow the Unicode rule book precisely for
5290              * simplified maintenance */
5291             if (   after == SB_Sp
5292                 || after == SB_Sep
5293                 || after == SB_CR
5294                 || after == SB_LF)
5295             {
5296                 return FALSE;
5297             }
5298         }
5299
5300         /* SB11.  */
5301         return TRUE;
5302     }
5303
5304     /* Otherwise, do not break.
5305     SB12.  Any  ×  Any */
5306
5307     return FALSE;
5308 }
5309
5310 STATIC SB_enum
5311 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5312 {
5313     SB_enum sb;
5314
5315     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5316
5317     if (*curpos >= strend) {
5318         return SB_EDGE;
5319     }
5320
5321     if (utf8_target) {
5322         do {
5323             *curpos += UTF8SKIP(*curpos);
5324             if (*curpos >= strend) {
5325                 return SB_EDGE;
5326             }
5327             sb = getSB_VAL_UTF8(*curpos, strend);
5328         } while (sb == SB_Extend || sb == SB_Format);
5329     }
5330     else {
5331         do {
5332             (*curpos)++;
5333             if (*curpos >= strend) {
5334                 return SB_EDGE;
5335             }
5336             sb = getSB_VAL_CP(**curpos);
5337         } while (sb == SB_Extend || sb == SB_Format);
5338     }
5339
5340     return sb;
5341 }
5342
5343 STATIC SB_enum
5344 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5345 {
5346     SB_enum sb;
5347
5348     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5349
5350     if (*curpos < strbeg) {
5351         return SB_EDGE;
5352     }
5353
5354     if (utf8_target) {
5355         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5356         if (! prev_char_pos) {
5357             return SB_EDGE;
5358         }
5359
5360         /* Back up over Extend and Format.  curpos is always just to the right
5361          * of the characater whose value we are getting */
5362         do {
5363             U8 * prev_prev_char_pos;
5364             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5365                                                                       strbeg)))
5366             {
5367                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5368                 *curpos = prev_char_pos;
5369                 prev_char_pos = prev_prev_char_pos;
5370             }
5371             else {
5372                 *curpos = (U8 *) strbeg;
5373                 return SB_EDGE;
5374             }
5375         } while (sb == SB_Extend || sb == SB_Format);
5376     }
5377     else {
5378         do {
5379             if (*curpos - 2 < strbeg) {
5380                 *curpos = (U8 *) strbeg;
5381                 return SB_EDGE;
5382             }
5383             (*curpos)--;
5384             sb = getSB_VAL_CP(*(*curpos - 1));
5385         } while (sb == SB_Extend || sb == SB_Format);
5386     }
5387
5388     return sb;
5389 }
5390
5391 STATIC bool
5392 S_isWB(pTHX_ WB_enum previous,
5393              WB_enum before,
5394              WB_enum after,
5395              const U8 * const strbeg,
5396              const U8 * const curpos,
5397              const U8 * const strend,
5398              const bool utf8_target)
5399 {
5400     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5401      *  a Unicode word break, using their published algorithm, but tailored for
5402      *  Perl by treating spans of white space as one unit.  Context may be
5403      *  needed to make this determination.  If the value for the character
5404      *  before 'before' is known, it is passed as 'previous'; otherwise that
5405      *  should be set to WB_UNKNOWN.  The other input parameters give the
5406      *  boundaries and current position in the matching of the string.  That
5407      *  is, 'curpos' marks the position where the character whose wb value is
5408      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5409
5410     U8 * before_pos = (U8 *) curpos;
5411     U8 * after_pos = (U8 *) curpos;
5412     WB_enum prev = before;
5413     WB_enum next;
5414
5415     PERL_ARGS_ASSERT_ISWB;
5416
5417     /* Rule numbers in the comments below are as of Unicode 9.0 */
5418
5419   redo:
5420     before = prev;
5421     switch (WB_table[before][after]) {
5422         case WB_BREAKABLE:
5423             return TRUE;
5424
5425         case WB_NOBREAK:
5426             return FALSE;
5427
5428         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5429             next = advance_one_WB(&after_pos, strend, utf8_target,
5430                                  FALSE /* Don't skip Extend nor Format */ );
5431             /* A space immediately preceeding an Extend or Format is attached
5432              * to by them, and hence gets separated from previous spaces.
5433              * Otherwise don't break between horizontal white space */
5434             return next == WB_Extend || next == WB_Format;
5435
5436         /* WB4 Ignore Format and Extend characters, except when they appear at
5437          * the beginning of a region of text.  This code currently isn't
5438          * general purpose, but it works as the rules are currently and likely
5439          * to be laid out.  The reason it works is that when 'they appear at
5440          * the beginning of a region of text', the rule is to break before
5441          * them, just like any other character.  Therefore, the default rule
5442          * applies and we don't have to look in more depth.  Should this ever
5443          * change, we would have to have 2 'case' statements, like in the rules
5444          * below, and backup a single character (not spacing over the extend
5445          * ones) and then see if that is one of the region-end characters and
5446          * go from there */
5447         case WB_Ex_or_FO_or_ZWJ_then_foo:
5448             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5449             goto redo;
5450
5451         case WB_DQ_then_HL + WB_BREAKABLE:
5452         case WB_DQ_then_HL + WB_NOBREAK:
5453
5454             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5455
5456             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5457                                                             == WB_Hebrew_Letter)
5458             {
5459                 return FALSE;
5460             }
5461
5462              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5463
5464         case WB_HL_then_DQ + WB_BREAKABLE:
5465         case WB_HL_then_DQ + WB_NOBREAK:
5466
5467             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5468
5469             if (advance_one_WB(&after_pos, strend, utf8_target,
5470                                        TRUE /* Do skip Extend and Format */ )
5471                                                             == WB_Hebrew_Letter)
5472             {
5473                 return FALSE;
5474             }
5475
5476             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5477
5478         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5479         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5480
5481             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5482              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5483
5484             next = advance_one_WB(&after_pos, strend, utf8_target,
5485                                        TRUE /* Do skip Extend and Format */ );
5486
5487             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5488             {
5489                 return FALSE;
5490             }
5491
5492             return WB_table[before][after]
5493                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5494
5495         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5496         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5497
5498             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5499              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5500
5501             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5502             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5503             {
5504                 return FALSE;
5505             }
5506
5507             return WB_table[before][after]
5508                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5509
5510         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5511         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5512
5513             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5514              * */
5515
5516             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5517                                                             == WB_Numeric)
5518             {
5519                 return FALSE;
5520             }
5521
5522             return WB_table[before][after]
5523                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5524
5525         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5526         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5527
5528             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5529
5530             if (advance_one_WB(&after_pos, strend, utf8_target,
5531                                        TRUE /* Do skip Extend and Format */ )
5532                                                             == WB_Numeric)
5533             {
5534                 return FALSE;
5535             }
5536
5537             return WB_table[before][after]
5538                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5539
5540         case WB_RI_then_RI + WB_NOBREAK:
5541         case WB_RI_then_RI + WB_BREAKABLE:
5542             {
5543                 int RI_count = 1;
5544
5545                 /* Do not break within emoji flag sequences. That is, do not
5546                  * break between regional indicator (RI) symbols if there is an
5547                  * odd number of RI characters before the potential break
5548                  * point.
5549                  *
5550                  * WB15   sot (RI RI)* RI × RI
5551                  * WB16 [^RI] (RI RI)* RI × RI */
5552
5553                 while (backup_one_WB(&previous,
5554                                      strbeg,
5555                                      &before_pos,
5556                                      utf8_target) == WB_Regional_Indicator)
5557                 {
5558                     RI_count++;
5559                 }
5560
5561                 return RI_count % 2 != 1;
5562             }
5563
5564         default:
5565             break;
5566     }
5567
5568 #ifdef DEBUGGING
5569     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5570                                   before, after, WB_table[before][after]);
5571     assert(0);
5572 #endif
5573     return TRUE;
5574 }
5575
5576 STATIC WB_enum
5577 S_advance_one_WB(pTHX_ U8 ** curpos,
5578                        const U8 * const strend,
5579                        const bool utf8_target,
5580                        const bool skip_Extend_Format)
5581 {
5582     WB_enum wb;
5583
5584     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5585
5586     if (*curpos >= strend) {
5587         return WB_EDGE;
5588     }
5589
5590     if (utf8_target) {
5591
5592         /* Advance over Extend and Format */
5593         do {
5594             *curpos += UTF8SKIP(*curpos);
5595             if (*curpos >= strend) {
5596                 return WB_EDGE;
5597             }
5598             wb = getWB_VAL_UTF8(*curpos, strend);
5599         } while (    skip_Extend_Format
5600                  && (wb == WB_Extend || wb == WB_Format));
5601     }
5602     else {
5603         do {
5604             (*curpos)++;
5605             if (*curpos >= strend) {
5606                 return WB_EDGE;
5607             }
5608             wb = getWB_VAL_CP(**curpos);
5609         } while (    skip_Extend_Format
5610                  && (wb == WB_Extend || wb == WB_Format));
5611     }
5612
5613     return wb;
5614 }
5615
5616 STATIC WB_enum
5617 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5618 {
5619     WB_enum wb;
5620
5621     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5622
5623     /* If we know what the previous character's break value is, don't have
5624         * to look it up */
5625     if (*previous != WB_UNKNOWN) {
5626         wb = *previous;
5627
5628         /* But we need to move backwards by one */
5629         if (utf8_target) {
5630             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5631             if (! *curpos) {
5632                 *previous = WB_EDGE;
5633                 *curpos = (U8 *) strbeg;
5634             }
5635             else {
5636                 *previous = WB_UNKNOWN;
5637             }
5638         }
5639         else {
5640             (*curpos)--;
5641             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5642         }
5643
5644         /* And we always back up over these three types */
5645         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5646             return wb;
5647         }
5648     }
5649
5650     if (*curpos < strbeg) {
5651         return WB_EDGE;
5652     }
5653
5654     if (utf8_target) {
5655         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5656         if (! prev_char_pos) {
5657             return WB_EDGE;
5658         }
5659
5660         /* Back up over Extend and Format.  curpos is always just to the right
5661          * of the characater whose value we are getting */
5662         do {
5663             U8 * prev_prev_char_pos;
5664             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5665                                                    -1,
5666                                                    strbeg)))
5667             {
5668                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5669                 *curpos = prev_char_pos;
5670                 prev_char_pos = prev_prev_char_pos;
5671             }
5672             else {
5673                 *curpos = (U8 *) strbeg;
5674                 return WB_EDGE;
5675             }
5676         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5677     }
5678     else {
5679         do {
5680             if (*curpos - 2 < strbeg) {
5681                 *curpos = (U8 *) strbeg;
5682                 return WB_EDGE;
5683             }
5684             (*curpos)--;
5685             wb = getWB_VAL_CP(*(*curpos - 1));
5686         } while (wb == WB_Extend || wb == WB_Format);
5687     }
5688
5689     return wb;
5690 }
5691
5692 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5693 (                                                           \
5694     (   ( st )                                         ) && \
5695     (   ( st )->u.eval.close_paren                     ) && \
5696     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5697 )
5698
5699 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5700 (                                                           \
5701     (   ( st )                                         ) && \
5702     (   ( st )->u.eval.close_paren                     ) && \
5703     (   ( expr )                                       ) && \
5704     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5705 )
5706
5707
5708 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5709     (st)->u.eval.close_paren = ( (expr) + 1 )
5710
5711 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5712     (st)->u.eval.close_paren = 0
5713
5714 /* returns -1 on failure, $+[0] on success */
5715 STATIC SSize_t
5716 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5717 {
5718     dVAR;
5719     const bool utf8_target = reginfo->is_utf8_target;
5720     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5721     REGEXP *rex_sv = reginfo->prog;
5722     regexp *rex = ReANY(rex_sv);
5723     RXi_GET_DECL(rex,rexi);
5724     /* the current state. This is a cached copy of PL_regmatch_state */
5725     regmatch_state *st;
5726     /* cache heavy used fields of st in registers */
5727     regnode *scan;
5728     regnode *next;
5729     U32 n = 0;  /* general value; init to avoid compiler warning */
5730     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5731     SSize_t endref = 0; /* offset of end of backref when ln is start */
5732     char *locinput = startpos;
5733     char *pushinput; /* where to continue after a PUSH */
5734     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5735
5736     bool result = 0;        /* return value of S_regmatch */
5737     U32 depth = 0;            /* depth of backtrack stack */
5738     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5739     const U32 max_nochange_depth =
5740         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5741         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5742     regmatch_state *yes_state = NULL; /* state to pop to on success of
5743                                                             subpattern */
5744     /* mark_state piggy backs on the yes_state logic so that when we unwind
5745        the stack on success we can update the mark_state as we go */
5746     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5747     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5748     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5749     U32 state_num;
5750     bool no_final = 0;      /* prevent failure from backtracking? */
5751     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5752     char *startpoint = locinput;
5753     SV *popmark = NULL;     /* are we looking for a mark? */
5754     SV *sv_commit = NULL;   /* last mark name seen in failure */
5755     SV *sv_yes_mark = NULL; /* last mark name we have seen
5756                                during a successful match */
5757     U32 lastopen = 0;       /* last open we saw */
5758     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5759     SV* const oreplsv = GvSVn(PL_replgv);
5760     /* these three flags are set by various ops to signal information to
5761      * the very next op. They have a useful lifetime of exactly one loop
5762      * iteration, and are not preserved or restored by state pushes/pops
5763      */
5764     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5765     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5766     int logical = 0;        /* the following EVAL is:
5767                                 0: (?{...})
5768                                 1: (?(?{...})X|Y)
5769                                 2: (??{...})
5770                                or the following IFMATCH/UNLESSM is:
5771                                 false: plain (?=foo)
5772                                 true:  used as a condition: (?(?=foo))
5773                             */
5774     PAD* last_pad = NULL;
5775     dMULTICALL;
5776     U8 gimme = G_SCALAR;
5777     CV *caller_cv = NULL;       /* who called us */
5778     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5779     U32 maxopenparen = 0;       /* max '(' index seen so far */
5780     int to_complement;  /* Invert the result? */
5781     _char_class_number classnum;
5782     bool is_utf8_pat = reginfo->is_utf8_pat;
5783     bool match = FALSE;
5784     I32 orig_savestack_ix = PL_savestack_ix;
5785     U8 * script_run_begin = NULL;
5786
5787 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5788 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5789 #  define SOLARIS_BAD_OPTIMIZER
5790     const U32 *pl_charclass_dup = PL_charclass;
5791 #  define PL_charclass pl_charclass_dup
5792 #endif
5793
5794 #ifdef DEBUGGING
5795     GET_RE_DEBUG_FLAGS_DECL;
5796 #endif
5797
5798     /* protect against undef(*^R) */
5799     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5800
5801     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5802     multicall_oldcatch = 0;
5803     PERL_UNUSED_VAR(multicall_cop);
5804
5805     PERL_ARGS_ASSERT_REGMATCH;
5806
5807     st = PL_regmatch_state;
5808
5809     /* Note that nextchr is a byte even in UTF */
5810     SET_nextchr;
5811     scan = prog;
5812
5813     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5814             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5815             Perl_re_printf( aTHX_ "regmatch start\n" );
5816     }));
5817
5818     while (scan != NULL) {
5819         next = scan + NEXT_OFF(scan);
5820         if (next == scan)
5821             next = NULL;
5822         state_num = OP(scan);
5823
5824       reenter_switch:
5825         DEBUG_EXECUTE_r(
5826             if (state_num <= REGNODE_MAX) {
5827                 SV * const prop = sv_newmortal();
5828                 regnode *rnext = regnext(scan);
5829
5830                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5831                 regprop(rex, prop, scan, reginfo, NULL);
5832                 Perl_re_printf( aTHX_
5833                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5834                     INDENT_CHARS(depth), "",
5835                     (IV)(scan - rexi->program),
5836                     SvPVX_const(prop),
5837                     (PL_regkind[OP(scan)] == END || !rnext) ?
5838                         0 : (IV)(rnext - rexi->program));
5839             }
5840         );
5841
5842         to_complement = 0;
5843
5844         SET_nextchr;
5845         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5846
5847         switch (state_num) {
5848         case SBOL: /*  /^../ and /\A../  */
5849             if (locinput == reginfo->strbeg)
5850                 break;
5851             sayNO;
5852
5853         case MBOL: /*  /^../m  */
5854             if (locinput == reginfo->strbeg ||
5855                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5856             {
5857                 break;
5858             }
5859             sayNO;
5860
5861         case GPOS: /*  \G  */
5862             if (locinput == reginfo->ganch)
5863                 break;
5864             sayNO;
5865
5866         case KEEPS: /*   \K  */
5867             /* update the startpoint */
5868             st->u.keeper.val = rex->offs[0].start;
5869             rex->offs[0].start = locinput - reginfo->strbeg;
5870             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5871             NOT_REACHED; /* NOTREACHED */
5872
5873         case KEEPS_next_fail:
5874             /* rollback the start point change */
5875             rex->offs[0].start = st->u.keeper.val;
5876             sayNO_SILENT;
5877             NOT_REACHED; /* NOTREACHED */
5878
5879         case MEOL: /* /..$/m  */
5880             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5881                 sayNO;
5882             break;
5883
5884         case SEOL: /* /..$/  */
5885             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5886                 sayNO;
5887             if (reginfo->strend - locinput > 1)
5888                 sayNO;
5889             break;
5890
5891         case EOS: /*  \z  */
5892             if (!NEXTCHR_IS_EOS)
5893                 sayNO;
5894             break;
5895
5896         case SANY: /*  /./s  */
5897             if (NEXTCHR_IS_EOS)
5898                 sayNO;
5899             goto increment_locinput;
5900
5901         case REG_ANY: /*  /./  */
5902             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5903                 sayNO;
5904             goto increment_locinput;
5905
5906
5907 #undef  ST
5908 #define ST st->u.trie
5909         case TRIEC: /* (ab|cd) with known charclass */
5910             /* In this case the charclass data is available inline so
5911                we can fail fast without a lot of extra overhead.
5912              */
5913             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5914                 DEBUG_EXECUTE_r(
5915                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5916                               depth, PL_colors[4], PL_colors[5])
5917                 );
5918                 sayNO_SILENT;
5919                 NOT_REACHED; /* NOTREACHED */
5920             }
5921             /* FALLTHROUGH */
5922         case TRIE:  /* (ab|cd)  */
5923             /* the basic plan of execution of the trie is:
5924              * At the beginning, run though all the states, and
5925              * find the longest-matching word. Also remember the position
5926              * of the shortest matching word. For example, this pattern:
5927              *    1  2 3 4    5
5928              *    ab|a|x|abcd|abc
5929              * when matched against the string "abcde", will generate
5930              * accept states for all words except 3, with the longest
5931              * matching word being 4, and the shortest being 2 (with
5932              * the position being after char 1 of the string).
5933              *
5934              * Then for each matching word, in word order (i.e. 1,2,4,5),
5935              * we run the remainder of the pattern; on each try setting
5936              * the current position to the character following the word,
5937              * returning to try the next word on failure.
5938              *
5939              * We avoid having to build a list of words at runtime by
5940              * using a compile-time structure, wordinfo[].prev, which
5941              * gives, for each word, the previous accepting word (if any).
5942              * In the case above it would contain the mappings 1->2, 2->0,
5943              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5944              * the longest word (4 above), a list of all words, by
5945              * following the list of prev pointers; this gives us the
5946              * unordered list 4,5,1,2. Then given the current word we have
5947              * just tried, we can go through the list and find the
5948              * next-biggest word to try (so if we just failed on word 2,
5949              * the next in the list is 4).
5950              *
5951              * Since at runtime we don't record the matching position in
5952              * the string for each word, we have to work that out for
5953              * each word we're about to process. The wordinfo table holds
5954              * the character length of each word; given that we recorded
5955              * at the start: the position of the shortest word and its
5956              * length in chars, we just need to move the pointer the
5957              * difference between the two char lengths. Depending on
5958              * Unicode status and folding, that's cheap or expensive.
5959              *
5960              * This algorithm is optimised for the case where are only a
5961              * small number of accept states, i.e. 0,1, or maybe 2.
5962              * With lots of accepts states, and having to try all of them,
5963              * it becomes quadratic on number of accept states to find all
5964              * the next words.
5965              */
5966
5967             {
5968                 /* what type of TRIE am I? (utf8 makes this contextual) */
5969                 DECL_TRIE_TYPE(scan);
5970
5971                 /* what trie are we using right now */
5972                 reg_trie_data * const trie
5973                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5974                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5975                 U32 state = trie->startstate;
5976
5977                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5978                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5979                     if (utf8_target
5980                         && ! NEXTCHR_IS_EOS
5981                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5982                         && scan->flags == EXACTL)
5983                     {
5984                         /* We only output for EXACTL, as we let the folder
5985                          * output this message for EXACTFLU8 to avoid
5986                          * duplication */
5987                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5988                                                                reginfo->strend);
5989                     }
5990                 }
5991                 if (   trie->bitmap
5992                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5993                 {
5994                     if (trie->states[ state ].wordnum) {
5995                          DEBUG_EXECUTE_r(
5996                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5997                                           depth, PL_colors[4], PL_colors[5])
5998                         );
5999                         if (!trie->jump)
6000                             break;
6001                     } else {
6002                         DEBUG_EXECUTE_r(
6003                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6004                                           depth, PL_colors[4], PL_colors[5])
6005                         );
6006                         sayNO_SILENT;
6007                    }
6008                 }
6009
6010             {
6011                 U8 *uc = ( U8* )locinput;
6012
6013                 STRLEN len = 0;
6014                 STRLEN foldlen = 0;
6015                 U8 *uscan = (U8*)NULL;
6016                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6017                 U32 charcount = 0; /* how many input chars we have matched */
6018                 U32 accepted = 0; /* have we seen any accepting states? */
6019
6020                 ST.jump = trie->jump;
6021                 ST.me = scan;
6022                 ST.firstpos = NULL;
6023                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6024                 ST.nextword = 0;
6025
6026                 /* fully traverse the TRIE; note the position of the
6027                    shortest accept state and the wordnum of the longest
6028                    accept state */
6029
6030                 while ( state && uc <= (U8*)(reginfo->strend) ) {
6031                     U32 base = trie->states[ state ].trans.base;
6032                     UV uvc = 0;
6033                     U16 charid = 0;
6034                     U16 wordnum;
6035                     wordnum = trie->states[ state ].wordnum;
6036
6037                     if (wordnum) { /* it's an accept state */
6038                         if (!accepted) {
6039                             accepted = 1;
6040                             /* record first match position */
6041                             if (ST.longfold) {
6042                                 ST.firstpos = (U8*)locinput;
6043                                 ST.firstchars = 0;
6044                             }
6045                             else {
6046                                 ST.firstpos = uc;
6047                                 ST.firstchars = charcount;
6048                             }
6049                         }
6050                         if (!ST.nextword || wordnum < ST.nextword)
6051                             ST.nextword = wordnum;
6052                         ST.topword = wordnum;
6053                     }
6054
6055                     DEBUG_TRIE_EXECUTE_r({
6056                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6057                                 /* HERE */
6058                                 PerlIO_printf( Perl_debug_log,
6059                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6060                                     INDENT_CHARS(depth), "", PL_colors[4],
6061                                     (UV)state, (accepted ? 'Y' : 'N'));
6062                     });
6063
6064                     /* read a char and goto next state */
6065                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6066                         I32 offset;
6067                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6068                                              (U8 *) reginfo->strend, uscan,
6069                                              len, uvc, charid, foldlen,
6070                                              foldbuf, uniflags);
6071                         charcount++;
6072                         if (foldlen>0)
6073                             ST.longfold = TRUE;
6074                         if (charid &&
6075                              ( ((offset =
6076                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6077
6078                              && ((U32)offset < trie->lasttrans)
6079                              && trie->trans[offset].check == state)
6080                         {
6081                             state = trie->trans[offset].next;
6082                         }
6083                         else {
6084                             state = 0;
6085                         }
6086                         uc += len;
6087
6088                     }
6089                     else {
6090                         state = 0;
6091                     }
6092                     DEBUG_TRIE_EXECUTE_r(
6093                         Perl_re_printf( aTHX_
6094                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6095                             charid, uvc, (UV)state, PL_colors[5] );
6096                     );
6097                 }
6098                 if (!accepted)
6099                    sayNO;
6100
6101                 /* calculate total number of accept states */
6102                 {
6103                     U16 w = ST.topword;
6104                     accepted = 0;
6105                     while (w) {
6106                         w = trie->wordinfo[w].prev;
6107                         accepted++;
6108                     }
6109                     ST.accepted = accepted;
6110                 }
6111
6112                 DEBUG_EXECUTE_r(
6113                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6114                         depth,
6115                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6116                 );
6117                 goto trie_first_try; /* jump into the fail handler */
6118             }}
6119             NOT_REACHED; /* NOTREACHED */
6120
6121         case TRIE_next_fail: /* we failed - try next alternative */
6122         {
6123             U8 *uc;
6124             if ( ST.jump ) {
6125                 /* undo any captures done in the tail part of a branch,
6126                  * e.g.
6127                  *    /(?:X(.)(.)|Y(.)).../
6128                  * where the trie just matches X then calls out to do the
6129                  * rest of the branch */
6130                 REGCP_UNWIND(ST.cp);
6131                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6132             }
6133             if (!--ST.accepted) {
6134                 DEBUG_EXECUTE_r({
6135                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6136                         depth,
6137                         PL_colors[4],
6138                         PL_colors[5] );
6139                 });
6140                 sayNO_SILENT;
6141             }
6142             {
6143                 /* Find next-highest word to process.  Note that this code
6144                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6145                 U16 min = 0;
6146                 U16 word;
6147                 U16 const nextword = ST.nextword;
6148                 reg_trie_wordinfo * const wordinfo
6149                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6150                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6151                     if (word > nextword && (!min || word < min))
6152                         min = word;
6153                 }
6154                 ST.nextword = min;
6155             }
6156
6157           trie_first_try:
6158             if (do_cutgroup) {
6159                 do_cutgroup = 0;
6160                 no_final = 0;
6161             }
6162
6163             if ( ST.jump ) {
6164                 ST.lastparen = rex->lastparen;
6165                 ST.lastcloseparen = rex->lastcloseparen;
6166                 REGCP_SET(ST.cp);
6167             }
6168
6169             /* find start char of end of current word */
6170             {
6171                 U32 chars; /* how many chars to skip */
6172                 reg_trie_data * const trie
6173                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6174
6175                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6176                             >=  ST.firstchars);
6177                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6178                             - ST.firstchars;
6179                 uc = ST.firstpos;
6180
6181                 if (ST.longfold) {
6182                     /* the hard option - fold each char in turn and find
6183                      * its folded length (which may be different */
6184                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6185                     STRLEN foldlen;
6186                     STRLEN len;
6187                     UV uvc;
6188                     U8 *uscan;
6189
6190                     while (chars) {
6191                         if (utf8_target) {
6192                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6193                                                     uniflags);
6194                             uc += len;
6195                         }
6196                         else {
6197                             uvc = *uc;
6198                             uc++;
6199                         }
6200                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6201                         uscan = foldbuf;
6202                         while (foldlen) {
6203                             if (!--chars)
6204                                 break;
6205                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6206                                                  uniflags);
6207                             uscan += len;
6208                             foldlen -= len;
6209                         }
6210                     }
6211                 }
6212                 else {
6213                     if (utf8_target)
6214                         while (chars--)
6215                             uc += UTF8SKIP(uc);
6216                     else
6217                         uc += chars;
6218                 }
6219             }
6220
6221             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6222                             ? ST.jump[ST.nextword]
6223                             : NEXT_OFF(ST.me));
6224
6225             DEBUG_EXECUTE_r({
6226                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6227                     depth,
6228                     PL_colors[4],
6229                     ST.nextword,
6230                     PL_colors[5]
6231                     );
6232             });
6233
6234             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6235                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6236                 NOT_REACHED; /* NOTREACHED */
6237             }
6238             /* only one choice left - just continue */
6239             DEBUG_EXECUTE_r({
6240                 AV *const trie_words
6241                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6242                 SV ** const tmp = trie_words
6243                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6244                 SV *sv= tmp ? sv_newmortal() : NULL;
6245
6246                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6247                     depth, PL_colors[4],
6248                     ST.nextword,
6249                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6250                             PL_colors[0], PL_colors[1],
6251                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6252                         )
6253                     : "not compiled under -Dr",
6254                     PL_colors[5] );
6255             });
6256
6257             locinput = (char*)uc;
6258             continue; /* execute rest of RE */
6259             /* NOTREACHED */
6260         }
6261 #undef  ST
6262
6263         case EXACTL:             /*  /abc/l       */
6264             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6265
6266             /* Complete checking would involve going through every character
6267              * matched by the string to see if any is above latin1.  But the
6268              * comparision otherwise might very well be a fast assembly
6269              * language routine, and I (khw) don't think slowing things down
6270              * just to check for this warning is worth it.  So this just checks
6271              * the first character */
6272             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6273                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6274             }
6275             /* FALLTHROUGH */
6276         case EXACT: {            /*  /abc/        */
6277             char *s = STRING(scan);
6278             ln = STR_LEN(scan);
6279             if (utf8_target != is_utf8_pat) {
6280                 /* The target and the pattern have differing utf8ness. */
6281                 char *l = locinput;
6282                 const char * const e = s + ln;
6283
6284                 if (utf8_target) {
6285                     /* The target is utf8, the pattern is not utf8.
6286                      * Above-Latin1 code points can't match the pattern;
6287                      * invariants match exactly, and the other Latin1 ones need
6288                      * to be downgraded to a single byte in order to do the
6289                      * comparison.  (If we could be confident that the target
6290                      * is not malformed, this could be refactored to have fewer
6291                      * tests by just assuming that if the first bytes match, it
6292                      * is an invariant, but there are tests in the test suite
6293                      * dealing with (??{...}) which violate this) */
6294                     while (s < e) {
6295                         if (l >= reginfo->strend
6296                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6297                         {
6298                             sayNO;
6299                         }
6300                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6301                             if (*l != *s) {
6302                                 sayNO;
6303                             }
6304                             l++;
6305                         }
6306                         else {
6307                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6308                             {
6309                                 sayNO;
6310                             }
6311                             l += 2;
6312                         }
6313                         s++;
6314                     }
6315                 }
6316                 else {
6317                     /* The target is not utf8, the pattern is utf8. */
6318                     while (s < e) {
6319                         if (l >= reginfo->strend
6320                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6321                         {
6322                             sayNO;
6323                         }
6324                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6325                             if (*s != *l) {
6326                                 sayNO;
6327                             }
6328                             s++;
6329                         }
6330                         else {
6331                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6332                             {
6333                                 sayNO;
6334                             }
6335                             s += 2;
6336                         }
6337                         l++;
6338                     }
6339                 }
6340                 locinput = l;
6341             }
6342             else {
6343                 /* The target and the pattern have the same utf8ness. */
6344                 /* Inline the first character, for speed. */
6345                 if (reginfo->strend - locinput < ln
6346                     || UCHARAT(s) != nextchr
6347                     || (ln > 1 && memNE(s, locinput, ln)))
6348                 {
6349                     sayNO;
6350                 }
6351                 locinput += ln;
6352             }
6353             break;
6354             }
6355
6356         case EXACTFL: {          /*  /abc/il      */
6357             re_fold_t folder;
6358             const U8 * fold_array;
6359             const char * s;
6360             U32 fold_utf8_flags;
6361
6362             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6363             folder = foldEQ_locale;
6364             fold_array = PL_fold_locale;
6365             fold_utf8_flags = FOLDEQ_LOCALE;
6366             goto do_exactf;
6367
6368         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6369                                       is effectively /u; hence to match, target
6370                                       must be UTF-8. */
6371             if (! utf8_target) {
6372                 sayNO;
6373             }
6374             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6375                                              | FOLDEQ_S1_FOLDS_SANE;
6376             folder = foldEQ_latin1;
6377             fold_array = PL_fold_latin1;
6378             goto do_exactf;
6379
6380         case EXACTFU_SS:         /*  /\x{df}/iu   */
6381         case EXACTFU:            /*  /abc/iu      */
6382             folder = foldEQ_latin1;
6383             fold_array = PL_fold_latin1;
6384             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6385             goto do_exactf;
6386
6387         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6388                                    patterns */
6389             assert(! is_utf8_pat);
6390             /* FALLTHROUGH */
6391         case EXACTFAA:            /*  /abc/iaa     */
6392             folder = foldEQ_latin1;
6393             fold_array = PL_fold_latin1;
6394             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6395             goto do_exactf;
6396
6397         case EXACTF:             /*  /abc/i    This node only generated for
6398                                                non-utf8 patterns */
6399             assert(! is_utf8_pat);
6400             folder = foldEQ;
6401             fold_array = PL_fold;
6402             fold_utf8_flags = 0;
6403
6404           do_exactf:
6405             s = STRING(scan);
6406             ln = STR_LEN(scan);
6407
6408             if (utf8_target
6409                 || is_utf8_pat
6410                 || state_num == EXACTFU_SS
6411                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6412             {
6413               /* Either target or the pattern are utf8, or has the issue where
6414                * the fold lengths may differ. */
6415                 const char * const l = locinput;
6416                 char *e = reginfo->strend;
6417
6418                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6419                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6420                 {
6421                     sayNO;
6422                 }
6423                 locinput = e;
6424                 break;
6425             }
6426
6427             /* Neither the target nor the pattern are utf8 */
6428             if (UCHARAT(s) != nextchr
6429                 && !NEXTCHR_IS_EOS
6430                 && UCHARAT(s) != fold_array[nextchr])
6431             {
6432                 sayNO;
6433             }
6434             if (reginfo->strend - locinput < ln)
6435                 sayNO;
6436             if (ln > 1 && ! folder(s, locinput, ln))
6437                 sayNO;
6438             locinput += ln;
6439             break;
6440         }
6441
6442         case NBOUNDL: /*  /\B/l  */
6443             to_complement = 1;
6444             /* FALLTHROUGH */
6445
6446         case BOUNDL:  /*  /\b/l  */
6447         {
6448             bool b1, b2;
6449             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6450
6451             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6452                 if (! IN_UTF8_CTYPE_LOCALE) {
6453                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6454                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6455                 }
6456                 goto boundu;
6457             }
6458
6459             if (utf8_target) {
6460                 if (locinput == reginfo->strbeg)
6461                     b1 = isWORDCHAR_LC('\n');
6462                 else {
6463                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6464                                                         (U8*)(reginfo->strbeg)),
6465                                                  (U8*)(reginfo->strend));
6466                 }
6467                 b2 = (NEXTCHR_IS_EOS)
6468                     ? isWORDCHAR_LC('\n')
6469                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6470                                               (U8*) reginfo->strend);
6471             }
6472             else { /* Here the string isn't utf8 */
6473                 b1 = (locinput == reginfo->strbeg)
6474                      ? isWORDCHAR_LC('\n')
6475                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6476                 b2 = (NEXTCHR_IS_EOS)
6477                     ? isWORDCHAR_LC('\n')
6478                     : isWORDCHAR_LC(nextchr);
6479             }
6480             if (to_complement ^ (b1 == b2)) {
6481                 sayNO;
6482             }
6483             break;
6484         }
6485
6486         case NBOUND:  /*  /\B/   */
6487             to_complement = 1;
6488             /* FALLTHROUGH */
6489
6490         case BOUND:   /*  /\b/   */
6491             if (utf8_target) {
6492                 goto bound_utf8;
6493             }
6494             goto bound_ascii_match_only;
6495
6496         case NBOUNDA: /*  /\B/a  */
6497             to_complement = 1;
6498             /* FALLTHROUGH */
6499
6500         case BOUNDA:  /*  /\b/a  */
6501         {
6502             bool b1, b2;
6503
6504           bound_ascii_match_only:
6505             /* Here the string isn't utf8, or is utf8 and only ascii characters
6506              * are to match \w.  In the latter case looking at the byte just
6507              * prior to the current one may be just the final byte of a
6508              * multi-byte character.  This is ok.  There are two cases:
6509              * 1) it is a single byte character, and then the test is doing
6510              *    just what it's supposed to.
6511              * 2) it is a multi-byte character, in which case the final byte is
6512              *    never mistakable for ASCII, and so the test will say it is
6513              *    not a word character, which is the correct answer. */
6514             b1 = (locinput == reginfo->strbeg)
6515                  ? isWORDCHAR_A('\n')
6516                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6517             b2 = (NEXTCHR_IS_EOS)
6518                 ? isWORDCHAR_A('\n')
6519                 : isWORDCHAR_A(nextchr);
6520             if (to_complement ^ (b1 == b2)) {
6521                 sayNO;
6522             }
6523             break;
6524         }
6525
6526         case NBOUNDU: /*  /\B/u  */
6527             to_complement = 1;
6528             /* FALLTHROUGH */
6529
6530         case BOUNDU:  /*  /\b/u  */
6531
6532           boundu:
6533             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6534                 match = FALSE;
6535             }
6536             else if (utf8_target) {
6537               bound_utf8:
6538                 switch((bound_type) FLAGS(scan)) {
6539                     case TRADITIONAL_BOUND:
6540                     {
6541                         bool b1, b2;
6542                         b1 = (locinput == reginfo->strbeg)
6543                              ? 0 /* isWORDCHAR_L1('\n') */
6544                              : isWORDCHAR_utf8_safe(
6545                                                reghop3((U8*)locinput,
6546                                                        -1,
6547                                                        (U8*)(reginfo->strbeg)),
6548                                                     (U8*) reginfo->strend);
6549                         b2 = (NEXTCHR_IS_EOS)
6550                             ? 0 /* isWORDCHAR_L1('\n') */
6551                             : isWORDCHAR_utf8_safe((U8*)locinput,
6552                                                    (U8*) reginfo->strend);
6553                         match = cBOOL(b1 != b2);
6554                         break;
6555                     }
6556                     case GCB_BOUND:
6557                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6558                             match = TRUE; /* GCB always matches at begin and
6559                                              end */
6560                         }
6561                         else {
6562                             /* Find the gcb values of previous and current
6563                              * chars, then see if is a break point */
6564                             match = isGCB(getGCB_VAL_UTF8(
6565                                                 reghop3((U8*)locinput,
6566                                                         -1,
6567                                                         (U8*)(reginfo->strbeg)),
6568                                                 (U8*) reginfo->strend),
6569                                           getGCB_VAL_UTF8((U8*) locinput,
6570                                                         (U8*) reginfo->strend),
6571                                           (U8*) reginfo->strbeg,
6572                                           (U8*) locinput,
6573                                           utf8_target);
6574                         }
6575                         break;
6576
6577                     case LB_BOUND:
6578                         if (locinput == reginfo->strbeg) {
6579                             match = FALSE;
6580                         }
6581                         else if (NEXTCHR_IS_EOS) {
6582                             match = TRUE;
6583                         }
6584                         else {
6585                             match = isLB(getLB_VAL_UTF8(
6586                                                 reghop3((U8*)locinput,
6587                                                         -1,
6588                                                         (U8*)(reginfo->strbeg)),
6589                                                 (U8*) reginfo->strend),
6590                                           getLB_VAL_UTF8((U8*) locinput,
6591                                                         (U8*) reginfo->strend),
6592                                           (U8*) reginfo->strbeg,
6593                                           (U8*) locinput,
6594                                           (U8*) reginfo->strend,
6595                                           utf8_target);
6596                         }
6597                         break;
6598
6599                     case SB_BOUND: /* Always matches at begin and end */
6600                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6601                             match = TRUE;
6602                         }
6603                         else {
6604                             match = isSB(getSB_VAL_UTF8(
6605                                                 reghop3((U8*)locinput,
6606                                                         -1,
6607                                                         (U8*)(reginfo->strbeg)),
6608                                                 (U8*) reginfo->strend),
6609                                           getSB_VAL_UTF8((U8*) locinput,
6610                                                         (U8*) reginfo->strend),
6611                                           (U8*) reginfo->strbeg,
6612                                           (U8*) locinput,
6613                                           (U8*) reginfo->strend,
6614                                           utf8_target);
6615                         }
6616                         break;
6617
6618                     case WB_BOUND:
6619                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6620                             match = TRUE;
6621                         }
6622                         else {
6623                             match = isWB(WB_UNKNOWN,
6624                                          getWB_VAL_UTF8(
6625                                                 reghop3((U8*)locinput,
6626                                                         -1,
6627                                                         (U8*)(reginfo->strbeg)),
6628                                                 (U8*) reginfo->strend),
6629                                           getWB_VAL_UTF8((U8*) locinput,
6630                                                         (U8*) reginfo->strend),
6631                                           (U8*) reginfo->strbeg,
6632                                           (U8*) locinput,
6633                                           (U8*) reginfo->strend,
6634                                           utf8_target);
6635                         }
6636                         break;
6637                 }
6638             }
6639             else {  /* Not utf8 target */
6640                 switch((bound_type) FLAGS(scan)) {
6641                     case TRADITIONAL_BOUND:
6642                     {
6643                         bool b1, b2;
6644                         b1 = (locinput == reginfo->strbeg)
6645                             ? 0 /* isWORDCHAR_L1('\n') */
6646                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6647                         b2 = (NEXTCHR_IS_EOS)
6648                             ? 0 /* isWORDCHAR_L1('\n') */
6649                             : isWORDCHAR_L1(nextchr);
6650                         match = cBOOL(b1 != b2);
6651                         break;
6652                     }
6653
6654                     case GCB_BOUND:
6655                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6656                             match = TRUE; /* GCB always matches at begin and
6657                                              end */
6658                         }
6659                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6660                                    range */
6661                             match =    UCHARAT(locinput - 1) != '\r'
6662                                     || UCHARAT(locinput) != '\n';
6663                         }
6664                         break;
6665
6666                     case LB_BOUND:
6667                         if (locinput == reginfo->strbeg) {
6668                             match = FALSE;
6669                         }
6670                         else if (NEXTCHR_IS_EOS) {
6671                             match = TRUE;
6672                         }
6673                         else {
6674                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6675                                          getLB_VAL_CP(UCHARAT(locinput)),
6676                                          (U8*) reginfo->strbeg,
6677                                          (U8*) locinput,
6678                                          (U8*) reginfo->strend,
6679                                          utf8_target);
6680                         }
6681                         break;
6682
6683                     case SB_BOUND: /* Always matches at begin and end */
6684                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6685                             match = TRUE;
6686                         }
6687                         else {
6688                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6689                                          getSB_VAL_CP(UCHARAT(locinput)),
6690                                          (U8*) reginfo->strbeg,
6691                                          (U8*) locinput,
6692                                          (U8*) reginfo->strend,
6693                                          utf8_target);
6694                         }
6695                         break;
6696
6697                     case WB_BOUND:
6698                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6699                             match = TRUE;
6700                         }
6701                         else {
6702                             match = isWB(WB_UNKNOWN,
6703                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6704                                          getWB_VAL_CP(UCHARAT(locinput)),
6705                                          (U8*) reginfo->strbeg,
6706                                          (U8*) locinput,
6707                                          (U8*) reginfo->strend,
6708                                          utf8_target);
6709                         }
6710                         break;
6711                 }
6712             }
6713
6714             if (to_complement ^ ! match) {
6715                 sayNO;
6716             }
6717             break;
6718
6719         case ANYOFPOSIXL:
6720         case ANYOFL:  /*  /[abc]/l      */
6721             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6722
6723             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6724             {
6725               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6726             }
6727             /* FALLTHROUGH */
6728         case ANYOFD:  /*   /[abc]/d       */
6729         case ANYOF:  /*   /[abc]/       */
6730             if (NEXTCHR_IS_EOS)
6731                 sayNO;
6732             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6733                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6734                                                                    utf8_target))
6735                     sayNO;
6736                 locinput += UTF8SKIP(locinput);
6737             }
6738             else {
6739                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6740                     sayNO;
6741                 locinput++;
6742             }
6743             break;
6744
6745         case ANYOFM:
6746             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6747                 sayNO;
6748             }
6749             locinput++;
6750             break;
6751
6752         case NANYOFM:
6753             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)) {
6754                 sayNO;
6755             }
6756             goto increment_locinput;
6757             break;
6758
6759         case ASCII:
6760             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6761                 sayNO;
6762             }
6763
6764             locinput++;     /* ASCII is always single byte */
6765             break;
6766
6767         case NASCII:
6768             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6769                 sayNO;
6770             }
6771
6772             goto increment_locinput;
6773             break;
6774
6775         /* The argument (FLAGS) to all the POSIX node types is the class number
6776          * */
6777
6778         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6779             to_complement = 1;
6780             /* FALLTHROUGH */
6781
6782         case POSIXL:    /* \w or [:punct:] etc. under /l */
6783             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6784             if (NEXTCHR_IS_EOS)
6785                 sayNO;
6786
6787             /* Use isFOO_lc() for characters within Latin1.  (Note that
6788              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6789              * wouldn't be invariant) */
6790             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6791                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6792                     sayNO;
6793                 }
6794
6795                 locinput++;
6796                 break;
6797             }
6798
6799             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6800                 /* An above Latin-1 code point, or malformed */
6801                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6802                                                        reginfo->strend);
6803                 goto utf8_posix_above_latin1;
6804             }
6805
6806             /* Here is a UTF-8 variant code point below 256 and the target is
6807              * UTF-8 */
6808             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6809                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6810                                             *(locinput + 1))))))
6811             {
6812                 sayNO;
6813             }
6814
6815             goto increment_locinput;
6816
6817         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6818             to_complement = 1;
6819             /* FALLTHROUGH */
6820
6821         case POSIXD:    /* \w or [:punct:] etc. under /d */
6822             if (utf8_target) {
6823                 goto utf8_posix;
6824             }
6825             goto posixa;
6826
6827         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6828
6829             if (NEXTCHR_IS_EOS) {
6830                 sayNO;
6831             }
6832
6833             /* All UTF-8 variants match */
6834             if (! UTF8_IS_INVARIANT(nextchr)) {
6835                 goto increment_locinput;
6836             }
6837
6838             to_complement = 1;
6839             goto join_nposixa;
6840
6841         case POSIXA:    /* \w or [:punct:] etc. under /a */
6842
6843           posixa:
6844             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6845              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6846              * character is a single byte */
6847
6848             if (NEXTCHR_IS_EOS) {
6849                 sayNO;
6850             }
6851
6852           join_nposixa:
6853
6854             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6855                                                                 FLAGS(scan)))))
6856             {
6857                 sayNO;
6858             }
6859
6860             /* Here we are either not in utf8, or we matched a utf8-invariant,
6861              * so the next char is the next byte */
6862             locinput++;
6863             break;
6864
6865         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6866             to_complement = 1;
6867             /* FALLTHROUGH */
6868
6869         case POSIXU:    /* \w or [:punct:] etc. under /u */
6870           utf8_posix:
6871             if (NEXTCHR_IS_EOS) {
6872                 sayNO;
6873             }
6874
6875             /* Use _generic_isCC() for characters within Latin1.  (Note that
6876              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6877              * wouldn't be invariant) */
6878             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6879                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6880                                                            FLAGS(scan)))))
6881                 {
6882                     sayNO;
6883                 }
6884                 locinput++;
6885             }
6886             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6887                 if (! (to_complement
6888                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6889                                                                *(locinput + 1)),
6890                                              FLAGS(scan)))))
6891                 {
6892                     sayNO;
6893                 }
6894                 locinput += 2;
6895             }
6896             else {  /* Handle above Latin-1 code points */
6897               utf8_posix_above_latin1:
6898                 classnum = (_char_class_number) FLAGS(scan);
6899                 switch (classnum) {
6900                     default:
6901                         if (! (to_complement
6902                            ^ cBOOL(_invlist_contains_cp(
6903                                       PL_XPosix_ptrs[classnum],
6904                                       utf8_to_uvchr_buf((U8 *) locinput,
6905                                                         (U8 *) reginfo->strend,
6906                                                         NULL)))))
6907                         {
6908                             sayNO;
6909                         }
6910                         break;
6911                     case _CC_ENUM_SPACE:
6912                         if (! (to_complement
6913                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6914                         {
6915                             sayNO;
6916                         }
6917                         break;
6918                     case _CC_ENUM_BLANK:
6919                         if (! (to_complement
6920                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6921                         {
6922                             sayNO;
6923                         }
6924                         break;
6925                     case _CC_ENUM_XDIGIT:
6926                         if (! (to_complement
6927                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6928                         {
6929                             sayNO;
6930                         }
6931                         break;
6932                     case _CC_ENUM_VERTSPACE:
6933                         if (! (to_complement
6934                                         ^ cBOOL(is_VERTWS_high(locinput))))
6935                         {
6936                             sayNO;
6937                         }
6938                         break;
6939                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6940                     case _CC_ENUM_ASCII:
6941                         if (! to_complement) {
6942                             sayNO;
6943                         }
6944                         break;
6945                 }
6946                 locinput += UTF8SKIP(locinput);
6947             }
6948             break;
6949
6950         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6951                        a Unicode extended Grapheme Cluster */
6952             if (NEXTCHR_IS_EOS)
6953                 sayNO;
6954             if  (! utf8_target) {
6955
6956                 /* Match either CR LF  or '.', as all the other possibilities
6957                  * require utf8 */
6958                 locinput++;         /* Match the . or CR */
6959                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6960                                        match the LF */
6961                     && locinput < reginfo->strend
6962                     && UCHARAT(locinput) == '\n')
6963                 {
6964                     locinput++;
6965                 }
6966             }
6967             else {
6968
6969                 /* Get the gcb type for the current character */
6970                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6971                                                        (U8*) reginfo->strend);
6972
6973                 /* Then scan through the input until we get to the first
6974                  * character whose type is supposed to be a gcb with the
6975                  * current character.  (There is always a break at the
6976                  * end-of-input) */
6977                 locinput += UTF8SKIP(locinput);
6978                 while (locinput < reginfo->strend) {
6979                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6980                                                          (U8*) reginfo->strend);
6981                     if (isGCB(prev_gcb, cur_gcb,
6982                               (U8*) reginfo->strbeg, (U8*) locinput,
6983                               utf8_target))
6984                     {
6985                         break;
6986                     }
6987
6988                     prev_gcb = cur_gcb;
6989                     locinput += UTF8SKIP(locinput);
6990                 }
6991
6992
6993             }
6994             break;
6995
6996         case NREFFL:  /*  /\g{name}/il  */
6997         {   /* The capture buffer cases.  The ones beginning with N for the
6998                named buffers just convert to the equivalent numbered and
6999                pretend they were called as the corresponding numbered buffer
7000                op.  */
7001             /* don't initialize these in the declaration, it makes C++
7002                unhappy */
7003             const char *s;
7004             char type;
7005             re_fold_t folder;
7006             const U8 *fold_array;
7007             UV utf8_fold_flags;
7008
7009             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7010             folder = foldEQ_locale;
7011             fold_array = PL_fold_locale;
7012             type = REFFL;
7013             utf8_fold_flags = FOLDEQ_LOCALE;
7014             goto do_nref;
7015
7016         case NREFFA:  /*  /\g{name}/iaa  */
7017             folder = foldEQ_latin1;
7018             fold_array = PL_fold_latin1;
7019             type = REFFA;
7020             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7021             goto do_nref;
7022
7023         case NREFFU:  /*  /\g{name}/iu  */
7024             folder = foldEQ_latin1;
7025             fold_array = PL_fold_latin1;
7026             type = REFFU;
7027             utf8_fold_flags = 0;
7028             goto do_nref;
7029
7030         case NREFF:  /*  /\g{name}/i  */
7031             folder = foldEQ;
7032             fold_array = PL_fold;
7033             type = REFF;
7034             utf8_fold_flags = 0;
7035             goto do_nref;
7036
7037         case NREF:  /*  /\g{name}/   */
7038             type = REF;
7039             folder = NULL;
7040             fold_array = NULL;
7041             utf8_fold_flags = 0;
7042           do_nref:
7043
7044             /* For the named back references, find the corresponding buffer
7045              * number */
7046             n = reg_check_named_buff_matched(rex,scan);
7047
7048             if ( ! n ) {
7049                 sayNO;
7050             }
7051             goto do_nref_ref_common;
7052
7053         case REFFL:  /*  /\1/il  */
7054             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7055             folder = foldEQ_locale;
7056             fold_array = PL_fold_locale;
7057             utf8_fold_flags = FOLDEQ_LOCALE;
7058             goto do_ref;
7059
7060         case REFFA:  /*  /\1/iaa  */
7061             folder = foldEQ_latin1;
7062             fold_array = PL_fold_latin1;
7063             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7064             goto do_ref;
7065
7066         case REFFU:  /*  /\1/iu  */
7067             folder = foldEQ_latin1;
7068             fold_array = PL_fold_latin1;
7069             utf8_fold_flags = 0;
7070             goto do_ref;
7071
7072         case REFF:  /*  /\1/i  */
7073             folder = foldEQ;
7074             fold_array = PL_fold;
7075             utf8_fold_flags = 0;
7076             goto do_ref;
7077
7078         case REF:  /*  /\1/    */
7079             folder = NULL;
7080             fold_array = NULL;
7081             utf8_fold_flags = 0;
7082
7083           do_ref:
7084             type = OP(scan);
7085             n = ARG(scan);  /* which paren pair */
7086
7087           do_nref_ref_common:
7088             ln = rex->offs[n].start;
7089             endref = rex->offs[n].end;
7090             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7091             if (rex->lastparen < n || ln == -1 || endref == -1)
7092                 sayNO;                  /* Do not match unless seen CLOSEn. */
7093             if (ln == endref)
7094                 break;
7095
7096             s = reginfo->strbeg + ln;
7097             if (type != REF     /* REF can do byte comparison */
7098                 && (utf8_target || type == REFFU || type == REFFL))
7099             {
7100                 char * limit = reginfo->strend;
7101
7102                 /* This call case insensitively compares the entire buffer
7103                     * at s, with the current input starting at locinput, but
7104                     * not going off the end given by reginfo->strend, and
7105                     * returns in <limit> upon success, how much of the
7106                     * current input was matched */
7107                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7108                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7109                 {
7110                     sayNO;
7111                 }
7112                 locinput = limit;
7113                 break;
7114             }
7115
7116             /* Not utf8:  Inline the first character, for speed. */
7117             if (!NEXTCHR_IS_EOS &&
7118                 UCHARAT(s) != nextchr &&
7119                 (type == REF ||
7120                  UCHARAT(s) != fold_array[nextchr]))
7121                 sayNO;
7122             ln = endref - ln;
7123             if (locinput + ln > reginfo->strend)
7124                 sayNO;
7125             if (ln > 1 && (type == REF
7126                            ? memNE(s, locinput, ln)
7127                            : ! folder(s, locinput, ln)))
7128                 sayNO;
7129             locinput += ln;
7130             break;
7131         }
7132
7133         case NOTHING: /* null op; e.g. the 'nothing' following
7134                        * the '*' in m{(a+|b)*}' */
7135             break;
7136         case TAIL: /* placeholder while compiling (A|B|C) */
7137             break;
7138
7139 #undef  ST
7140 #define ST st->u.eval
7141 #define CUR_EVAL cur_eval->u.eval
7142
7143         {
7144             SV *ret;
7145             REGEXP *re_sv;
7146             regexp *re;
7147             regexp_internal *rei;
7148             regnode *startpoint;
7149             U32 arg;
7150
7151         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7152             arg= (U32)ARG(scan);
7153             if (cur_eval && cur_eval->locinput == locinput) {
7154                 if ( ++nochange_depth > max_nochange_depth )
7155                     Perl_croak(aTHX_
7156                         "Pattern subroutine nesting without pos change"
7157                         " exceeded limit in regex");
7158             } else {
7159                 nochange_depth = 0;
7160             }
7161             re_sv = rex_sv;
7162             re = rex;
7163             rei = rexi;
7164             startpoint = scan + ARG2L(scan);
7165             EVAL_CLOSE_PAREN_SET( st, arg );
7166             /* Detect infinite recursion
7167              *
7168              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7169              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7170              * So we track the position in the string we are at each time
7171              * we recurse and if we try to enter the same routine twice from
7172              * the same position we throw an error.
7173              */
7174             if ( rex->recurse_locinput[arg] == locinput ) {
7175                 /* FIXME: we should show the regop that is failing as part
7176                  * of the error message. */
7177                 Perl_croak(aTHX_ "Infinite recursion in regex");
7178             } else {
7179                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7180                 rex->recurse_locinput[arg]= locinput;
7181
7182                 DEBUG_r({
7183                     GET_RE_DEBUG_FLAGS_DECL;
7184                     DEBUG_STACK_r({
7185                         Perl_re_exec_indentf( aTHX_
7186                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7187                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7188                         );
7189                     });
7190                 });
7191             }
7192
7193             /* Save all the positions seen so far. */
7194             ST.cp = regcppush(rex, 0, maxopenparen);
7195             REGCP_SET(ST.lastcp);
7196
7197             /* and then jump to the code we share with EVAL */
7198             goto eval_recurse_doit;
7199             /* NOTREACHED */
7200
7201         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7202             if (cur_eval && cur_eval->locinput==locinput) {
7203                 if ( ++nochange_depth > max_nochange_depth )
7204                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7205             } else {
7206                 nochange_depth = 0;
7207             }
7208             {
7209                 /* execute the code in the {...} */
7210
7211                 dSP;
7212                 IV before;
7213                 OP * const oop = PL_op;
7214                 COP * const ocurcop = PL_curcop;
7215                 OP *nop;
7216                 CV *newcv;
7217
7218                 /* save *all* paren positions */
7219                 regcppush(rex, 0, maxopenparen);
7220                 REGCP_SET(ST.lastcp);
7221
7222                 if (!caller_cv)
7223                     caller_cv = find_runcv(NULL);
7224
7225                 n = ARG(scan);
7226
7227                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7228                     newcv = (ReANY(
7229                                     (REGEXP*)(rexi->data->data[n])
7230                             ))->qr_anoncv;
7231                     nop = (OP*)rexi->data->data[n+1];
7232                 }
7233                 else if (rexi->data->what[n] == 'l') { /* literal code */
7234                     newcv = caller_cv;
7235                     nop = (OP*)rexi->data->data[n];
7236                     assert(CvDEPTH(newcv));
7237                 }
7238                 else {
7239                     /* literal with own CV */
7240                     assert(rexi->data->what[n] == 'L');
7241                     newcv = rex->qr_anoncv;
7242                     nop = (OP*)rexi->data->data[n];
7243                 }
7244
7245                 /* Some notes about MULTICALL and the context and save stacks.
7246                  *
7247                  * In something like
7248                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7249                  * since codeblocks don't introduce a new scope (so that
7250                  * local() etc accumulate), at the end of a successful
7251                  * match there will be a SAVEt_CLEARSV on the savestack
7252                  * for each of $x, $y, $z. If the three code blocks above
7253                  * happen to have come from different CVs (e.g. via
7254                  * embedded qr//s), then we must ensure that during any
7255                  * savestack unwinding, PL_comppad always points to the
7256                  * right pad at each moment. We achieve this by
7257                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7258                  * there is a change of pad.
7259                  * In theory whenever we call a code block, we should
7260                  * push a CXt_SUB context, then pop it on return from
7261                  * that code block. This causes a bit of an issue in that
7262                  * normally popping a context also clears the savestack
7263                  * back to cx->blk_oldsaveix, but here we specifically
7264                  * don't want to clear the save stack on exit from the
7265                  * code block.
7266                  * Also for efficiency we don't want to keep pushing and
7267                  * popping the single SUB context as we backtrack etc.
7268                  * So instead, we push a single context the first time
7269                  * we need, it, then hang onto it until the end of this
7270                  * function. Whenever we encounter a new code block, we
7271                  * update the CV etc if that's changed. During the times
7272                  * in this function where we're not executing a code
7273                  * block, having the SUB context still there is a bit
7274                  * naughty - but we hope that no-one notices.
7275                  * When the SUB context is initially pushed, we fake up
7276                  * cx->blk_oldsaveix to be as if we'd pushed this context
7277                  * on first entry to S_regmatch rather than at some random
7278                  * point during the regexe execution. That way if we
7279                  * croak, popping the context stack will ensure that
7280                  * *everything* SAVEd by this function is undone and then
7281                  * the context popped, rather than e.g., popping the
7282                  * context (and restoring the original PL_comppad) then
7283                  * popping more of the savestack and restoring a bad
7284                  * PL_comppad.
7285                  */
7286
7287                 /* If this is the first EVAL, push a MULTICALL. On
7288                  * subsequent calls, if we're executing a different CV, or
7289                  * if PL_comppad has got messed up from backtracking
7290                  * through SAVECOMPPADs, then refresh the context.
7291                  */
7292                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7293                 {
7294                     U8 flags = (CXp_SUB_RE |
7295                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7296                     SAVECOMPPAD();
7297                     if (last_pushed_cv) {
7298                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7299                     }
7300                     else {
7301                         PUSH_MULTICALL_FLAGS(newcv, flags);
7302                     }
7303                     /* see notes above */
7304                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7305
7306                     last_pushed_cv = newcv;
7307                 }
7308                 else {
7309                     /* these assignments are just to silence compiler
7310                      * warnings */
7311                     multicall_cop = NULL;
7312                 }
7313                 last_pad = PL_comppad;
7314
7315                 /* the initial nextstate you would normally execute
7316                  * at the start of an eval (which would cause error
7317                  * messages to come from the eval), may be optimised
7318                  * away from the execution path in the regex code blocks;
7319                  * so manually set PL_curcop to it initially */
7320                 {
7321                     OP *o = cUNOPx(nop)->op_first;
7322                     assert(o->op_type == OP_NULL);
7323                     if (o->op_targ == OP_SCOPE) {
7324                         o = cUNOPo->op_first;
7325                     }
7326                     else {
7327                         assert(o->op_targ == OP_LEAVE);
7328                         o = cUNOPo->op_first;
7329                         assert(o->op_type == OP_ENTER);
7330                         o = OpSIBLING(o);
7331                     }
7332
7333                     if (o->op_type != OP_STUB) {
7334                         assert(    o->op_type == OP_NEXTSTATE
7335                                 || o->op_type == OP_DBSTATE
7336                                 || (o->op_type == OP_NULL
7337                                     &&  (  o->op_targ == OP_NEXTSTATE
7338                                         || o->op_targ == OP_DBSTATE
7339                                         )
7340                                     )
7341                         );
7342                         PL_curcop = (COP*)o;
7343                     }
7344                 }
7345                 nop = nop->op_next;
7346
7347                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7348                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7349
7350                 rex->offs[0].end = locinput - reginfo->strbeg;
7351                 if (reginfo->info_aux_eval->pos_magic)
7352                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7353                                   reginfo->sv, reginfo->strbeg,
7354                                   locinput - reginfo->strbeg);
7355
7356                 if (sv_yes_mark) {
7357                     SV *sv_mrk = get_sv("REGMARK", 1);
7358                     sv_setsv(sv_mrk, sv_yes_mark);
7359                 }
7360
7361                 /* we don't use MULTICALL here as we want to call the
7362                  * first op of the block of interest, rather than the
7363                  * first op of the sub. Also, we don't want to free
7364                  * the savestack frame */
7365                 before = (IV)(SP-PL_stack_base);
7366                 PL_op = nop;
7367                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7368                 SPAGAIN;
7369                 if ((IV)(SP-PL_stack_base) == before)
7370                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7371                 else {
7372                     ret = POPs;
7373                     PUTBACK;
7374                 }
7375
7376                 /* before restoring everything, evaluate the returned
7377                  * value, so that 'uninit' warnings don't use the wrong
7378                  * PL_op or pad. Also need to process any magic vars
7379                  * (e.g. $1) *before* parentheses are restored */
7380
7381                 PL_op = NULL;
7382
7383                 re_sv = NULL;
7384                 if (logical == 0)        /*   (?{})/   */
7385                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7386                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7387                     sw = cBOOL(SvTRUE_NN(ret));
7388                     logical = 0;
7389                 }
7390                 else {                   /*  /(??{})  */
7391                     /*  if its overloaded, let the regex compiler handle
7392                      *  it; otherwise extract regex, or stringify  */
7393                     if (SvGMAGICAL(ret))
7394                         ret = sv_mortalcopy(ret);
7395                     if (!SvAMAGIC(ret)) {
7396                         SV *sv = ret;
7397                         if (SvROK(sv))
7398                             sv = SvRV(sv);
7399                         if (SvTYPE(sv) == SVt_REGEXP)
7400                             re_sv = (REGEXP*) sv;
7401                         else if (SvSMAGICAL(ret)) {
7402                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7403                             if (mg)
7404                                 re_sv = (REGEXP *) mg->mg_obj;
7405                         }
7406
7407                         /* force any undef warnings here */
7408                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7409                             ret = sv_mortalcopy(ret);
7410                             (void) SvPV_force_nolen(ret);
7411                         }
7412                     }
7413
7414                 }
7415
7416                 /* *** Note that at this point we don't restore
7417                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7418                  * be used again soon. This is safe as long as nothing
7419                  * in the regexp code uses the pad ! */
7420                 PL_op = oop;
7421                 PL_curcop = ocurcop;
7422                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7423                 PL_curpm_under = PL_curpm;
7424                 PL_curpm = PL_reg_curpm;
7425
7426                 if (logical != 2) {
7427                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7428                     /* NOTREACHED */
7429                 }
7430             }
7431
7432                 /* only /(??{})/  from now on */
7433                 logical = 0;
7434                 {
7435                     /* extract RE object from returned value; compiling if
7436                      * necessary */
7437
7438                     if (re_sv) {
7439                         re_sv = reg_temp_copy(NULL, re_sv);
7440                     }
7441                     else {
7442                         U32 pm_flags = 0;
7443
7444                         if (SvUTF8(ret) && IN_BYTES) {
7445                             /* In use 'bytes': make a copy of the octet
7446                              * sequence, but without the flag on */
7447                             STRLEN len;
7448                             const char *const p = SvPV(ret, len);
7449                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7450                         }
7451                         if (rex->intflags & PREGf_USE_RE_EVAL)
7452                             pm_flags |= PMf_USE_RE_EVAL;
7453
7454                         /* if we got here, it should be an engine which
7455                          * supports compiling code blocks and stuff */
7456                         assert(rex->engine && rex->engine->op_comp);
7457                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7458                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7459                                     rex->engine, NULL, NULL,
7460                                     /* copy /msixn etc to inner pattern */
7461                                     ARG2L(scan),
7462                                     pm_flags);
7463
7464                         if (!(SvFLAGS(ret)
7465                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7466                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7467                             /* This isn't a first class regexp. Instead, it's
7468                                caching a regexp onto an existing, Perl visible
7469                                scalar.  */
7470                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7471                         }
7472                     }
7473                     SAVEFREESV(re_sv);
7474                     re = ReANY(re_sv);
7475                 }
7476                 RXp_MATCH_COPIED_off(re);
7477                 re->subbeg = rex->subbeg;
7478                 re->sublen = rex->sublen;
7479                 re->suboffset = rex->suboffset;
7480                 re->subcoffset = rex->subcoffset;
7481                 re->lastparen = 0;
7482                 re->lastcloseparen = 0;
7483                 rei = RXi_GET(re);
7484                 DEBUG_EXECUTE_r(
7485                     debug_start_match(re_sv, utf8_target, locinput,
7486                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7487                 );
7488                 startpoint = rei->program + 1;
7489                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7490                                              * close_paren only for GOSUB */
7491                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7492                 /* Save all the seen positions so far. */
7493                 ST.cp = regcppush(rex, 0, maxopenparen);
7494                 REGCP_SET(ST.lastcp);
7495                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7496                 maxopenparen = 0;
7497                 /* run the pattern returned from (??{...}) */
7498
7499               eval_recurse_doit: /* Share code with GOSUB below this line
7500                             * At this point we expect the stack context to be
7501                             * set up correctly */
7502
7503                 /* invalidate the S-L poscache. We're now executing a
7504                  * different set of WHILEM ops (and their associated
7505                  * indexes) against the same string, so the bits in the
7506                  * cache are meaningless. Setting maxiter to zero forces
7507                  * the cache to be invalidated and zeroed before reuse.
7508                  * XXX This is too dramatic a measure. Ideally we should
7509                  * save the old cache and restore when running the outer
7510                  * pattern again */
7511                 reginfo->poscache_maxiter = 0;
7512
7513                 /* the new regexp might have a different is_utf8_pat than we do */
7514                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7515
7516                 ST.prev_rex = rex_sv;
7517                 ST.prev_curlyx = cur_curlyx;
7518                 rex_sv = re_sv;
7519                 SET_reg_curpm(rex_sv);
7520                 rex = re;
7521                 rexi = rei;
7522                 cur_curlyx = NULL;
7523                 ST.B = next;
7524                 ST.prev_eval = cur_eval;
7525                 cur_eval = st;
7526                 /* now continue from first node in postoned RE */
7527                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7528                 NOT_REACHED; /* NOTREACHED */
7529         }
7530
7531         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7532             /* note: this is called twice; first after popping B, then A */
7533             DEBUG_STACK_r({
7534                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7535                     depth, cur_eval, ST.prev_eval);
7536             });
7537
7538 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7539             if ( cur_eval && CUR_EVAL.close_paren ) {\
7540                 DEBUG_STACK_r({ \
7541                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7542                         depth,    \
7543                         CUR_EVAL.close_paren - 1,\
7544                         cur_eval, \
7545                         VAL);     \
7546                 });               \
7547                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7548             }
7549
7550             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7551
7552             rex_sv = ST.prev_rex;
7553             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7554             SET_reg_curpm(rex_sv);
7555             rex = ReANY(rex_sv);
7556             rexi = RXi_GET(rex);
7557             {
7558                 /* preserve $^R across LEAVE's. See Bug 121070. */
7559                 SV *save_sv= GvSV(PL_replgv);
7560                 SvREFCNT_inc(save_sv);
7561                 regcpblow(ST.cp); /* LEAVE in disguise */
7562                 sv_setsv(GvSV(PL_replgv), save_sv);
7563                 SvREFCNT_dec(save_sv);
7564             }
7565             cur_eval = ST.prev_eval;
7566             cur_curlyx = ST.prev_curlyx;
7567
7568             /* Invalidate cache. See "invalidate" comment above. */
7569             reginfo->poscache_maxiter = 0;
7570             if ( nochange_depth )
7571                 nochange_depth--;
7572
7573             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7574             sayYES;
7575
7576
7577         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7578             REGCP_UNWIND(ST.lastcp);
7579             sayNO;
7580
7581         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7582             /* note: this is called twice; first after popping B, then A */
7583             DEBUG_STACK_r({
7584                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7585                     depth, cur_eval, ST.prev_eval);
7586             });
7587
7588             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7589
7590             rex_sv = ST.prev_rex;
7591             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7592             SET_reg_curpm(rex_sv);
7593             rex = ReANY(rex_sv);
7594             rexi = RXi_GET(rex);
7595
7596             REGCP_UNWIND(ST.lastcp);
7597             regcppop(rex, &maxopenparen);
7598             cur_eval = ST.prev_eval;
7599             cur_curlyx = ST.prev_curlyx;
7600
7601             /* Invalidate cache. See "invalidate" comment above. */
7602             reginfo->poscache_maxiter = 0;
7603             if ( nochange_depth )
7604                 nochange_depth--;
7605
7606             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7607             sayNO_SILENT;
7608 #undef ST
7609
7610         case OPEN: /*  (  */
7611             n = ARG(scan);  /* which paren pair */
7612             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7613             if (n > maxopenparen)
7614                 maxopenparen = n;
7615             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7616                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7617                 depth,
7618                 PTR2UV(rex),
7619                 PTR2UV(rex->offs),
7620                 (UV)n,
7621                 (IV)rex->offs[n].start_tmp,
7622                 (UV)maxopenparen
7623             ));
7624             lastopen = n;
7625             break;
7626
7627         case SROPEN: /*  (*SCRIPT_RUN:  */
7628             script_run_begin = (U8 *) locinput;
7629             break;
7630
7631
7632         case CLOSE:  /*  )  */
7633             n = ARG(scan);  /* which paren pair */
7634             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7635                              locinput - reginfo->strbeg);
7636             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7637                 goto fake_end;
7638
7639             break;
7640
7641         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7642
7643             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7644             {
7645                 sayNO;
7646             }
7647
7648             break;
7649
7650
7651         case ACCEPT:  /*  (*ACCEPT)  */
7652             if (scan->flags)
7653                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7654             if (ARG2L(scan)){
7655                 regnode *cursor;
7656                 for (cursor=scan;
7657                      cursor && OP(cursor)!=END;
7658                      cursor=regnext(cursor))
7659                 {
7660                     if ( OP(cursor)==CLOSE ){
7661                         n = ARG(cursor);
7662                         if ( n <= lastopen ) {
7663                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7664                                              locinput - reginfo->strbeg);
7665                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7666                                 break;
7667                         }
7668                     }
7669                 }
7670             }
7671             goto fake_end;
7672             /* NOTREACHED */
7673
7674         case GROUPP:  /*  (?(1))  */
7675             n = ARG(scan);  /* which paren pair */
7676             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7677             break;
7678
7679         case NGROUPP:  /*  (?(<name>))  */
7680             /* reg_check_named_buff_matched returns 0 for no match */
7681             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7682             break;
7683
7684         case INSUBP:   /*  (?(R))  */
7685             n = ARG(scan);
7686             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7687              * of SCAN is already set up as matches a eval.close_paren */
7688             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7689             break;
7690
7691         case DEFINEP:  /*  (?(DEFINE))  */
7692             sw = 0;
7693             break;
7694
7695         case IFTHEN:   /*  (?(cond)A|B)  */
7696             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7697             if (sw)
7698                 next = NEXTOPER(NEXTOPER(scan));
7699             else {
7700                 next = scan + ARG(scan);
7701                 if (OP(next) == IFTHEN) /* Fake one. */
7702                     next = NEXTOPER(NEXTOPER(next));
7703             }
7704             break;
7705
7706         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7707             logical = scan->flags;
7708             break;
7709
7710 /*******************************************************************
7711
7712 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7713 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7714 STAR/PLUS/CURLY/CURLYN are used instead.)
7715
7716 A*B is compiled as <CURLYX><A><WHILEM><B>
7717
7718 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7719 state, which contains the current count, initialised to -1. It also sets
7720 cur_curlyx to point to this state, with any previous value saved in the
7721 state block.
7722
7723 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7724 since the pattern may possibly match zero times (i.e. it's a while {} loop
7725 rather than a do {} while loop).
7726
7727 Each entry to WHILEM represents a successful match of A. The count in the
7728 CURLYX block is incremented, another WHILEM state is pushed, and execution
7729 passes to A or B depending on greediness and the current count.
7730
7731 For example, if matching against the string a1a2a3b (where the aN are
7732 substrings that match /A/), then the match progresses as follows: (the
7733 pushed states are interspersed with the bits of strings matched so far):
7734
7735     <CURLYX cnt=-1>
7736     <CURLYX cnt=0><WHILEM>
7737     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7738     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7739     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7740     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7741
7742 (Contrast this with something like CURLYM, which maintains only a single
7743 backtrack state:
7744
7745     <CURLYM cnt=0> a1
7746     a1 <CURLYM cnt=1> a2
7747     a1 a2 <CURLYM cnt=2> a3
7748     a1 a2 a3 <CURLYM cnt=3> b
7749 )
7750
7751 Each WHILEM state block marks a point to backtrack to upon partial failure
7752 of A or B, and also contains some minor state data related to that
7753 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7754 overall state, such as the count, and pointers to the A and B ops.
7755
7756 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7757 must always point to the *current* CURLYX block, the rules are:
7758
7759 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7760 and set cur_curlyx to point the new block.
7761
7762 When popping the CURLYX block after a successful or unsuccessful match,
7763 restore the previous cur_curlyx.
7764
7765 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7766 to the outer one saved in the CURLYX block.
7767
7768 When popping the WHILEM block after a successful or unsuccessful B match,
7769 restore the previous cur_curlyx.
7770
7771 Here's an example for the pattern (AI* BI)*BO
7772 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7773
7774 cur_
7775 curlyx backtrack stack
7776 ------ ---------------
7777 NULL
7778 CO     <CO prev=NULL> <WO>
7779 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7780 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7781 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7782
7783 At this point the pattern succeeds, and we work back down the stack to
7784 clean up, restoring as we go:
7785
7786 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7787 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7788 CO     <CO prev=NULL> <WO>
7789 NULL
7790
7791 *******************************************************************/
7792
7793 #define ST st->u.curlyx
7794
7795         case CURLYX:    /* start of /A*B/  (for complex A) */
7796         {
7797             /* No need to save/restore up to this paren */
7798             I32 parenfloor = scan->flags;
7799
7800             assert(next); /* keep Coverity happy */
7801             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7802                 next += ARG(next);
7803
7804             /* XXXX Probably it is better to teach regpush to support
7805                parenfloor > maxopenparen ... */
7806             if (parenfloor > (I32)rex->lastparen)
7807                 parenfloor = rex->lastparen; /* Pessimization... */
7808
7809             ST.prev_curlyx= cur_curlyx;
7810             cur_curlyx = st;
7811             ST.cp = PL_savestack_ix;
7812
7813             /* these fields contain the state of the current curly.
7814              * they are accessed by subsequent WHILEMs */
7815             ST.parenfloor = parenfloor;
7816             ST.me = scan;
7817             ST.B = next;
7818             ST.minmod = minmod;
7819             minmod = 0;
7820             ST.count = -1;      /* this will be updated by WHILEM */
7821             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7822
7823             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7824             NOT_REACHED; /* NOTREACHED */
7825         }
7826
7827         case CURLYX_end: /* just finished matching all of A*B */
7828             cur_curlyx = ST.prev_curlyx;
7829             sayYES;
7830             NOT_REACHED; /* NOTREACHED */
7831
7832         case CURLYX_end_fail: /* just failed to match all of A*B */
7833             regcpblow(ST.cp);
7834             cur_curlyx = ST.prev_curlyx;
7835             sayNO;
7836             NOT_REACHED; /* NOTREACHED */
7837
7838
7839 #undef ST
7840 #define ST st->u.whilem
7841
7842         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7843         {
7844             /* see the discussion above about CURLYX/WHILEM */
7845             I32 n;
7846             int min, max;
7847             regnode *A;
7848
7849             assert(cur_curlyx); /* keep Coverity happy */
7850
7851             min = ARG1(cur_curlyx->u.curlyx.me);
7852             max = ARG2(cur_curlyx->u.curlyx.me);
7853             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7854             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7855             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7856             ST.cache_offset = 0;
7857             ST.cache_mask = 0;
7858
7859
7860             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7861                   depth, (long)n, min, max)
7862             );
7863
7864             /* First just match a string of min A's. */
7865
7866             if (n < min) {
7867                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7868                 cur_curlyx->u.curlyx.lastloc = locinput;
7869                 REGCP_SET(ST.lastcp);
7870
7871                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7872                 NOT_REACHED; /* NOTREACHED */
7873             }
7874
7875             /* If degenerate A matches "", assume A done. */
7876
7877             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7878                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7879                    depth)
7880                 );
7881                 goto do_whilem_B_max;
7882             }
7883
7884             /* super-linear cache processing.
7885              *
7886              * The idea here is that for certain types of CURLYX/WHILEM -
7887              * principally those whose upper bound is infinity (and
7888              * excluding regexes that have things like \1 and other very
7889              * non-regular expresssiony things), then if a pattern like
7890              * /....A*.../ fails and we backtrack to the WHILEM, then we
7891              * make a note that this particular WHILEM op was at string
7892              * position 47 (say) when the rest of pattern failed. Then, if
7893              * we ever find ourselves back at that WHILEM, and at string
7894              * position 47 again, we can just fail immediately rather than
7895              * running the rest of the pattern again.
7896              *
7897              * This is very handy when patterns start to go
7898              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7899              * with a combinatorial explosion of backtracking.
7900              *
7901              * The cache is implemented as a bit array, with one bit per
7902              * string byte position per WHILEM op (up to 16) - so its
7903              * between 0.25 and 2x the string size.
7904              *
7905              * To avoid allocating a poscache buffer every time, we do an
7906              * initially countdown; only after we have  executed a WHILEM
7907              * op (string-length x #WHILEMs) times do we allocate the
7908              * cache.
7909              *
7910              * The top 4 bits of scan->flags byte say how many different
7911              * relevant CURLLYX/WHILEM op pairs there are, while the
7912              * bottom 4-bits is the identifying index number of this
7913              * WHILEM.
7914              */
7915
7916             if (scan->flags) {
7917
7918                 if (!reginfo->poscache_maxiter) {
7919                     /* start the countdown: Postpone detection until we
7920                      * know the match is not *that* much linear. */
7921                     reginfo->poscache_maxiter
7922                         =    (reginfo->strend - reginfo->strbeg + 1)
7923                            * (scan->flags>>4);
7924                     /* possible overflow for long strings and many CURLYX's */
7925                     if (reginfo->poscache_maxiter < 0)
7926                         reginfo->poscache_maxiter = I32_MAX;
7927                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7928                 }
7929
7930                 if (reginfo->poscache_iter-- == 0) {
7931                     /* initialise cache */
7932                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7933                     regmatch_info_aux *const aux = reginfo->info_aux;
7934                     if (aux->poscache) {
7935                         if ((SSize_t)reginfo->poscache_size < size) {
7936                             Renew(aux->poscache, size, char);
7937                             reginfo->poscache_size = size;
7938                         }
7939                         Zero(aux->poscache, size, char);
7940                     }
7941                     else {
7942                         reginfo->poscache_size = size;
7943                         Newxz(aux->poscache, size, char);
7944                     }
7945                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7946       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7947                               PL_colors[4], PL_colors[5])
7948                     );
7949                 }
7950
7951                 if (reginfo->poscache_iter < 0) {
7952                     /* have we already failed at this position? */
7953                     SSize_t offset, mask;
7954
7955                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7956                     offset  = (scan->flags & 0xf) - 1
7957                                 +   (locinput - reginfo->strbeg)
7958                                   * (scan->flags>>4);
7959                     mask    = 1 << (offset % 8);
7960                     offset /= 8;
7961                     if (reginfo->info_aux->poscache[offset] & mask) {
7962                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
7963                             depth)
7964                         );
7965                         cur_curlyx->u.curlyx.count--;
7966                         sayNO; /* cache records failure */
7967                     }
7968                     ST.cache_offset = offset;
7969                     ST.cache_mask   = mask;
7970                 }
7971             }
7972
7973             /* Prefer B over A for minimal matching. */
7974
7975             if (cur_curlyx->u.curlyx.minmod) {
7976                 ST.save_curlyx = cur_curlyx;
7977                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7978                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7979                                     locinput);
7980                 NOT_REACHED; /* NOTREACHED */
7981             }
7982
7983             /* Prefer A over B for maximal matching. */
7984
7985             if (n < max) { /* More greed allowed? */
7986                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7987                             maxopenparen);
7988                 cur_curlyx->u.curlyx.lastloc = locinput;
7989                 REGCP_SET(ST.lastcp);
7990                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7991                 NOT_REACHED; /* NOTREACHED */
7992             }
7993             goto do_whilem_B_max;
7994         }
7995         NOT_REACHED; /* NOTREACHED */
7996
7997         case WHILEM_B_min: /* just matched B in a minimal match */
7998         case WHILEM_B_max: /* just matched B in a maximal match */
7999             cur_curlyx = ST.save_curlyx;
8000             sayYES;
8001             NOT_REACHED; /* NOTREACHED */
8002
8003         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8004             cur_curlyx = ST.save_curlyx;
8005             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8006             cur_curlyx->u.curlyx.count--;
8007             CACHEsayNO;
8008             NOT_REACHED; /* NOTREACHED */
8009
8010         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8011             /* FALLTHROUGH */
8012         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8013             REGCP_UNWIND(ST.lastcp);
8014             regcppop(rex, &maxopenparen);
8015             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8016             cur_curlyx->u.curlyx.count--;
8017             CACHEsayNO;
8018             NOT_REACHED; /* NOTREACHED */
8019
8020         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8021             REGCP_UNWIND(ST.lastcp);
8022             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8023             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8024                 depth)
8025             );
8026           do_whilem_B_max:
8027             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8028                 && ckWARN(WARN_REGEXP)
8029                 && !reginfo->warned)
8030             {
8031                 reginfo->warned = TRUE;
8032                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8033                      "Complex regular subexpression recursion limit (%d) "
8034                      "exceeded",
8035                      REG_INFTY - 1);
8036             }
8037
8038             /* now try B */
8039             ST.save_curlyx = cur_curlyx;
8040             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8041             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8042                                 locinput);
8043             NOT_REACHED; /* NOTREACHED */
8044
8045         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8046             cur_curlyx = ST.save_curlyx;
8047
8048             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8049                 /* Maximum greed exceeded */
8050                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8051                     && ckWARN(WARN_REGEXP)
8052                     && !reginfo->warned)
8053                 {
8054                     reginfo->warned     = TRUE;
8055                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8056                         "Complex regular subexpression recursion "
8057                         "limit (%d) exceeded",
8058                         REG_INFTY - 1);
8059                 }
8060                 cur_curlyx->u.curlyx.count--;
8061                 CACHEsayNO;
8062             }
8063
8064             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8065             );
8066             /* Try grabbing another A and see if it helps. */
8067             cur_curlyx->u.curlyx.lastloc = locinput;
8068             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8069                             maxopenparen);
8070             REGCP_SET(ST.lastcp);
8071             PUSH_STATE_GOTO(WHILEM_A_min,
8072                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8073                 locinput);
8074             NOT_REACHED; /* NOTREACHED */
8075
8076 #undef  ST
8077 #define ST st->u.branch
8078
8079         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8080             next = scan + ARG(scan);
8081             if (next == scan)
8082                 next = NULL;
8083             scan = NEXTOPER(scan);
8084             /* FALLTHROUGH */
8085
8086         case BRANCH:        /*  /(...|A|...)/ */
8087             scan = NEXTOPER(scan); /* scan now points to inner node */
8088             ST.lastparen = rex->lastparen;
8089             ST.lastcloseparen = rex->lastcloseparen;
8090             ST.next_branch = next;
8091             REGCP_SET(ST.cp);
8092
8093             /* Now go into the branch */
8094             if (has_cutgroup) {
8095                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8096             } else {
8097                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8098             }
8099             NOT_REACHED; /* NOTREACHED */
8100
8101         case CUTGROUP:  /*  /(*THEN)/  */
8102             sv_yes_mark = st->u.mark.mark_name = scan->flags
8103                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8104                 : NULL;
8105             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8106             NOT_REACHED; /* NOTREACHED */
8107
8108         case CUTGROUP_next_fail:
8109             do_cutgroup = 1;
8110             no_final = 1;
8111             if (st->u.mark.mark_name)
8112                 sv_commit = st->u.mark.mark_name;
8113             sayNO;
8114             NOT_REACHED; /* NOTREACHED */
8115
8116         case BRANCH_next:
8117             sayYES;
8118             NOT_REACHED; /* NOTREACHED */
8119
8120         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8121             if (do_cutgroup) {
8122                 do_cutgroup = 0;
8123                 no_final = 0;
8124             }
8125             REGCP_UNWIND(ST.cp);
8126             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8127             scan = ST.next_branch;
8128             /* no more branches? */
8129             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8130                 DEBUG_EXECUTE_r({
8131                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8132                         depth,
8133                         PL_colors[4],
8134                         PL_colors[5] );
8135                 });
8136                 sayNO_SILENT;
8137             }
8138             continue; /* execute next BRANCH[J] op */
8139             /* NOTREACHED */
8140
8141         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8142             minmod = 1;
8143             break;
8144
8145 #undef  ST
8146 #define ST st->u.curlym
8147
8148         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8149
8150             /* This is an optimisation of CURLYX that enables us to push
8151              * only a single backtracking state, no matter how many matches
8152              * there are in {m,n}. It relies on the pattern being constant
8153              * length, with no parens to influence future backrefs
8154              */
8155
8156             ST.me = scan;
8157             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8158
8159             ST.lastparen      = rex->lastparen;
8160             ST.lastcloseparen = rex->lastcloseparen;
8161
8162             /* if paren positive, emulate an OPEN/CLOSE around A */
8163             if (ST.me->flags) {
8164                 U32 paren = ST.me->flags;
8165                 if (paren > maxopenparen)
8166                     maxopenparen = paren;
8167                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8168             }
8169             ST.A = scan;
8170             ST.B = next;
8171             ST.alen = 0;
8172             ST.count = 0;
8173             ST.minmod = minmod;
8174             minmod = 0;
8175             ST.c1 = CHRTEST_UNINIT;
8176             REGCP_SET(ST.cp);
8177
8178             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8179                 goto curlym_do_B;
8180
8181           curlym_do_A: /* execute the A in /A{m,n}B/  */
8182             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8183             NOT_REACHED; /* NOTREACHED */
8184
8185         case CURLYM_A: /* we've just matched an A */
8186             ST.count++;
8187             /* after first match, determine A's length: u.curlym.alen */
8188             if (ST.count == 1) {
8189                 if (reginfo->is_utf8_target) {
8190                     char *s = st->locinput;
8191                     while (s < locinput) {
8192                         ST.alen++;
8193                         s += UTF8SKIP(s);
8194                     }
8195                 }
8196                 else {
8197                     ST.alen = locinput - st->locinput;
8198                 }
8199                 if (ST.alen == 0)
8200                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8201             }
8202             DEBUG_EXECUTE_r(
8203                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8204                           depth, (IV) ST.count, (IV)ST.alen)
8205             );
8206
8207             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8208                 goto fake_end;
8209
8210             {
8211                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8212                 if ( max == REG_INFTY || ST.count < max )
8213                     goto curlym_do_A; /* try to match another A */
8214             }
8215             goto curlym_do_B; /* try to match B */
8216
8217         case CURLYM_A_fail: /* just failed to match an A */
8218             REGCP_UNWIND(ST.cp);
8219
8220
8221             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8222                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8223                 sayNO;
8224
8225           curlym_do_B: /* execute the B in /A{m,n}B/  */
8226             if (ST.c1 == CHRTEST_UNINIT) {
8227                 /* calculate c1 and c2 for possible match of 1st char
8228                  * following curly */
8229                 ST.c1 = ST.c2 = CHRTEST_VOID;
8230                 assert(ST.B);
8231                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8232                     regnode *text_node = ST.B;
8233                     if (! HAS_TEXT(text_node))
8234                         FIND_NEXT_IMPT(text_node);
8235                     /* this used to be
8236
8237                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8238
8239                         But the former is redundant in light of the latter.
8240
8241                         if this changes back then the macro for
8242                         IS_TEXT and friends need to change.
8243                      */
8244                     if (PL_regkind[OP(text_node)] == EXACT) {
8245                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8246                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8247                            reginfo))
8248                         {
8249                             sayNO;
8250                         }
8251                     }
8252                 }
8253             }
8254
8255             DEBUG_EXECUTE_r(
8256                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8257                     depth, (IV)ST.count)
8258                 );
8259             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8260                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8261                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8262                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8263                     {
8264                         /* simulate B failing */
8265                         DEBUG_OPTIMISE_r(
8266                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8267                                 depth,
8268                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8269                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8270                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8271                         );
8272                         state_num = CURLYM_B_fail;
8273                         goto reenter_switch;
8274                     }
8275                 }
8276                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8277                     /* simulate B failing */
8278                     DEBUG_OPTIMISE_r(
8279                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8280                             depth,
8281                             (int) nextchr, ST.c1, ST.c2)
8282                     );
8283                     state_num = CURLYM_B_fail;
8284                     goto reenter_switch;
8285                 }
8286             }
8287
8288             if (ST.me->flags) {
8289                 /* emulate CLOSE: mark current A as captured */
8290                 U32 paren = (U32)ST.me->flags;
8291                 if (ST.count) {
8292                     CLOSE_CAPTURE(paren,
8293                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8294                         locinput - reginfo->strbeg);
8295                 }
8296                 else
8297                     rex->offs[paren].end = -1;
8298
8299                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8300                 {
8301                     if (ST.count)
8302                         goto fake_end;
8303                     else
8304                         sayNO;
8305                 }
8306             }
8307
8308             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8309             NOT_REACHED; /* NOTREACHED */
8310
8311         case CURLYM_B_fail: /* just failed to match a B */
8312             REGCP_UNWIND(ST.cp);
8313             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8314             if (ST.minmod) {
8315                 I32 max = ARG2(ST.me);
8316                 if (max != REG_INFTY && ST.count == max)
8317                     sayNO;
8318                 goto curlym_do_A; /* try to match a further A */
8319             }
8320             /* backtrack one A */
8321             if (ST.count == ARG1(ST.me) /* min */)
8322                 sayNO;
8323             ST.count--;
8324             SET_locinput(HOPc(locinput, -ST.alen));
8325             goto curlym_do_B; /* try to match B */
8326
8327 #undef ST
8328 #define ST st->u.curly
8329
8330 #define CURLY_SETPAREN(paren, success) \
8331     if (paren) { \
8332         if (success) { \
8333             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8334                                  locinput - reginfo->strbeg); \
8335         } \
8336         else { \
8337             rex->offs[paren].end = -1; \
8338             rex->lastparen      = ST.lastparen; \
8339             rex->lastcloseparen = ST.lastcloseparen; \
8340         } \
8341     }
8342
8343         case STAR:              /*  /A*B/ where A is width 1 char */
8344             ST.paren = 0;
8345             ST.min = 0;
8346             ST.max = REG_INFTY;
8347             scan = NEXTOPER(scan);
8348             goto repeat;
8349
8350         case PLUS:              /*  /A+B/ where A is width 1 char */
8351             ST.paren = 0;
8352             ST.min = 1;
8353             ST.max = REG_INFTY;
8354             scan = NEXTOPER(scan);
8355             goto repeat;
8356
8357         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8358             ST.paren = scan->flags;     /* Which paren to set */
8359             ST.lastparen      = rex->lastparen;
8360             ST.lastcloseparen = rex->lastcloseparen;
8361             if (ST.paren > maxopenparen)
8362                 maxopenparen = ST.paren;
8363             ST.min = ARG1(scan);  /* min to match */
8364             ST.max = ARG2(scan);  /* max to match */
8365             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8366
8367             /* handle the single-char capture called as a GOSUB etc */
8368             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8369             {
8370                 char *li = locinput;
8371                 if (!regrepeat(rex, &li, scan, reginfo, 1))
8372                     sayNO;
8373                 SET_locinput(li);
8374                 goto fake_end;
8375             }
8376
8377             goto repeat;
8378
8379         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8380             ST.paren = 0;
8381             ST.min = ARG1(scan);  /* min to match */
8382             ST.max = ARG2(scan);  /* max to match */
8383             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8384           repeat:
8385             /*
8386             * Lookahead to avoid useless match attempts
8387             * when we know what character comes next.
8388             *
8389             * Used to only do .*x and .*?x, but now it allows
8390             * for )'s, ('s and (?{ ... })'s to be in the way
8391             * of the quantifier and the EXACT-like node.  -- japhy
8392             */
8393
8394             assert(ST.min <= ST.max);
8395             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8396                 ST.c1 = ST.c2 = CHRTEST_VOID;
8397             }
8398             else {
8399                 regnode *text_node = next;
8400
8401                 if (! HAS_TEXT(text_node))
8402                     FIND_NEXT_IMPT(text_node);
8403
8404                 if (! HAS_TEXT(text_node))
8405                     ST.c1 = ST.c2 = CHRTEST_VOID;
8406                 else {
8407                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8408                         ST.c1 = ST.c2 = CHRTEST_VOID;
8409                     }
8410                     else {
8411
8412                     /*  Currently we only get here when
8413
8414                         PL_rekind[OP(text_node)] == EXACT
8415
8416                         if this changes back then the macro for IS_TEXT and
8417                         friends need to change. */
8418                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8419                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8420                            reginfo))
8421                         {
8422                             sayNO;
8423                         }
8424                     }
8425                 }
8426             }
8427
8428             ST.A = scan;
8429             ST.B = next;
8430             if (minmod) {
8431                 char *li = locinput;
8432                 minmod = 0;
8433                 if (ST.min &&
8434                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8435                             < ST.min)
8436                     sayNO;
8437                 SET_locinput(li);
8438                 ST.count = ST.min;
8439                 REGCP_SET(ST.cp);
8440                 if (ST.c1 == CHRTEST_VOID)
8441                     goto curly_try_B_min;
8442
8443                 ST.oldloc = locinput;
8444
8445                 /* set ST.maxpos to the furthest point along the
8446                  * string that could possibly match */
8447                 if  (ST.max == REG_INFTY) {
8448                     ST.maxpos = reginfo->strend - 1;
8449                     if (utf8_target)
8450                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8451                             ST.maxpos--;
8452                 }
8453                 else if (utf8_target) {
8454                     int m = ST.max - ST.min;
8455                     for (ST.maxpos = locinput;
8456                          m >0 && ST.maxpos < reginfo->strend; m--)
8457                         ST.maxpos += UTF8SKIP(ST.maxpos);
8458                 }
8459                 else {
8460                     ST.maxpos = locinput + ST.max - ST.min;
8461                     if (ST.maxpos >= reginfo->strend)
8462                         ST.maxpos = reginfo->strend - 1;
8463                 }
8464                 goto curly_try_B_min_known;
8465
8466             }
8467             else {
8468                 /* avoid taking address of locinput, so it can remain
8469                  * a register var */
8470                 char *li = locinput;
8471                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8472                 if (ST.count < ST.min)
8473                     sayNO;
8474                 SET_locinput(li);
8475                 if ((ST.count > ST.min)
8476                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8477                 {
8478                     /* A{m,n} must come at the end of the string, there's
8479                      * no point in backing off ... */
8480                     ST.min = ST.count;
8481                     /* ...except that $ and \Z can match before *and* after
8482                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8483                        We may back off by one in this case. */
8484                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8485                         ST.min--;
8486                 }
8487                 REGCP_SET(ST.cp);
8488                 goto curly_try_B_max;
8489             }
8490             NOT_REACHED; /* NOTREACHED */
8491
8492         case CURLY_B_min_fail:
8493             /* failed to find B in a non-greedy match.
8494              * Handles both cases where c1,c2 valid or not */
8495
8496             REGCP_UNWIND(ST.cp);
8497             if (ST.paren) {
8498                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8499             }
8500
8501             if (ST.c1 == CHRTEST_VOID) {
8502                 /* failed -- move forward one */
8503                 char *li = locinput;
8504                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8505                     sayNO;
8506                 }
8507                 locinput = li;
8508                 ST.count++;
8509                 if (!(   ST.count <= ST.max
8510                         /* count overflow ? */
8511                      || (ST.max == REG_INFTY && ST.count > 0))
8512                 )
8513                     sayNO;
8514             }
8515             else {
8516                 int n;
8517                 /* Couldn't or didn't -- move forward. */
8518                 ST.oldloc = locinput;
8519                 if (utf8_target)
8520                     locinput += UTF8SKIP(locinput);
8521                 else
8522                     locinput++;
8523                 ST.count++;
8524
8525               curly_try_B_min_known:
8526                 /* find the next place where 'B' could work, then call B */
8527                 if (utf8_target) {
8528                     n = (ST.oldloc == locinput) ? 0 : 1;
8529                     if (ST.c1 == ST.c2) {
8530                         /* set n to utf8_distance(oldloc, locinput) */
8531                         while (locinput <= ST.maxpos
8532                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8533                         {
8534                             locinput += UTF8SKIP(locinput);
8535                             n++;
8536                         }
8537                     }
8538                     else {
8539                         /* set n to utf8_distance(oldloc, locinput) */
8540                         while (locinput <= ST.maxpos
8541                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8542                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8543                         {
8544                             locinput += UTF8SKIP(locinput);
8545                             n++;
8546                         }
8547                     }
8548                 }
8549                 else {  /* Not utf8_target */
8550                     if (ST.c1 == ST.c2) {
8551                         locinput = (char *) memchr(locinput,
8552                                                    ST.c1,
8553                                                    ST.maxpos + 1 - locinput);
8554                         if (! locinput) {
8555                             locinput = ST.maxpos + 1;
8556                         }
8557                     }
8558                     else {
8559                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8560
8561                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8562                             while (   locinput <= ST.maxpos
8563                                    && UCHARAT(locinput) != ST.c1
8564                                    && UCHARAT(locinput) != ST.c2)
8565                             {
8566                                 locinput++;
8567                             }
8568                         }
8569                         else {
8570                             /* If c1 and c2 only differ by a single bit, we can
8571                              * avoid a conditional each time through the loop,
8572                              * at the expense of a little preliminary setup and
8573                              * an extra mask each iteration.  By masking out
8574                              * that bit, we match exactly two characters, c1
8575                              * and c2, and so we don't have to test for both.
8576                              * On both ASCII and EBCDIC platforms, most of the
8577                              * ASCII-range and Latin1-range folded equivalents
8578                              * differ only in a single bit, so this is actually
8579                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8580                              * 0x61). */
8581                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8582                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8583                             while (   locinput <= ST.maxpos
8584                                    && (UCHARAT(locinput) & c1_c2_mask)
8585                                                                 != c1_masked)
8586                             {
8587                                 locinput++;
8588                             }
8589                         }
8590                     }
8591                     n = locinput - ST.oldloc;
8592                 }
8593                 if (locinput > ST.maxpos)
8594                     sayNO;
8595                 if (n) {
8596                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8597                      * at b; check that everything between oldloc and
8598                      * locinput matches */
8599                     char *li = ST.oldloc;
8600                     ST.count += n;
8601                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8602                         sayNO;
8603                     assert(n == REG_INFTY || locinput == li);
8604                 }
8605             }
8606
8607           curly_try_B_min:
8608             CURLY_SETPAREN(ST.paren, ST.count);
8609             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8610             NOT_REACHED; /* NOTREACHED */
8611
8612
8613           curly_try_B_max:
8614             /* a successful greedy match: now try to match B */
8615             {
8616                 bool could_match = locinput < reginfo->strend;
8617
8618                 /* If it could work, try it. */
8619                 if (ST.c1 != CHRTEST_VOID && could_match) {
8620                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8621                     {
8622                         could_match = memEQ(locinput,
8623                                             ST.c1_utf8,
8624                                             UTF8SKIP(locinput))
8625                                     || memEQ(locinput,
8626                                              ST.c2_utf8,
8627                                              UTF8SKIP(locinput));
8628                     }
8629                     else {
8630                         could_match = UCHARAT(locinput) == ST.c1
8631                                       || UCHARAT(locinput) == ST.c2;
8632                     }
8633                 }
8634                 if (ST.c1 == CHRTEST_VOID || could_match) {
8635                     CURLY_SETPAREN(ST.paren, ST.count);
8636                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8637                     NOT_REACHED; /* NOTREACHED */
8638                 }
8639             }
8640             /* FALLTHROUGH */
8641
8642         case CURLY_B_max_fail:
8643             /* failed to find B in a greedy match */
8644
8645             REGCP_UNWIND(ST.cp);
8646             if (ST.paren) {
8647                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8648             }
8649             /*  back up. */
8650             if (--ST.count < ST.min)
8651                 sayNO;
8652             locinput = HOPc(locinput, -1);
8653             goto curly_try_B_max;
8654
8655 #undef ST
8656
8657         case END: /*  last op of main pattern  */
8658           fake_end:
8659             if (cur_eval) {
8660                 /* we've just finished A in /(??{A})B/; now continue with B */
8661                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8662                 st->u.eval.prev_rex = rex_sv;           /* inner */
8663
8664                 /* Save *all* the positions. */
8665                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8666                 rex_sv = CUR_EVAL.prev_rex;
8667                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8668                 SET_reg_curpm(rex_sv);
8669                 rex = ReANY(rex_sv);
8670                 rexi = RXi_GET(rex);
8671
8672                 st->u.eval.prev_curlyx = cur_curlyx;
8673                 cur_curlyx = CUR_EVAL.prev_curlyx;
8674
8675                 REGCP_SET(st->u.eval.lastcp);
8676
8677                 /* Restore parens of the outer rex without popping the
8678                  * savestack */
8679                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8680
8681                 st->u.eval.prev_eval = cur_eval;
8682                 cur_eval = CUR_EVAL.prev_eval;
8683                 DEBUG_EXECUTE_r(
8684                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8685                                       depth, cur_eval););
8686                 if ( nochange_depth )
8687                     nochange_depth--;
8688
8689                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8690
8691                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8692                                     locinput); /* match B */
8693             }
8694
8695             if (locinput < reginfo->till) {
8696                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8697                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8698                                       PL_colors[4],
8699                                       (long)(locinput - startpos),
8700                                       (long)(reginfo->till - startpos),
8701                                       PL_colors[5]));
8702
8703                 sayNO_SILENT;           /* Cannot match: too short. */
8704             }
8705             sayYES;                     /* Success! */
8706
8707         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8708             DEBUG_EXECUTE_r(
8709             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8710                 depth, PL_colors[4], PL_colors[5]));
8711             sayYES;                     /* Success! */
8712
8713 #undef  ST
8714 #define ST st->u.ifmatch
8715
8716         {
8717             char *newstart;
8718
8719         case SUSPEND:   /* (?>A) */
8720             ST.wanted = 1;
8721             newstart = locinput;
8722             goto do_ifmatch;
8723
8724         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8725             ST.wanted = 0;
8726             goto ifmatch_trivial_fail_test;
8727
8728         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8729             ST.wanted = 1;
8730           ifmatch_trivial_fail_test:
8731             if (scan->flags) {
8732                 char * const s = HOPBACKc(locinput, scan->flags);
8733                 if (!s) {
8734                     /* trivial fail */
8735                     if (logical) {
8736                         logical = 0;
8737                         sw = 1 - cBOOL(ST.wanted);
8738                     }
8739                     else if (ST.wanted)
8740                         sayNO;
8741                     next = scan + ARG(scan);
8742                     if (next == scan)
8743                         next = NULL;
8744                     break;
8745                 }
8746                 newstart = s;
8747             }
8748             else
8749                 newstart = locinput;
8750
8751           do_ifmatch:
8752             ST.me = scan;
8753             ST.logical = logical;
8754             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8755
8756             /* execute body of (?...A) */
8757             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8758             NOT_REACHED; /* NOTREACHED */
8759         }
8760
8761         case IFMATCH_A_fail: /* body of (?...A) failed */
8762             ST.wanted = !ST.wanted;
8763             /* FALLTHROUGH */
8764
8765         case IFMATCH_A: /* body of (?...A) succeeded */
8766             if (ST.logical) {
8767                 sw = cBOOL(ST.wanted);
8768             }
8769             else if (!ST.wanted)
8770                 sayNO;
8771
8772             if (OP(ST.me) != SUSPEND) {
8773                 /* restore old position except for (?>...) */
8774                 locinput = st->locinput;
8775             }
8776             scan = ST.me + ARG(ST.me);
8777             if (scan == ST.me)
8778                 scan = NULL;
8779             continue; /* execute B */
8780
8781 #undef ST
8782
8783         case LONGJMP: /*  alternative with many branches compiles to
8784                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8785             next = scan + ARG(scan);
8786             if (next == scan)
8787                 next = NULL;
8788             break;
8789
8790         case COMMIT:  /*  (*COMMIT)  */
8791             reginfo->cutpoint = reginfo->strend;
8792             /* FALLTHROUGH */
8793
8794         case PRUNE:   /*  (*PRUNE)   */
8795             if (scan->flags)
8796                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8797             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8798             NOT_REACHED; /* NOTREACHED */
8799
8800         case COMMIT_next_fail:
8801             no_final = 1;
8802             /* FALLTHROUGH */
8803             sayNO;
8804             NOT_REACHED; /* NOTREACHED */
8805
8806         case OPFAIL:   /* (*FAIL)  */
8807             if (scan->flags)
8808                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8809             if (logical) {
8810                 /* deal with (?(?!)X|Y) properly,
8811                  * make sure we trigger the no branch
8812                  * of the trailing IFTHEN structure*/
8813                 sw= 0;
8814                 break;
8815             } else {
8816                 sayNO;
8817             }
8818             NOT_REACHED; /* NOTREACHED */
8819
8820 #define ST st->u.mark
8821         case MARKPOINT: /*  (*MARK:foo)  */
8822             ST.prev_mark = mark_state;
8823             ST.mark_name = sv_commit = sv_yes_mark
8824                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8825             mark_state = st;
8826             ST.mark_loc = locinput;
8827             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8828             NOT_REACHED; /* NOTREACHED */
8829
8830         case MARKPOINT_next:
8831             mark_state = ST.prev_mark;
8832             sayYES;
8833             NOT_REACHED; /* NOTREACHED */
8834
8835         case MARKPOINT_next_fail:
8836             if (popmark && sv_eq(ST.mark_name,popmark))
8837             {
8838                 if (ST.mark_loc > startpoint)
8839                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8840                 popmark = NULL; /* we found our mark */
8841                 sv_commit = ST.mark_name;
8842
8843                 DEBUG_EXECUTE_r({
8844                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8845                             depth,
8846                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8847                 });
8848             }
8849             mark_state = ST.prev_mark;
8850             sv_yes_mark = mark_state ?
8851                 mark_state->u.mark.mark_name : NULL;
8852             sayNO;
8853             NOT_REACHED; /* NOTREACHED */
8854
8855         case SKIP:  /*  (*SKIP)  */
8856             if (!scan->flags) {
8857                 /* (*SKIP) : if we fail we cut here*/
8858                 ST.mark_name = NULL;
8859                 ST.mark_loc = locinput;
8860                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8861             } else {
8862                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8863                    otherwise do nothing.  Meaning we need to scan
8864                  */
8865                 regmatch_state *cur = mark_state;
8866                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8867
8868                 while (cur) {
8869                     if ( sv_eq( cur->u.mark.mark_name,
8870                                 find ) )
8871                     {
8872                         ST.mark_name = find;
8873                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8874                     }
8875                     cur = cur->u.mark.prev_mark;
8876                 }
8877             }
8878             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8879             break;
8880
8881         case SKIP_next_fail:
8882             if (ST.mark_name) {
8883                 /* (*CUT:NAME) - Set up to search for the name as we
8884                    collapse the stack*/
8885                 popmark = ST.mark_name;
8886             } else {
8887                 /* (*CUT) - No name, we cut here.*/
8888                 if (ST.mark_loc > startpoint)
8889                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8890                 /* but we set sv_commit to latest mark_name if there
8891                    is one so they can test to see how things lead to this
8892                    cut */
8893                 if (mark_state)
8894                     sv_commit=mark_state->u.mark.mark_name;
8895             }
8896             no_final = 1;
8897             sayNO;
8898             NOT_REACHED; /* NOTREACHED */
8899 #undef ST
8900
8901         case LNBREAK: /* \R */
8902             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8903                 locinput += n;
8904             } else
8905                 sayNO;
8906             break;
8907
8908         default:
8909             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8910                           PTR2UV(scan), OP(scan));
8911             Perl_croak(aTHX_ "regexp memory corruption");
8912
8913         /* this is a point to jump to in order to increment
8914          * locinput by one character */
8915           increment_locinput:
8916             assert(!NEXTCHR_IS_EOS);
8917             if (utf8_target) {
8918                 locinput += PL_utf8skip[nextchr];
8919                 /* locinput is allowed to go 1 char off the end (signifying
8920                  * EOS), but not 2+ */
8921                 if (locinput > reginfo->strend)
8922                     sayNO;
8923             }
8924             else
8925                 locinput++;
8926             break;
8927
8928         } /* end switch */
8929
8930         /* switch break jumps here */
8931         scan = next; /* prepare to execute the next op and ... */
8932         continue;    /* ... jump back to the top, reusing st */
8933         /* NOTREACHED */
8934
8935       push_yes_state:
8936         /* push a state that backtracks on success */
8937         st->u.yes.prev_yes_state = yes_state;
8938         yes_state = st;
8939         /* FALLTHROUGH */
8940       push_state:
8941         /* push a new regex state, then continue at scan  */
8942         {
8943             regmatch_state *newst;
8944
8945             DEBUG_STACK_r({
8946                 regmatch_state *cur = st;
8947                 regmatch_state *curyes = yes_state;
8948                 U32 i;
8949                 regmatch_slab *slab = PL_regmatch_slab;
8950                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8951                     if (cur < SLAB_FIRST(slab)) {
8952                         slab = slab->prev;
8953                         cur = SLAB_LAST(slab);
8954                     }
8955                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8956                         depth,
8957                         i ? "    " : "push",
8958                         depth - i, PL_reg_name[cur->resume_state],
8959                         (curyes == cur) ? "yes" : ""
8960                     );
8961                     if (curyes == cur)
8962                         curyes = cur->u.yes.prev_yes_state;
8963                 }
8964             } else
8965                 DEBUG_STATE_pp("push")
8966             );
8967             depth++;
8968             st->locinput = locinput;
8969             newst = st+1;
8970             if (newst >  SLAB_LAST(PL_regmatch_slab))
8971                 newst = S_push_slab(aTHX);
8972             PL_regmatch_state = newst;
8973
8974             locinput = pushinput;
8975             st = newst;
8976             continue;
8977             /* NOTREACHED */
8978         }
8979     }
8980 #ifdef SOLARIS_BAD_OPTIMIZER
8981 #  undef PL_charclass
8982 #endif
8983
8984     /*
8985     * We get here only if there's trouble -- normally "case END" is
8986     * the terminating point.
8987     */
8988     Perl_croak(aTHX_ "corrupted regexp pointers");
8989     NOT_REACHED; /* NOTREACHED */
8990
8991   yes:
8992     if (yes_state) {
8993         /* we have successfully completed a subexpression, but we must now
8994          * pop to the state marked by yes_state and continue from there */
8995         assert(st != yes_state);
8996 #ifdef DEBUGGING
8997         while (st != yes_state) {
8998             st--;
8999             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9000                 PL_regmatch_slab = PL_regmatch_slab->prev;
9001                 st = SLAB_LAST(PL_regmatch_slab);
9002             }
9003             DEBUG_STATE_r({
9004                 if (no_final) {
9005                     DEBUG_STATE_pp("pop (no final)");
9006                 } else {
9007                     DEBUG_STATE_pp("pop (yes)");
9008                 }
9009             });
9010             depth--;
9011         }
9012 #else
9013         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9014             || yes_state > SLAB_LAST(PL_regmatch_slab))
9015         {
9016             /* not in this slab, pop slab */
9017             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9018             PL_regmatch_slab = PL_regmatch_slab->prev;
9019             st = SLAB_LAST(PL_regmatch_slab);
9020         }
9021         depth -= (st - yes_state);
9022 #endif
9023         st = yes_state;
9024         yes_state = st->u.yes.prev_yes_state;
9025         PL_regmatch_state = st;
9026
9027         if (no_final)
9028             locinput= st->locinput;
9029         state_num = st->resume_state + no_final;
9030         goto reenter_switch;
9031     }
9032
9033     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9034                           PL_colors[4], PL_colors[5]));
9035
9036     if (reginfo->info_aux_eval) {
9037         /* each successfully executed (?{...}) block does the equivalent of
9038          *   local $^R = do {...}
9039          * When popping the save stack, all these locals would be undone;
9040          * bypass this by setting the outermost saved $^R to the latest
9041          * value */
9042         /* I dont know if this is needed or works properly now.
9043          * see code related to PL_replgv elsewhere in this file.
9044          * Yves
9045          */
9046         if (oreplsv != GvSV(PL_replgv))
9047             sv_setsv(oreplsv, GvSV(PL_replgv));
9048     }
9049     result = 1;
9050     goto final_exit;
9051
9052   no:
9053     DEBUG_EXECUTE_r(
9054         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9055             depth,
9056             PL_colors[4], PL_colors[5])
9057         );
9058
9059   no_silent:
9060     if (no_final) {
9061         if (yes_state) {
9062             goto yes;
9063         } else {
9064             goto final_exit;
9065         }
9066     }
9067     if (depth) {
9068         /* there's a previous state to backtrack to */
9069         st--;
9070         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9071             PL_regmatch_slab = PL_regmatch_slab->prev;
9072             st = SLAB_LAST(PL_regmatch_slab);
9073         }
9074         PL_regmatch_state = st;
9075         locinput= st->locinput;
9076
9077         DEBUG_STATE_pp("pop");
9078         depth--;
9079         if (yes_state == st)
9080             yes_state = st->u.yes.prev_yes_state;
9081
9082         state_num = st->resume_state + 1; /* failure = success + 1 */
9083         PERL_ASYNC_CHECK();
9084         goto reenter_switch;
9085     }
9086     result = 0;
9087
9088   final_exit:
9089     if (rex->intflags & PREGf_VERBARG_SEEN) {
9090         SV *sv_err = get_sv("REGERROR", 1);
9091         SV *sv_mrk = get_sv("REGMARK", 1);
9092         if (result) {
9093             sv_commit = &PL_sv_no;
9094             if (!sv_yes_mark)
9095                 sv_yes_mark = &PL_sv_yes;
9096         } else {
9097             if (!sv_commit)
9098                 sv_commit = &PL_sv_yes;
9099             sv_yes_mark = &PL_sv_no;
9100         }
9101         assert(sv_err);
9102         assert(sv_mrk);
9103         sv_setsv(sv_err, sv_commit);
9104         sv_setsv(sv_mrk, sv_yes_mark);
9105     }
9106
9107
9108     if (last_pushed_cv) {
9109         dSP;
9110         /* see "Some notes about MULTICALL" above */
9111         POP_MULTICALL;
9112         PERL_UNUSED_VAR(SP);
9113     }
9114     else
9115         LEAVE_SCOPE(orig_savestack_ix);
9116
9117     assert(!result ||  locinput - reginfo->strbeg >= 0);
9118     return result ?  locinput - reginfo->strbeg : -1;
9119 }
9120
9121 /*
9122  - regrepeat - repeatedly match something simple, report how many
9123  *
9124  * What 'simple' means is a node which can be the operand of a quantifier like
9125  * '+', or {1,3}
9126  *
9127  * startposp - pointer a pointer to the start position.  This is updated
9128  *             to point to the byte following the highest successful
9129  *             match.
9130  * p         - the regnode to be repeatedly matched against.
9131  * reginfo   - struct holding match state, such as strend
9132  * max       - maximum number of things to match.
9133  * depth     - (for debugging) backtracking depth.
9134  */
9135 STATIC I32
9136 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9137             regmatch_info *const reginfo, I32 max _pDEPTH)
9138 {
9139     char *scan;     /* Pointer to current position in target string */
9140     I32 c;
9141     char *loceol = reginfo->strend;   /* local version */
9142     I32 hardcount = 0;  /* How many matches so far */
9143     bool utf8_target = reginfo->is_utf8_target;
9144     unsigned int to_complement = 0;  /* Invert the result? */
9145     UV utf8_flags;
9146     _char_class_number classnum;
9147
9148     PERL_ARGS_ASSERT_REGREPEAT;
9149
9150     scan = *startposp;
9151     if (max == REG_INFTY)
9152         max = I32_MAX;
9153     else if (! utf8_target && loceol - scan > max)
9154         loceol = scan + max;
9155
9156     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9157      * to the maximum of how far we should go in it (leaving it set to the real
9158      * end, if the maximum permissible would take us beyond that).  This allows
9159      * us to make the loop exit condition that we haven't gone past <loceol> to
9160      * also mean that we haven't exceeded the max permissible count, saving a
9161      * test each time through the loop.  But it assumes that the OP matches a
9162      * single byte, which is true for most of the OPs below when applied to a
9163      * non-UTF-8 target.  Those relatively few OPs that don't have this
9164      * characteristic will have to compensate.
9165      *
9166      * There is no adjustment for UTF-8 targets, as the number of bytes per
9167      * character varies.  OPs will have to test both that the count is less
9168      * than the max permissible (using <hardcount> to keep track), and that we
9169      * are still within the bounds of the string (using <loceol>.  A few OPs
9170      * match a single byte no matter what the encoding.  They can omit the max
9171      * test if, for the UTF-8 case, they do the adjustment that was skipped
9172      * above.
9173      *
9174      * Thus, the code above sets things up for the common case; and exceptional
9175      * cases need extra work; the common case is to make sure <scan> doesn't
9176      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9177      * count doesn't exceed the maximum permissible */
9178
9179     switch (OP(p)) {
9180     case REG_ANY:
9181         if (utf8_target) {
9182             while (scan < loceol && hardcount < max && *scan != '\n') {
9183                 scan += UTF8SKIP(scan);
9184                 hardcount++;
9185             }
9186         } else {
9187             scan = (char *) memchr(scan, '\n', loceol - scan);
9188             if (! scan) {
9189                 scan = loceol;
9190             }
9191         }
9192         break;
9193     case SANY:
9194         if (utf8_target) {
9195             while (scan < loceol && hardcount < max) {
9196                 scan += UTF8SKIP(scan);
9197                 hardcount++;
9198             }
9199         }
9200         else
9201             scan = loceol;
9202         break;
9203     case EXACTL:
9204         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9205         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9206             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9207         }
9208         /* FALLTHROUGH */
9209     case EXACT:
9210         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9211
9212         c = (U8)*STRING(p);
9213
9214         /* Can use a simple find if the pattern char to match on is invariant
9215          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9216          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9217          * true iff it doesn't matter if the argument is in UTF-8 or not */
9218         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9219             if (utf8_target && loceol - scan > max) {
9220                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9221                  * since here, to match at all, 1 char == 1 byte */
9222                 loceol = scan + max;
9223             }
9224             scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9225         }
9226         else if (reginfo->is_utf8_pat) {
9227             if (utf8_target) {
9228                 STRLEN scan_char_len;
9229
9230                 /* When both target and pattern are UTF-8, we have to do
9231                  * string EQ */
9232                 while (hardcount < max
9233                        && scan < loceol
9234                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9235                        && memEQ(scan, STRING(p), scan_char_len))
9236                 {
9237                     scan += scan_char_len;
9238                     hardcount++;
9239                 }
9240             }
9241             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9242
9243                 /* Target isn't utf8; convert the character in the UTF-8
9244                  * pattern to non-UTF8, and do a simple find */
9245                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9246                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9247             } /* else pattern char is above Latin1, can't possibly match the
9248                  non-UTF-8 target */
9249         }
9250         else {
9251
9252             /* Here, the string must be utf8; pattern isn't, and <c> is
9253              * different in utf8 than not, so can't compare them directly.
9254              * Outside the loop, find the two utf8 bytes that represent c, and
9255              * then look for those in sequence in the utf8 string */
9256             U8 high = UTF8_TWO_BYTE_HI(c);
9257             U8 low = UTF8_TWO_BYTE_LO(c);
9258
9259             while (hardcount < max
9260                     && scan + 1 < loceol
9261                     && UCHARAT(scan) == high
9262                     && UCHARAT(scan + 1) == low)
9263             {
9264                 scan += 2;
9265                 hardcount++;
9266             }
9267         }
9268         break;
9269
9270     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9271         assert(! reginfo->is_utf8_pat);
9272         /* FALLTHROUGH */
9273     case EXACTFAA:
9274         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9275         goto do_exactf;
9276
9277     case EXACTFL:
9278         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9279         utf8_flags = FOLDEQ_LOCALE;
9280         goto do_exactf;
9281
9282     case EXACTF:   /* This node only generated for non-utf8 patterns */
9283         assert(! reginfo->is_utf8_pat);
9284         utf8_flags = 0;
9285         goto do_exactf;
9286
9287     case EXACTFLU8:
9288         if (! utf8_target) {
9289             break;
9290         }
9291         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9292                                     | FOLDEQ_S2_FOLDS_SANE;
9293         goto do_exactf;
9294
9295     case EXACTFU_SS:
9296     case EXACTFU:
9297         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9298
9299       do_exactf: {
9300         int c1, c2;
9301         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9302
9303         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9304
9305         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9306                                         reginfo))
9307         {
9308             if (c1 == CHRTEST_VOID) {
9309                 /* Use full Unicode fold matching */
9310                 char *tmpeol = reginfo->strend;
9311                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9312                 while (hardcount < max
9313                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9314                                              STRING(p), NULL, pat_len,
9315                                              reginfo->is_utf8_pat, utf8_flags))
9316                 {
9317                     scan = tmpeol;
9318                     tmpeol = reginfo->strend;
9319                     hardcount++;
9320                 }
9321             }
9322             else if (utf8_target) {
9323                 if (c1 == c2) {
9324                     while (scan < loceol
9325                            && hardcount < max
9326                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9327                     {
9328                         scan += UTF8SKIP(scan);
9329                         hardcount++;
9330                     }
9331                 }
9332                 else {
9333                     while (scan < loceol
9334                            && hardcount < max
9335                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9336                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9337                     {
9338                         scan += UTF8SKIP(scan);
9339                         hardcount++;
9340                     }
9341                 }
9342             }
9343             else if (c1 == c2) {
9344                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9345             }
9346             else {
9347                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9348                  * a conditional each time through the loop if the characters
9349                  * differ only in a single bit, as is the usual situation */
9350                 U8 c1_c2_bits_differing = c1 ^ c2;
9351
9352                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9353                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9354
9355                     scan = (char *) find_span_end_mask((U8 *) scan,
9356                                                        (U8 *) loceol,
9357                                                        c1 & c1_c2_mask,
9358                                                        c1_c2_mask);
9359                 }
9360                 else {
9361                     while (    scan < loceol
9362                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9363                     {
9364                         scan++;
9365                     }
9366                 }
9367             }
9368         }
9369         break;
9370     }
9371     case ANYOFPOSIXL:
9372     case ANYOFL:
9373         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9374
9375         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9376             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9377         }
9378         /* FALLTHROUGH */
9379     case ANYOFD:
9380     case ANYOF:
9381         if (utf8_target) {
9382             while (hardcount < max
9383                    && scan < loceol
9384                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9385             {
9386                 scan += UTF8SKIP(scan);
9387                 hardcount++;
9388             }
9389         }
9390         else if (ANYOF_FLAGS(p)) {
9391             while (scan < loceol
9392                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9393                 scan++;
9394         }
9395         else {
9396             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9397                 scan++;
9398         }
9399         break;
9400
9401     case ANYOFM:
9402         if (utf8_target && loceol - scan > max) {
9403
9404             /* We didn't adjust <loceol> at the beginning of this routine
9405              * because is UTF-8, but it is actually ok to do so, since here, to
9406              * match, 1 char == 1 byte. */
9407             loceol = scan + max;
9408         }
9409
9410         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9411         break;
9412
9413     case NANYOFM:
9414         if (utf8_target) {
9415             while (     hardcount < max
9416                    &&   scan < loceol
9417                    &&  (*scan & FLAGS(p)) != ARG(p))
9418             {
9419                 scan += UTF8SKIP(scan);
9420                 hardcount++;
9421             }
9422         }
9423         else {
9424             scan = (char *) find_next_masked((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9425         }
9426         break;
9427
9428     case ASCII:
9429         if (utf8_target && loceol - scan > max) {
9430             loceol = scan + max;
9431         }
9432
9433         scan = find_next_non_ascii(scan, loceol, utf8_target);
9434         break;
9435
9436     case NASCII:
9437         if (utf8_target) {
9438             while (     hardcount < max
9439                    &&   scan < loceol
9440                    && ! isASCII_utf8_safe(scan, loceol))
9441             {
9442                 scan += UTF8SKIP(scan);
9443                 hardcount++;
9444             }
9445         }
9446         else {
9447             scan = find_next_ascii(scan, loceol, utf8_target);
9448         }
9449         break;
9450
9451     /* The argument (FLAGS) to all the POSIX node types is the class number */
9452
9453     case NPOSIXL:
9454         to_complement = 1;
9455         /* FALLTHROUGH */
9456
9457     case POSIXL:
9458         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9459         if (! utf8_target) {
9460             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9461                                                                    *scan)))
9462             {
9463                 scan++;
9464             }
9465         } else {
9466             while (hardcount < max && scan < loceol
9467                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9468                                                                   (U8 *) scan,
9469                                                                   (U8 *) loceol)))
9470             {
9471                 scan += UTF8SKIP(scan);
9472                 hardcount++;
9473             }
9474         }
9475         break;
9476
9477     case POSIXD:
9478         if (utf8_target) {
9479             goto utf8_posix;
9480         }
9481         /* FALLTHROUGH */
9482
9483     case POSIXA:
9484         if (utf8_target && loceol - scan > max) {
9485
9486             /* We didn't adjust <loceol> at the beginning of this routine
9487              * because is UTF-8, but it is actually ok to do so, since here, to
9488              * match, 1 char == 1 byte. */
9489             loceol = scan + max;
9490         }
9491         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9492             scan++;
9493         }
9494         break;
9495
9496     case NPOSIXD:
9497         if (utf8_target) {
9498             to_complement = 1;
9499             goto utf8_posix;
9500         }
9501         /* FALLTHROUGH */
9502
9503     case NPOSIXA:
9504         if (! utf8_target) {
9505             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9506                 scan++;
9507             }
9508         }
9509         else {
9510
9511             /* The complement of something that matches only ASCII matches all
9512              * non-ASCII, plus everything in ASCII that isn't in the class. */
9513             while (hardcount < max && scan < loceol
9514                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9515                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9516             {
9517                 scan += UTF8SKIP(scan);
9518                 hardcount++;
9519             }
9520         }
9521         break;
9522
9523     case NPOSIXU:
9524         to_complement = 1;
9525         /* FALLTHROUGH */
9526
9527     case POSIXU:
9528         if (! utf8_target) {
9529             while (scan < loceol && to_complement
9530                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9531             {
9532                 scan++;
9533             }
9534         }
9535         else {
9536           utf8_posix:
9537             classnum = (_char_class_number) FLAGS(p);
9538             switch (classnum) {
9539                 default:
9540                     while (   hardcount < max && scan < loceol
9541                            && to_complement ^ cBOOL(_invlist_contains_cp(
9542                                               PL_XPosix_ptrs[classnum],
9543                                               utf8_to_uvchr_buf((U8 *) scan,
9544                                                                 (U8 *) loceol,
9545                                                                 NULL))))
9546                     {
9547                         scan += UTF8SKIP(scan);
9548                         hardcount++;
9549                     }
9550                     break;
9551
9552                     /* For the classes below, the knowledge of how to handle
9553                      * every code point is compiled in to Perl via a macro.
9554                      * This code is written for making the loops as tight as
9555                      * possible.  It could be refactored to save space instead.
9556                      * */
9557
9558                 case _CC_ENUM_SPACE:
9559                     while (hardcount < max
9560                            && scan < loceol
9561                            && (to_complement
9562                                ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9563                     {
9564                         scan += UTF8SKIP(scan);
9565                         hardcount++;
9566                     }
9567                     break;
9568                 case _CC_ENUM_BLANK:
9569                     while (hardcount < max
9570                            && scan < loceol
9571                            && (to_complement
9572                                 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9573                     {
9574                         scan += UTF8SKIP(scan);
9575                         hardcount++;
9576                     }
9577                     break;
9578                 case _CC_ENUM_XDIGIT:
9579                     while (hardcount < max
9580                            && scan < loceol
9581                            && (to_complement
9582                                ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9583                     {
9584                         scan += UTF8SKIP(scan);
9585                         hardcount++;
9586                     }
9587                     break;
9588                 case _CC_ENUM_VERTSPACE:
9589                     while (hardcount < max
9590                            && scan < loceol
9591                            && (to_complement
9592                                ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9593                     {
9594                         scan += UTF8SKIP(scan);
9595                         hardcount++;
9596                     }
9597                     break;
9598                 case _CC_ENUM_CNTRL:
9599                     while (hardcount < max
9600                            && scan < loceol
9601                            && (to_complement
9602                                ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9603                     {
9604                         scan += UTF8SKIP(scan);
9605                         hardcount++;
9606                     }
9607                     break;
9608             }
9609         }
9610         break;
9611
9612     case LNBREAK:
9613         if (utf8_target) {
9614             while (hardcount < max && scan < loceol &&
9615                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9616                 scan += c;
9617                 hardcount++;
9618             }
9619         } else {
9620             /* LNBREAK can match one or two latin chars, which is ok, but we
9621              * have to use hardcount in this situation, and throw away the
9622              * adjustment to <loceol> done before the switch statement */
9623             loceol = reginfo->strend;
9624             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9625                 scan+=c;
9626                 hardcount++;
9627             }
9628         }
9629         break;
9630
9631     case BOUNDL:
9632     case NBOUNDL:
9633         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9634         /* FALLTHROUGH */
9635     case BOUND:
9636     case BOUNDA:
9637     case BOUNDU:
9638     case EOS:
9639     case GPOS:
9640     case KEEPS:
9641     case NBOUND:
9642     case NBOUNDA:
9643     case NBOUNDU:
9644     case OPFAIL:
9645     case SBOL:
9646     case SEOL:
9647         /* These are all 0 width, so match right here or not at all. */
9648         break;
9649
9650     default:
9651         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9652         NOT_REACHED; /* NOTREACHED */
9653
9654     }
9655
9656     if (hardcount)
9657         c = hardcount;
9658     else
9659         c = scan - *startposp;
9660     *startposp = scan;
9661
9662     DEBUG_r({
9663         GET_RE_DEBUG_FLAGS_DECL;
9664         DEBUG_EXECUTE_r({
9665             SV * const prop = sv_newmortal();
9666             regprop(prog, prop, p, reginfo, NULL);
9667             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9668                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9669         });
9670     });
9671
9672     return(c);
9673 }
9674
9675
9676 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9677 /*
9678 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9679 create a copy so that changes the caller makes won't change the shared one.
9680 If <altsvp> is non-null, will return NULL in it, for back-compat.
9681  */
9682 SV *
9683 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9684 {
9685     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9686
9687     if (altsvp) {
9688         *altsvp = NULL;
9689     }
9690
9691     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9692 }
9693
9694 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9695
9696 /*
9697  - reginclass - determine if a character falls into a character class
9698
9699   n is the ANYOF-type regnode
9700   p is the target string
9701   p_end points to one byte beyond the end of the target string
9702   utf8_target tells whether p is in UTF-8.
9703
9704   Returns true if matched; false otherwise.
9705
9706   Note that this can be a synthetic start class, a combination of various
9707   nodes, so things you think might be mutually exclusive, such as locale,
9708   aren't.  It can match both locale and non-locale
9709
9710  */
9711
9712 STATIC bool
9713 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9714 {
9715     dVAR;
9716     const char flags = ANYOF_FLAGS(n);
9717     bool match = FALSE;
9718     UV c = *p;
9719
9720     PERL_ARGS_ASSERT_REGINCLASS;
9721
9722     /* If c is not already the code point, get it.  Note that
9723      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9724     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9725         STRLEN c_len = 0;
9726         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9727         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9728         if (c_len == (STRLEN)-1) {
9729             _force_out_malformed_utf8_message(p, p_end,
9730                                               utf8n_flags,
9731                                               1 /* 1 means die */ );
9732             NOT_REACHED; /* NOTREACHED */
9733         }
9734         if (     c > 255
9735             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9736             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9737         {
9738             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9739         }
9740     }
9741
9742     /* If this character is potentially in the bitmap, check it */
9743     if (c < NUM_ANYOF_CODE_POINTS) {
9744         if (ANYOF_BITMAP_TEST(n, c))
9745             match = TRUE;
9746         else if ((flags
9747                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9748                   && OP(n) == ANYOFD
9749                   && ! utf8_target
9750                   && ! isASCII(c))
9751         {
9752             match = TRUE;
9753         }
9754         else if (flags & ANYOF_LOCALE_FLAGS) {
9755             if ((flags & ANYOFL_FOLD)
9756                 && c < 256
9757                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9758             {
9759                 match = TRUE;
9760             }
9761             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9762                      && c < 256
9763             ) {
9764
9765                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9766                  * if the class includes the Posix character class given by
9767                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9768                  * complemented Posix class given by int(bit/2).  So we loop
9769                  * through the bits, each time changing whether we complement
9770                  * the result or not.  Suppose for the sake of illustration
9771                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9772                  * is set, it means there is a match for this ANYOF node if the
9773                  * character is in the class given by the expression (0 / 2 = 0
9774                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9775                  * and since 'to_complement' is 0, the result will stay TRUE,
9776                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9777                  * bit 1 is 1.  That means there is a match if the character
9778                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9779                  * but will on bit 1.  On the second iteration 'to_complement'
9780                  * will be 1, so the exclusive or will reverse things, so we
9781                  * are testing for \W.  On the third iteration, 'to_complement'
9782                  * will be 0, and we would be testing for \s; the fourth
9783                  * iteration would test for \S, etc.
9784                  *
9785                  * Note that this code assumes that all the classes are closed
9786                  * under folding.  For example, if a character matches \w, then
9787                  * its fold does too; and vice versa.  This should be true for
9788                  * any well-behaved locale for all the currently defined Posix
9789                  * classes, except for :lower: and :upper:, which are handled
9790                  * by the pseudo-class :cased: which matches if either of the
9791                  * other two does.  To get rid of this assumption, an outer
9792                  * loop could be used below to iterate over both the source
9793                  * character, and its fold (if different) */
9794
9795                 int count = 0;
9796                 int to_complement = 0;
9797
9798                 while (count < ANYOF_MAX) {
9799                     if (ANYOF_POSIXL_TEST(n, count)
9800                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9801                     {
9802                         match = TRUE;
9803                         break;
9804                     }
9805                     count++;
9806                     to_complement ^= 1;
9807                 }
9808             }
9809         }
9810     }
9811
9812
9813     /* If the bitmap didn't (or couldn't) match, and something outside the
9814      * bitmap could match, try that. */
9815     if (!match) {
9816         if (c >= NUM_ANYOF_CODE_POINTS
9817             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9818         {
9819             match = TRUE;       /* Everything above the bitmap matches */
9820         }
9821             /* Here doesn't match everything above the bitmap.  If there is
9822              * some information available beyond the bitmap, we may find a
9823              * match in it.  If so, this is most likely because the code point
9824              * is outside the bitmap range.  But rarely, it could be because of
9825              * some other reason.  If so, various flags are set to indicate
9826              * this possibility.  On ANYOFD nodes, there may be matches that
9827              * happen only when the target string is UTF-8; or for other node
9828              * types, because runtime lookup is needed, regardless of the
9829              * UTF-8ness of the target string.  Finally, under /il, there may
9830              * be some matches only possible if the locale is a UTF-8 one. */
9831         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9832                  && (   c >= NUM_ANYOF_CODE_POINTS
9833                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9834                          && (   UNLIKELY(OP(n) != ANYOFD)
9835                              || (utf8_target && ! isASCII_uni(c)
9836 #                               if NUM_ANYOF_CODE_POINTS > 256
9837                                                                  && c < 256
9838 #                               endif
9839                                 )))
9840                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9841                          && IN_UTF8_CTYPE_LOCALE)))
9842         {
9843             SV* only_utf8_locale = NULL;
9844             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9845                                                        &only_utf8_locale, NULL);
9846             if (sw) {
9847                 U8 utf8_buffer[2];
9848                 U8 * utf8_p;
9849                 if (utf8_target) {
9850                     utf8_p = (U8 *) p;
9851                 } else { /* Convert to utf8 */
9852                     utf8_p = utf8_buffer;
9853                     append_utf8_from_native_byte(*p, &utf8_p);
9854                     utf8_p = utf8_buffer;
9855                 }
9856
9857                 if (swash_fetch(sw, utf8_p, TRUE)) {
9858                     match = TRUE;
9859                 }
9860             }
9861             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9862                 match = _invlist_contains_cp(only_utf8_locale, c);
9863             }
9864         }
9865
9866         if (UNICODE_IS_SUPER(c)
9867             && (flags
9868                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9869             && OP(n) != ANYOFD
9870             && ckWARN_d(WARN_NON_UNICODE))
9871         {
9872             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9873                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9874         }
9875     }
9876
9877 #if ANYOF_INVERT != 1
9878     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9879      * use it, don't */
9880 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9881 #endif
9882
9883     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9884     return (flags & ANYOF_INVERT) ^ match;
9885 }
9886
9887 STATIC U8 *
9888 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9889 {
9890     /* return the position 'off' UTF-8 characters away from 's', forward if
9891      * 'off' >= 0, backwards if negative.  But don't go outside of position
9892      * 'lim', which better be < s  if off < 0 */
9893
9894     PERL_ARGS_ASSERT_REGHOP3;
9895
9896     if (off >= 0) {
9897         while (off-- && s < lim) {
9898             /* XXX could check well-formedness here */
9899             U8 *new_s = s + UTF8SKIP(s);
9900             if (new_s > lim) /* lim may be in the middle of a long character */
9901                 return s;
9902             s = new_s;
9903         }
9904     }
9905     else {
9906         while (off++ && s > lim) {
9907             s--;
9908             if (UTF8_IS_CONTINUED(*s)) {
9909                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9910                     s--;
9911                 if (! UTF8_IS_START(*s)) {
9912                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9913                 }
9914             }
9915             /* XXX could check well-formedness here */
9916         }
9917     }
9918     return s;
9919 }
9920
9921 STATIC U8 *
9922 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9923 {
9924     PERL_ARGS_ASSERT_REGHOP4;
9925
9926     if (off >= 0) {
9927         while (off-- && s < rlim) {
9928             /* XXX could check well-formedness here */
9929             s += UTF8SKIP(s);
9930         }
9931     }
9932     else {
9933         while (off++ && s > llim) {
9934             s--;
9935             if (UTF8_IS_CONTINUED(*s)) {
9936                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9937                     s--;
9938                 if (! UTF8_IS_START(*s)) {
9939                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9940                 }
9941             }
9942             /* XXX could check well-formedness here */
9943         }
9944     }
9945     return s;
9946 }
9947
9948 /* like reghop3, but returns NULL on overrun, rather than returning last
9949  * char pos */
9950
9951 STATIC U8 *
9952 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9953 {
9954     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9955
9956     if (off >= 0) {
9957         while (off-- && s < lim) {
9958             /* XXX could check well-formedness here */
9959             s += UTF8SKIP(s);
9960         }
9961         if (off >= 0)
9962             return NULL;
9963     }
9964     else {
9965         while (off++ && s > lim) {
9966             s--;
9967             if (UTF8_IS_CONTINUED(*s)) {
9968                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9969                     s--;
9970                 if (! UTF8_IS_START(*s)) {
9971                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9972                 }
9973             }
9974             /* XXX could check well-formedness here */
9975         }
9976         if (off <= 0)
9977             return NULL;
9978     }
9979     return s;
9980 }
9981
9982
9983 /* when executing a regex that may have (?{}), extra stuff needs setting
9984    up that will be visible to the called code, even before the current
9985    match has finished. In particular:
9986
9987    * $_ is localised to the SV currently being matched;
9988    * pos($_) is created if necessary, ready to be updated on each call-out
9989      to code;
9990    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9991      isn't set until the current pattern is successfully finished), so that
9992      $1 etc of the match-so-far can be seen;
9993    * save the old values of subbeg etc of the current regex, and  set then
9994      to the current string (again, this is normally only done at the end
9995      of execution)
9996 */
9997
9998 static void
9999 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10000 {
10001     MAGIC *mg;
10002     regexp *const rex = ReANY(reginfo->prog);
10003     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10004
10005     eval_state->rex = rex;
10006
10007     if (reginfo->sv) {
10008         /* Make $_ available to executed code. */
10009         if (reginfo->sv != DEFSV) {
10010             SAVE_DEFSV;
10011             DEFSV_set(reginfo->sv);
10012         }
10013
10014         if (!(mg = mg_find_mglob(reginfo->sv))) {
10015             /* prepare for quick setting of pos */
10016             mg = sv_magicext_mglob(reginfo->sv);
10017             mg->mg_len = -1;
10018         }
10019         eval_state->pos_magic = mg;
10020         eval_state->pos       = mg->mg_len;
10021         eval_state->pos_flags = mg->mg_flags;
10022     }
10023     else
10024         eval_state->pos_magic = NULL;
10025
10026     if (!PL_reg_curpm) {
10027         /* PL_reg_curpm is a fake PMOP that we can attach the current
10028          * regex to and point PL_curpm at, so that $1 et al are visible
10029          * within a /(?{})/. It's just allocated once per interpreter the
10030          * first time its needed */
10031         Newxz(PL_reg_curpm, 1, PMOP);
10032 #ifdef USE_ITHREADS
10033         {
10034             SV* const repointer = &PL_sv_undef;
10035             /* this regexp is also owned by the new PL_reg_curpm, which
10036                will try to free it.  */
10037             av_push(PL_regex_padav, repointer);
10038             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10039             PL_regex_pad = AvARRAY(PL_regex_padav);
10040         }
10041 #endif
10042     }
10043     SET_reg_curpm(reginfo->prog);
10044     eval_state->curpm = PL_curpm;
10045     PL_curpm_under = PL_curpm;
10046     PL_curpm = PL_reg_curpm;
10047     if (RXp_MATCH_COPIED(rex)) {
10048         /*  Here is a serious problem: we cannot rewrite subbeg,
10049             since it may be needed if this match fails.  Thus
10050             $` inside (?{}) could fail... */
10051         eval_state->subbeg     = rex->subbeg;
10052         eval_state->sublen     = rex->sublen;
10053         eval_state->suboffset  = rex->suboffset;
10054         eval_state->subcoffset = rex->subcoffset;
10055 #ifdef PERL_ANY_COW
10056         eval_state->saved_copy = rex->saved_copy;
10057 #endif
10058         RXp_MATCH_COPIED_off(rex);
10059     }
10060     else
10061         eval_state->subbeg = NULL;
10062     rex->subbeg = (char *)reginfo->strbeg;
10063     rex->suboffset = 0;
10064     rex->subcoffset = 0;
10065     rex->sublen = reginfo->strend - reginfo->strbeg;
10066 }
10067
10068
10069 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10070
10071 static void
10072 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10073 {
10074     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10075     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10076     regmatch_slab *s;
10077
10078     Safefree(aux->poscache);
10079
10080     if (eval_state) {
10081
10082         /* undo the effects of S_setup_eval_state() */
10083
10084         if (eval_state->subbeg) {
10085             regexp * const rex = eval_state->rex;
10086             rex->subbeg     = eval_state->subbeg;
10087             rex->sublen     = eval_state->sublen;
10088             rex->suboffset  = eval_state->suboffset;
10089             rex->subcoffset = eval_state->subcoffset;
10090 #ifdef PERL_ANY_COW
10091             rex->saved_copy = eval_state->saved_copy;
10092 #endif
10093             RXp_MATCH_COPIED_on(rex);
10094         }
10095         if (eval_state->pos_magic)
10096         {
10097             eval_state->pos_magic->mg_len = eval_state->pos;
10098             eval_state->pos_magic->mg_flags =
10099                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10100                | (eval_state->pos_flags & MGf_BYTES);
10101         }
10102
10103         PL_curpm = eval_state->curpm;
10104     }
10105
10106     PL_regmatch_state = aux->old_regmatch_state;
10107     PL_regmatch_slab  = aux->old_regmatch_slab;
10108
10109     /* free all slabs above current one - this must be the last action
10110      * of this function, as aux and eval_state are allocated within
10111      * slabs and may be freed here */
10112
10113     s = PL_regmatch_slab->next;
10114     if (s) {
10115         PL_regmatch_slab->next = NULL;
10116         while (s) {
10117             regmatch_slab * const osl = s;
10118             s = s->next;
10119             Safefree(osl);
10120         }
10121     }
10122 }
10123
10124
10125 STATIC void
10126 S_to_utf8_substr(pTHX_ regexp *prog)
10127 {
10128     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10129      * on the converted value */
10130
10131     int i = 1;
10132
10133     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10134
10135     do {
10136         if (prog->substrs->data[i].substr
10137             && !prog->substrs->data[i].utf8_substr) {
10138             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10139             prog->substrs->data[i].utf8_substr = sv;
10140             sv_utf8_upgrade(sv);
10141             if (SvVALID(prog->substrs->data[i].substr)) {
10142                 if (SvTAIL(prog->substrs->data[i].substr)) {
10143                     /* Trim the trailing \n that fbm_compile added last
10144                        time.  */
10145                     SvCUR_set(sv, SvCUR(sv) - 1);
10146                     /* Whilst this makes the SV technically "invalid" (as its
10147                        buffer is no longer followed by "\0") when fbm_compile()
10148                        adds the "\n" back, a "\0" is restored.  */
10149                     fbm_compile(sv, FBMcf_TAIL);
10150                 } else
10151                     fbm_compile(sv, 0);
10152             }
10153             if (prog->substrs->data[i].substr == prog->check_substr)
10154                 prog->check_utf8 = sv;
10155         }
10156     } while (i--);
10157 }
10158
10159 STATIC bool
10160 S_to_byte_substr(pTHX_ regexp *prog)
10161 {
10162     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10163      * on the converted value; returns FALSE if can't be converted. */
10164
10165     int i = 1;
10166
10167     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10168
10169     do {
10170         if (prog->substrs->data[i].utf8_substr
10171             && !prog->substrs->data[i].substr) {
10172             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10173             if (! sv_utf8_downgrade(sv, TRUE)) {
10174                 return FALSE;
10175             }
10176             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10177                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10178                     /* Trim the trailing \n that fbm_compile added last
10179                         time.  */
10180                     SvCUR_set(sv, SvCUR(sv) - 1);
10181                     fbm_compile(sv, FBMcf_TAIL);
10182                 } else
10183                     fbm_compile(sv, 0);
10184             }
10185             prog->substrs->data[i].substr = sv;
10186             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10187                 prog->check_substr = sv;
10188         }
10189     } while (i--);
10190
10191     return TRUE;
10192 }
10193
10194 #ifndef PERL_IN_XSUB_RE
10195
10196 bool
10197 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10198 {
10199     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10200      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10201      * the larger string bounded by 'strbeg' and 'strend'.
10202      *
10203      * 'cp' needs to be assigned (if not a future version of the Unicode
10204      * Standard could make it something that combines with adjacent characters,
10205      * so code using it would then break), and there has to be a GCB break
10206      * before and after the character. */
10207
10208     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10209     const U8 * prev_cp_start;
10210
10211     PERL_ARGS_ASSERT__IS_GRAPHEME;
10212
10213     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10214         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10215     {
10216         /* These are considered graphemes */
10217         return TRUE;
10218     }
10219
10220     /* Otherwise, unassigned code points are forbidden */
10221     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10222                                     _invlist_search(PL_Assigned_invlist, cp))))
10223     {
10224         return FALSE;
10225     }
10226
10227     cp_gcb_val = getGCB_VAL_CP(cp);
10228
10229     /* Find the GCB value of the previous code point in the input */
10230     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10231     if (UNLIKELY(prev_cp_start == s)) {
10232         prev_cp_gcb_val = GCB_EDGE;
10233     }
10234     else {
10235         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10236     }
10237
10238     /* And check that is a grapheme boundary */
10239     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10240                 TRUE /* is UTF-8 encoded */ ))
10241     {
10242         return FALSE;
10243     }
10244
10245     /* Similarly verify there is a break between the current character and the
10246      * following one */
10247     s += UTF8SKIP(s);
10248     if (s >= strend) {
10249         next_cp_gcb_val = GCB_EDGE;
10250     }
10251     else {
10252         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10253     }
10254
10255     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10256 }
10257
10258 /*
10259 =head1 Unicode Support
10260
10261 =for apidoc isSCRIPT_RUN
10262
10263 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10264 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10265 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10266 two degenerate cases given below, this function returns TRUE iff all code
10267 points in it come from any combination of three "scripts" given by the Unicode
10268 "Script Extensions" property: Common, Inherited, and possibly one other.
10269 Additionally all decimal digits must come from the same consecutive sequence of
10270 10.
10271
10272 For example, if all the characters in the sequence are Greek, or Common, or
10273 Inherited, this function will return TRUE, provided any decimal digits in it
10274 are the ASCII digits "0".."9".  For scripts (unlike Greek) that have their own
10275 digits defined this will accept either digits from that set or from 0..9, but
10276 not a combination of the two.  Some scripts, such as Arabic, have more than one
10277 set of digits.  All digits must come from the same set for this function to
10278 return TRUE.
10279
10280 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10281 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10282 C<SCX_INVALID> if the function returns FALSE.
10283
10284 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10285 will be C<SCX_INVALID>.
10286
10287 If the sequence contains a single code point which is unassigned to a character
10288 in the version of Unicode being used, the function will return TRUE, and the
10289 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10290 in the input sequence will result in the function treating the input as not
10291 being a script run.
10292
10293 The returned script will be C<SCX_Inherited> iff all the code points in it are
10294 from the Inherited script.
10295
10296 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10297 it are from the Inherited or Common scripts.
10298
10299 =cut
10300
10301 */
10302
10303 bool
10304 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10305 {
10306     /* Basically, it looks at each character in the sequence to see if the
10307      * above conditions are met; if not it fails.  It uses an inversion map to
10308      * find the enum corresponding to the script of each character.  But this
10309      * is complicated by the fact that a few code points can be in any of
10310      * several scripts.  The data has been constructed so that there are
10311      * additional enum values (all negative) for these situations.  The
10312      * absolute value of those is an index into another table which contains
10313      * pointers to auxiliary tables for each such situation.  Each aux array
10314      * lists all the scripts for the given situation.  There is another,
10315      * parallel, table that gives the number of entries in each aux table.
10316      * These are all defined in charclass_invlists.h */
10317
10318     /* XXX Here are the additional things UTS 39 says could be done:
10319      *
10320      * Forbid sequences of the same nonspacing mark
10321      *
10322      * Check to see that all the characters are in the sets of exemplar
10323      * characters for at least one language in the Unicode Common Locale Data
10324      * Repository [CLDR]. */
10325
10326
10327     /* Things that match /\d/u */
10328     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10329     UV * decimals_array = invlist_array(decimals_invlist);
10330
10331     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10332      * not currently known) */
10333     UV zero_of_run = 0;
10334
10335     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10336     SCX_enum script_of_char = SCX_INVALID;
10337
10338     /* If the script remains not fully determined from iteration to iteration,
10339      * this is the current intersection of the possiblities.  */
10340     SCX_enum * intersection = NULL;
10341     PERL_UINT_FAST8_T intersection_len = 0;
10342
10343     bool retval = TRUE;
10344     SCX_enum * ret_script = NULL;
10345
10346     assert(send >= s);
10347
10348     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10349
10350     /* All code points in 0..255 are either Common or Latin, so must be a
10351      * script run.  We can return immediately unless we need to know which
10352      * script it is. */
10353     if (! utf8_target && LIKELY(send > s)) {
10354         if (ret_script == NULL) {
10355             return TRUE;
10356         }
10357
10358         /* If any character is Latin, the run is Latin */
10359         while (s < send) {
10360             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10361                 *ret_script = SCX_Latin;
10362                 return TRUE;
10363             }
10364         }
10365
10366         /* Here, all are Common */
10367         *ret_script = SCX_Common;
10368         return TRUE;
10369     }
10370
10371     /* Look at each character in the sequence */
10372     while (s < send) {
10373         /* If the current character being examined is a digit, this is the code
10374          * point of the zero for its sequence of 10 */
10375         UV zero_of_char;
10376
10377         UV cp;
10378
10379         /* The code allows all scripts to use the ASCII digits.  This is
10380          * because they are used in commerce even in scripts that have their
10381          * own set.  Hence any ASCII ones found are ok, unless and until a
10382          * digit from another set has already been encountered.  (The other
10383          * digit ranges in Common are not similarly blessed) */
10384         if (UNLIKELY(isDIGIT(*s))) {
10385             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10386                 retval = FALSE;
10387                 break;
10388             }
10389             if (zero_of_run) {
10390                 if (zero_of_run != '0') {
10391                     retval = FALSE;
10392                     break;
10393                 }
10394             }
10395             else {
10396                 zero_of_run = '0';
10397             }
10398             s++;
10399             continue;
10400         }
10401
10402         /* Here, isn't an ASCII digit.  Find the code point of the character */
10403         if (! UTF8_IS_INVARIANT(*s)) {
10404             Size_t len;
10405             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10406             s += len;
10407         }
10408         else {
10409             cp = *(s++);
10410         }
10411
10412         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10413          * digit in that script.  We can skip the rest of this code for this
10414          * character. */
10415         if (UNLIKELY(   zero_of_run
10416                      && cp >= zero_of_run
10417                      && cp - zero_of_run <= 9))
10418         {
10419             continue;
10420         }
10421
10422         /* Find the character's script.  The correct values are hard-coded here
10423          * for small-enough code points. */
10424         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10425                                unlikely to change */
10426             if (       cp > 255
10427                 || (   isALPHA_L1(cp)
10428                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10429             {
10430                 script_of_char = SCX_Latin;
10431             }
10432             else {
10433                 script_of_char = SCX_Common;
10434             }
10435         }
10436         else {
10437             script_of_char = _Perl_SCX_invmap[
10438                                        _invlist_search(PL_SCX_invlist, cp)];
10439         }
10440
10441         /* We arbitrarily accept a single unassigned character, but not in
10442          * combination with anything else, and not a run of them. */
10443         if (   UNLIKELY(script_of_run == SCX_Unknown)
10444             || UNLIKELY(   script_of_run != SCX_INVALID
10445                         && script_of_char == SCX_Unknown))
10446         {
10447             retval = FALSE;
10448             break;
10449         }
10450
10451         /* For the first character, or the run is inherited, the run's script
10452          * is set to the char's */
10453         if (   UNLIKELY(script_of_run == SCX_INVALID)
10454             || UNLIKELY(script_of_run == SCX_Inherited))
10455         {
10456             script_of_run = script_of_char;
10457         }
10458
10459         /* For the character's script to be Unknown, it must be the first
10460          * character in the sequence (for otherwise a test above would have
10461          * prevented us from reaching here), and we have set the run's script
10462          * to it.  Nothing further to be done for this character */
10463         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10464             continue;
10465         }
10466
10467         /* We accept 'inherited' script characters currently even at the
10468          * beginning.  (We know that no characters in Inherited are digits, or
10469          * we'd have to check for that) */
10470         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10471             continue;
10472         }
10473
10474         /* If the run so far is Common, and the new character isn't, change the
10475          * run's script to that of this character */
10476         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10477
10478             /* But Common contains several sets of digits.  Only the '0' set
10479              * can be part of another script. */
10480             if (zero_of_run && zero_of_run != '0') {
10481                 retval = FALSE;
10482                 break;
10483             }
10484
10485             script_of_run = script_of_char;
10486         }
10487
10488         /* Now we can see if the script of the character is the same as that of
10489          * the run */
10490         if (LIKELY(script_of_char == script_of_run)) {
10491             /* By far the most common case */
10492             goto scripts_match;
10493         }
10494
10495         /* Here, the script of the run isn't Common.  But characters in Common
10496          * match any script */
10497         if (script_of_char == SCX_Common) {
10498             goto scripts_match;
10499         }
10500
10501 #ifndef HAS_SCX_AUX_TABLES
10502
10503         /* Too early a Unicode version to have a code point belonging to more
10504          * than one script, so, if the scripts don't exactly match, fail */
10505         PERL_UNUSED_VAR(intersection_len);
10506         retval = FALSE;
10507         break;
10508
10509 #else
10510
10511         /* Here there is no exact match between the character's script and the
10512          * run's.  And we've handled the special cases of scripts Unknown,
10513          * Inherited, and Common.
10514          *
10515          * Negative script numbers signify that the value may be any of several
10516          * scripts, and we need to look at auxiliary information to make our
10517          * deterimination.  But if both are non-negative, we can fail now */
10518         if (LIKELY(script_of_char >= 0)) {
10519             const SCX_enum * search_in;
10520             PERL_UINT_FAST8_T search_in_len;
10521             PERL_UINT_FAST8_T i;
10522
10523             if (LIKELY(script_of_run >= 0)) {
10524                 retval = FALSE;
10525                 break;
10526             }
10527
10528             /* Use the previously constructed set of possible scripts, if any.
10529              * */
10530             if (intersection) {
10531                 search_in = intersection;
10532                 search_in_len = intersection_len;
10533             }
10534             else {
10535                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10536                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10537             }
10538
10539             for (i = 0; i < search_in_len; i++) {
10540                 if (search_in[i] == script_of_char) {
10541                     script_of_run = script_of_char;
10542                     goto scripts_match;
10543                 }
10544             }
10545
10546             retval = FALSE;
10547             break;
10548         }
10549         else if (LIKELY(script_of_run >= 0)) {
10550             /* script of character could be one of several, but run is a single
10551              * script */
10552             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10553             const PERL_UINT_FAST8_T search_in_len
10554                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10555             PERL_UINT_FAST8_T i;
10556
10557             for (i = 0; i < search_in_len; i++) {
10558                 if (search_in[i] == script_of_run) {
10559                     script_of_char = script_of_run;
10560                     goto scripts_match;
10561                 }
10562             }
10563
10564             retval = FALSE;
10565             break;
10566         }
10567         else {
10568             /* Both run and char could be in one of several scripts.  If the
10569              * intersection is empty, then this character isn't in this script
10570              * run.  Otherwise, we need to calculate the intersection to use
10571              * for future iterations of the loop, unless we are already at the
10572              * final character */
10573             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10574             const PERL_UINT_FAST8_T char_len
10575                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10576             const SCX_enum * search_run;
10577             PERL_UINT_FAST8_T run_len;
10578
10579             SCX_enum * new_overlap = NULL;
10580             PERL_UINT_FAST8_T i, j;
10581
10582             if (intersection) {
10583                 search_run = intersection;
10584                 run_len = intersection_len;
10585             }
10586             else {
10587                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10588                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10589             }
10590
10591             intersection_len = 0;
10592
10593             for (i = 0; i < run_len; i++) {
10594                 for (j = 0; j < char_len; j++) {
10595                     if (search_run[i] == search_char[j]) {
10596
10597                         /* Here, the script at i,j matches.  That means this
10598                          * character is in the run.  But continue on to find
10599                          * the complete intersection, for the next loop
10600                          * iteration, and for the digit check after it.
10601                          *
10602                          * On the first found common script, we malloc space
10603                          * for the intersection list for the worst case of the
10604                          * intersection, which is the minimum of the number of
10605                          * scripts remaining in each set. */
10606                         if (intersection_len == 0) {
10607                             Newx(new_overlap,
10608                                  MIN(run_len - i, char_len - j),
10609                                  SCX_enum);
10610                         }
10611                         new_overlap[intersection_len++] = search_run[i];
10612                     }
10613                 }
10614             }
10615
10616             /* Here we've looked through everything.  If they have no scripts
10617              * in common, not a run */
10618             if (intersection_len == 0) {
10619                 retval = FALSE;
10620                 break;
10621             }
10622
10623             /* If there is only a single script in common, set to that.
10624              * Otherwise, use the intersection going forward */
10625             Safefree(intersection);
10626             intersection = NULL;
10627             if (intersection_len == 1) {
10628                 script_of_run = script_of_char = new_overlap[0];
10629                 Safefree(new_overlap);
10630                 new_overlap = NULL;
10631             }
10632             else {
10633                 intersection = new_overlap;
10634             }
10635         }
10636
10637 #endif
10638
10639   scripts_match:
10640
10641         /* Here, the script of the character is compatible with that of the
10642          * run.  That means that in most cases, it continues the script run.
10643          * Either it and the run match exactly, or one or both can be in any of
10644          * several scripts, and the intersection is not empty.  However, if the
10645          * character is a decimal digit, it could still mean failure if it is
10646          * from the wrong sequence of 10.  So, we need to look at if it's a
10647          * digit.  We've already handled the 10 decimal digits, and the next
10648          * lowest one is this one: */
10649         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10650             continue;   /* Not a digit; this character is part of the run */
10651         }
10652
10653         /* If we have a definitive '0' for the script of this character, we
10654          * know that for this to be a digit, it must be in the range of +0..+9
10655          * of that zero. */
10656         if (   script_of_char >= 0
10657             && (zero_of_char = script_zeros[script_of_char]))
10658         {
10659             if (   cp < zero_of_char
10660                 || cp > zero_of_char + 9)
10661             {
10662                 continue;   /* Not a digit; this character is part of the run
10663                              */
10664             }
10665
10666         }
10667         else {  /* Need to look up if this character is a digit or not */
10668             SSize_t index_of_zero_of_char;
10669             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10670             if (     UNLIKELY(index_of_zero_of_char < 0)
10671                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10672             {
10673                 continue;   /* Not a digit; this character is part of the run.
10674                              */
10675             }
10676
10677             zero_of_char = decimals_array[index_of_zero_of_char];
10678         }
10679
10680         /* Here, the character is a decimal digit, and the zero of its sequence
10681          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10682          * they better be the same. */
10683         if (zero_of_run) {
10684             if (zero_of_run != zero_of_char) {
10685                 retval = FALSE;
10686                 break;
10687             }
10688         }
10689         else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
10690
10691             /* Here, the script run isn't Common, but the current digit is in
10692              * Common, and isn't '0'-'9' (those were handled earlier).   Only
10693              * '0'-'9' are acceptable in non-Common scripts. */
10694             retval = FALSE;
10695             break;
10696         }
10697         else {  /* Otherwise we now have a zero for this run */
10698             zero_of_run = zero_of_char;
10699         }
10700     } /* end of looping through CLOSESR text */
10701
10702     Safefree(intersection);
10703
10704     if (ret_script != NULL) {
10705         if (retval) {
10706             *ret_script = script_of_run;
10707         }
10708         else {
10709             *ret_script = SCX_INVALID;
10710         }
10711     }
10712
10713     return retval;
10714 }
10715
10716 #endif /* ifndef PERL_IN_XSUB_RE */
10717
10718 /*
10719  * ex: set ts=8 sts=4 sw=4 et:
10720  */