regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 101     goto target;                                                         \
 102 } STMT_END
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115
 116 #define HOPc(pos,off) \
 117         (char *)(reginfo->is_utf8_target \
 118             ? reghop3((U8*)pos, off, \
 119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120             : (U8*)(pos + off))
 121
 122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 123 #define HOPBACK3(pos, off, lim) \
 124         (reginfo->is_utf8_target                          \
 125             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 126             : (pos - off >= lim)                                 \
 127                 ? (U8*)pos - off                                 \
 128                 : NULL)
 129
 130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 131
 132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 134
 135 /* lim must be +ve. Returns NULL on overshoot */
 136 #define HOPMAYBE3(pos,off,lim) \
 137         (reginfo->is_utf8_target                        \
 138             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 139             : ((U8*)pos + off <= lim)                   \
 140                 ? (U8*)pos + off                        \
 141                 : NULL)
 142
 143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 144  * off must be >=0; args should be vars rather than expressions */
 145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 156 #define NEXTCHR_IS_EOS (nextchr < 0)
 157
 158 #define SET_nextchr \
 159     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 160
 161 #define SET_locinput(p) \
 162     locinput = (p);  \
 163     SET_nextchr
 164
 165 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 166 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 167
 168 /* for use after a quantifier and before an EXACT-like node -- japhy */
 169 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 170  *
 171  * NOTE that *nothing* that affects backtracking should be in here, specifically
 172  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 173  * node that is in between two EXACT like nodes when ascertaining what the required
 174  * "follow" character is. This should probably be moved to regex compile time
 175  * although it may be done at run time beause of the REF possibility - more
 176  * investigation required. -- demerphq
 177 */
 178 #define JUMPABLE(rn) (                                                             \
 179     OP(rn) == OPEN ||                                                              \
 180     (OP(rn) == CLOSE &&                                                            \
 181      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 182     OP(rn) == EVAL ||                                                              \
 183     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 184     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 185     OP(rn) == KEEPS ||                                                             \
 186     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 187 )
 188 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 189
 190 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 191
 192 /*
 193   Search for mandatory following text node; for lookahead, the text must
 194   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 195 */
 196 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 197     while (JUMPABLE(rn)) { \
 198         const OPCODE type = OP(rn); \
 199         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 200             rn = NEXTOPER(NEXTOPER(rn)); \
 201         else if (type == PLUS) \
 202             rn = NEXTOPER(rn); \
 203         else if (type == IFMATCH) \
 204             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 205         else rn += NEXT_OFF(rn); \
 206     } \
 207 } STMT_END
 208
 209 #define SLAB_FIRST(s) (&(s)->states[0])
 210 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 211
 212 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 213 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 214 static regmatch_state * S_push_slab(pTHX);
 215
 216 #define REGCP_PAREN_ELEMS 3
 217 #define REGCP_OTHER_ELEMS 3
 218 #define REGCP_FRAME_ELEMS 1
 219 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 220  * are needed for the regexp context stack bookkeeping. */
 221
 222 STATIC CHECKPOINT
 223 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 224 {
 225     const int retval = PL_savestack_ix;
 226     const int paren_elems_to_push =
 227                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 228     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 229     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 230     I32 p;
 231     GET_RE_DEBUG_FLAGS_DECL;
 232
 233     PERL_ARGS_ASSERT_REGCPPUSH;
 234
 235     if (paren_elems_to_push < 0)
 236         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 237                    (int)paren_elems_to_push, (int)maxopenparen,
 238                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 239
 240     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 241         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 242                    " out of range (%lu-%ld)",
 243                    total_elems,
 244                    (unsigned long)maxopenparen,
 245                    (long)parenfloor);
 246
 247     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 248
 249     DEBUG_BUFFERS_r(
 250         if ((int)maxopenparen > (int)parenfloor)
 251             Perl_re_exec_indentf( aTHX_
 252                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 253                 depth,
 254                 PTR2UV(rex),
 255                 PTR2UV(rex->offs)
 256             );
 257     );
 258     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 259 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 260         SSPUSHIV(rex->offs[p].end);
 261         SSPUSHIV(rex->offs[p].start);
 262         SSPUSHINT(rex->offs[p].start_tmp);
 263         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 264             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 265             depth,
 266             (UV)p,
 267             (IV)rex->offs[p].start,
 268             (IV)rex->offs[p].start_tmp,
 269             (IV)rex->offs[p].end
 270         ));
 271     }
 272 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 273     SSPUSHINT(maxopenparen);
 274     SSPUSHINT(rex->lastparen);
 275     SSPUSHINT(rex->lastcloseparen);
 276     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 277
 278     return retval;
 279 }
 280
 281 /* These are needed since we do not localize EVAL nodes: */
 282 #define REGCP_SET(cp)                                           \
 283     DEBUG_STATE_r(                                              \
 284         Perl_re_exec_indentf( aTHX_                             \
 285             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 286             depth, (IV)PL_savestack_ix                          \
 287         )                                                       \
 288     );                                                          \
 289     cp = PL_savestack_ix
 290
 291 #define REGCP_UNWIND(cp)                                        \
 292     DEBUG_STATE_r(                                              \
 293         if (cp != PL_savestack_ix)                              \
 294             Perl_re_exec_indentf( aTHX_                         \
 295                 "Clearing an EVAL scope, savestack=%"           \
 296                 IVdf "..%" IVdf "\n",                           \
 297                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 298             )                                                   \
 299     );                                                          \
 300     regcpblow(cp)
 301
 302 /* set the start and end positions of capture ix */
 303 #define CLOSE_CAPTURE(ix, s, e)                                            \
 304     rex->offs[ix].start = s;                                               \
 305     rex->offs[ix].end = e;                                                 \
 306     if (ix > rex->lastparen)                                               \
 307         rex->lastparen = ix;                                               \
 308     rex->lastcloseparen = ix;                                              \
 309     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
 310         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
 311         depth,                                                             \
 312         PTR2UV(rex),                                                       \
 313         PTR2UV(rex->offs),                                                 \
 314         (UV)ix,                                                            \
 315         (IV)rex->offs[ix].start,                                           \
 316         (IV)rex->offs[ix].end,                                             \
 317         (UV)rex->lastparen                                                 \
 318     ))
 319
 320 #define UNWIND_PAREN(lp, lcp)               \
 321     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
 322         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
 323         depth,                              \
 324         PTR2UV(rex),                        \
 325         PTR2UV(rex->offs),                  \
 326         (UV)(lp),                           \
 327         (UV)(rex->lastparen),               \
 328         (UV)(lcp)                           \
 329     ));                                     \
 330     for (n = rex->lastparen; n > lp; n--)   \
 331         rex->offs[n].end = -1;              \
 332     rex->lastparen = n;                     \
 333     rex->lastcloseparen = lcp;
 334
 335
 336 STATIC void
 337 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 338 {
 339     UV i;
 340     U32 paren;
 341     GET_RE_DEBUG_FLAGS_DECL;
 342
 343     PERL_ARGS_ASSERT_REGCPPOP;
 344
 345     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 346     i = SSPOPUV;
 347     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 348     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 349     rex->lastcloseparen = SSPOPINT;
 350     rex->lastparen = SSPOPINT;
 351     *maxopenparen_p = SSPOPINT;
 352
 353     i -= REGCP_OTHER_ELEMS;
 354     /* Now restore the parentheses context. */
 355     DEBUG_BUFFERS_r(
 356         if (i || rex->lastparen + 1 <= rex->nparens)
 357             Perl_re_exec_indentf( aTHX_
 358                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 359                 depth,
 360                 PTR2UV(rex),
 361                 PTR2UV(rex->offs)
 362             );
 363     );
 364     paren = *maxopenparen_p;
 365     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 366         SSize_t tmps;
 367         rex->offs[paren].start_tmp = SSPOPINT;
 368         rex->offs[paren].start = SSPOPIV;
 369         tmps = SSPOPIV;
 370         if (paren <= rex->lastparen)
 371             rex->offs[paren].end = tmps;
 372         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 373             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 374             depth,
 375             (UV)paren,
 376             (IV)rex->offs[paren].start,
 377             (IV)rex->offs[paren].start_tmp,
 378             (IV)rex->offs[paren].end,
 379             (paren > rex->lastparen ? "(skipped)" : ""));
 380         );
 381         paren--;
 382     }
 383 #if 1
 384     /* It would seem that the similar code in regtry()
 385      * already takes care of this, and in fact it is in
 386      * a better location to since this code can #if 0-ed out
 387      * but the code in regtry() is needed or otherwise tests
 388      * requiring null fields (pat.t#187 and split.t#{13,14}
 389      * (as of patchlevel 7877)  will fail.  Then again,
 390      * this code seems to be necessary or otherwise
 391      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 392      * --jhi updated by dapm */
 393     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 394         if (i > *maxopenparen_p)
 395             rex->offs[i].start = -1;
 396         rex->offs[i].end = -1;
 397         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 398             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 399             depth,
 400             (UV)i,
 401             (i > *maxopenparen_p) ? "-1" : "  "
 402         ));
 403     }
 404 #endif
 405 }
 406
 407 /* restore the parens and associated vars at savestack position ix,
 408  * but without popping the stack */
 409
 410 STATIC void
 411 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 412 {
 413     I32 tmpix = PL_savestack_ix;
 414     PERL_ARGS_ASSERT_REGCP_RESTORE;
 415
 416     PL_savestack_ix = ix;
 417     regcppop(rex, maxopenparen_p);
 418     PL_savestack_ix = tmpix;
 419 }
 420
 421 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 422
 423 #ifndef PERL_IN_XSUB_RE
 424
 425 bool
 426 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 427 {
 428     /* Returns a boolean as to whether or not 'character' is a member of the
 429      * Posix character class given by 'classnum' that should be equivalent to a
 430      * value in the typedef '_char_class_number'.
 431      *
 432      * Ideally this could be replaced by a just an array of function pointers
 433      * to the C library functions that implement the macros this calls.
 434      * However, to compile, the precise function signatures are required, and
 435      * these may vary from platform to to platform.  To avoid having to figure
 436      * out what those all are on each platform, I (khw) am using this method,
 437      * which adds an extra layer of function call overhead (unless the C
 438      * optimizer strips it away).  But we don't particularly care about
 439      * performance with locales anyway. */
 440
 441     switch ((_char_class_number) classnum) {
 442         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 443         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 444         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 445         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 446         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 447                                         || isUPPER_LC(character);
 448         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 449         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 450         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 451         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 452         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 453         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 454         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 455         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 456         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 457         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 458         default:    /* VERTSPACE should never occur in locales */
 459             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 460     }
 461
 462     NOT_REACHED; /* NOTREACHED */
 463     return FALSE;
 464 }
 465
 466 #endif
 467
 468 STATIC bool
 469 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 470 {
 471     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 472      * 'character' is a member of the Posix character class given by 'classnum'
 473      * that should be equivalent to a value in the typedef
 474      * '_char_class_number'.
 475      *
 476      * This just calls isFOO_lc on the code point for the character if it is in
 477      * the range 0-255.  Outside that range, all characters use Unicode
 478      * rules, ignoring any locale.  So use the Unicode function if this class
 479      * requires a swash, and use the Unicode macro otherwise. */
 480
 481     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 482
 483     if (UTF8_IS_INVARIANT(*character)) {
 484         return isFOO_lc(classnum, *character);
 485     }
 486     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 487         return isFOO_lc(classnum,
 488                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 489     }
 490
 491     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 492
 493     switch ((_char_class_number) classnum) {
 494         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 495         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 496         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 497         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 498         default:
 499             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 500                                         utf8_to_uvchr_buf(character, e, NULL));
 501     }
 502
 503     return FALSE; /* Things like CNTRL are always below 256 */
 504 }
 505
 506 STATIC char *
 507 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 508 {
 509     /* Returns the position of the first ASCII byte in the sequence between 's'
 510      * and 'send-1' inclusive; returns 'send' if none found */
 511
 512     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 513
 514 #ifndef EBCDIC
 515
 516     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 517
 518                             /* This term is wordsize if subword; 0 if not */
 519                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 520
 521                             /* 'offset' */
 522                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 523     {
 524
 525         /* Process per-byte until reach word boundary.  XXX This loop could be
 526          * eliminated if we knew that this platform had fast unaligned reads */
 527         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 528             if (isASCII(*s)) {
 529                 return s;
 530             }
 531             s++;    /* khw didn't bother creating a separate loop for
 532                        utf8_target */
 533         }
 534
 535         /* Here, we know we have at least one full word to process.  Process
 536          * per-word as long as we have at least a full word left */
 537         do {
 538             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 539             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 540
 541 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 542      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 543
 544                 s += _variant_byte_number(complemented);
 545                 return s;
 546
 547 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 548            checks. */
 549
 550                 break;
 551 #  endif
 552             }
 553
 554             s += PERL_WORDSIZE;
 555
 556         } while (s + PERL_WORDSIZE <= send);
 557     }
 558
 559 #endif
 560
 561     /* Process per-character */
 562     if (utf8_target) {
 563         while (s < send) {
 564             if (isASCII(*s)) {
 565                 return s;
 566             }
 567             s += UTF8SKIP(s);
 568         }
 569     }
 570     else {
 571         while (s < send) {
 572             if (isASCII(*s)) {
 573                 return s;
 574             }
 575             s++;
 576         }
 577     }
 578
 579     return s;
 580 }
 581
 582 STATIC char *
 583 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 584 {
 585     /* Returns the position of the first non-ASCII byte in the sequence between
 586      * 's' and 'send-1' inclusive; returns 'send' if none found */
 587
 588 #ifdef EBCDIC
 589
 590     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 591
 592     if (utf8_target) {
 593         while (s < send) {
 594             if ( ! isASCII(*s)) {
 595                 return s;
 596             }
 597             s += UTF8SKIP(s);
 598         }
 599     }
 600     else {
 601         while (s < send) {
 602             if ( ! isASCII(*s)) {
 603                 return s;
 604             }
 605             s++;
 606         }
 607     }
 608
 609     return s;
 610
 611 #else
 612
 613     const U8 * next_non_ascii = NULL;
 614
 615     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 616     PERL_UNUSED_ARG(utf8_target);
 617
 618     /* On ASCII platforms invariants and ASCII are identical, so if the string
 619      * is entirely invariants, there is no non-ASCII character */
 620     return (is_utf8_invariant_string_loc((U8 *) s,
 621                                          (STRLEN) (send - s),
 622                                          &next_non_ascii))
 623             ? (char *) send
 624             : (char *) next_non_ascii;
 625
 626 #endif
 627
 628 }
 629
 630 STATIC U8 *
 631 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 632 {
 633     /* Returns the position of the first byte in the sequence between 's' and
 634      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 635      * */
 636
 637     PERL_ARGS_ASSERT_FIND_SPAN_END;
 638
 639     assert(send >= s);
 640
 641     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 642                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 643                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 644     {
 645         PERL_UINTMAX_T span_word;
 646
 647         /* Process per-byte until reach word boundary.  XXX This loop could be
 648          * eliminated if we knew that this platform had fast unaligned reads */
 649         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 650             if (*s != span_byte) {
 651                 return s;
 652             }
 653             s++;
 654         }
 655
 656         /* Create a word filled with the bytes we are spanning */
 657         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 658
 659         /* Process per-word as long as we have at least a full word left */
 660         do {
 661
 662             /* Keep going if the whole word is composed of 'span_byte's */
 663             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 664                 s += PERL_WORDSIZE;
 665                 continue;
 666             }
 667
 668             /* Here, at least one byte in the word isn't 'span_byte'. */
 669
 670 #ifdef EBCDIC
 671
 672             break;
 673
 674 #else
 675
 676             /* This xor leaves 1 bits only in those non-matching bytes */
 677             span_word ^= * (PERL_UINTMAX_T *) s;
 678
 679             /* Make sure the upper bit of each non-matching byte is set.  This
 680              * makes each such byte look like an ASCII platform variant byte */
 681             span_word |= span_word << 1;
 682             span_word |= span_word << 2;
 683             span_word |= span_word << 4;
 684
 685             /* That reduces the problem to what this function solves */
 686             return s + _variant_byte_number(span_word);
 687
 688 #endif
 689
 690         } while (s + PERL_WORDSIZE <= send);
 691     }
 692
 693     /* Process the straggler bytes beyond the final word boundary */
 694     while (s < send) {
 695         if (*s != span_byte) {
 696             return s;
 697         }
 698         s++;
 699     }
 700
 701     return s;
 702 }
 703
 704 STATIC U8 *
 705 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 706 {
 707     /* Returns the position of the first byte in the sequence between 's'
 708      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 709      * returns 'send' if none found.  It uses word-level operations instead of
 710      * byte to speed up the process */
 711
 712     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 713
 714     assert(send >= s);
 715     assert((byte & mask) == byte);
 716
 717 #ifndef EBCDIC
 718
 719     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 720                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 721                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 722     {
 723         PERL_UINTMAX_T word, mask_word;
 724
 725         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 726             if (((*s) & mask) == byte) {
 727                 return s;
 728             }
 729             s++;
 730         }
 731
 732         word      = PERL_COUNT_MULTIPLIER * byte;
 733         mask_word = PERL_COUNT_MULTIPLIER * mask;
 734
 735         do {
 736             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 737
 738             /* If 'masked' contains bytes with the bit pattern of 'byte' within
 739              * it, xoring with 'word' will leave each of the 8 bits in such
 740              * bytes be 0, and no byte containing any other bit pattern will be
 741              * 0. */
 742             masked ^= word;
 743
 744             /* This causes the most significant bit to be set to 1 for any
 745              * bytes in the word that aren't completely 0 */
 746             masked |= masked << 1;
 747             masked |= masked << 2;
 748             masked |= masked << 4;
 749
 750             /* The msbits are the same as what marks a byte as variant, so we
 751              * can use this mask.  If all msbits are 1, the word doesn't
 752              * contain 'byte' */
 753             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 754                 s += PERL_WORDSIZE;
 755                 continue;
 756             }
 757
 758             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 759              * and any that are, are 0.  Complement and re-AND to swap that */
 760             masked = ~ masked;
 761             masked &= PERL_VARIANTS_WORD_MASK;
 762
 763             /* This reduces the problem to that solved by this function */
 764             s += _variant_byte_number(masked);
 765             return s;
 766
 767         } while (s + PERL_WORDSIZE <= send);
 768     }
 769
 770 #endif
 771
 772     while (s < send) {
 773         if (((*s) & mask) == byte) {
 774             return s;
 775         }
 776         s++;
 777     }
 778
 779     return s;
 780 }
 781
 782 STATIC U8 *
 783 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 784 {
 785     /* Returns the position of the first byte in the sequence between 's' and
 786      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 787      * 'span_byte' should have been ANDed with 'mask' in the call of this
 788      * function.  Returns 'send' if none found.  Works like find_span_end(),
 789      * except for the AND */
 790
 791     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 792
 793     assert(send >= s);
 794     assert((span_byte & mask) == span_byte);
 795
 796     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 797                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 798                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 799     {
 800         PERL_UINTMAX_T span_word, mask_word;
 801
 802         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 803             if (((*s) & mask) != span_byte) {
 804                 return s;
 805             }
 806             s++;
 807         }
 808
 809         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 810         mask_word = PERL_COUNT_MULTIPLIER * mask;
 811
 812         do {
 813             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 814
 815             if (masked == span_word) {
 816                 s += PERL_WORDSIZE;
 817                 continue;
 818             }
 819
 820 #ifdef EBCDIC
 821
 822             break;
 823
 824 #else
 825
 826             masked ^= span_word;
 827             masked |= masked << 1;
 828             masked |= masked << 2;
 829             masked |= masked << 4;
 830             return s + _variant_byte_number(masked);
 831
 832 #endif
 833
 834         } while (s + PERL_WORDSIZE <= send);
 835     }
 836
 837     while (s < send) {
 838         if (((*s) & mask) != span_byte) {
 839             return s;
 840         }
 841         s++;
 842     }
 843
 844     return s;
 845 }
 846
 847 /*
 848  * pregexec and friends
 849  */
 850
 851 #ifndef PERL_IN_XSUB_RE
 852 /*
 853  - pregexec - match a regexp against a string
 854  */
 855 I32
 856 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 857          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 858 /* stringarg: the point in the string at which to begin matching */
 859 /* strend:    pointer to null at end of string */
 860 /* strbeg:    real beginning of string */
 861 /* minend:    end of match must be >= minend bytes after stringarg. */
 862 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 863  *            itself is accessed via the pointers above */
 864 /* nosave:    For optimizations. */
 865 {
 866     PERL_ARGS_ASSERT_PREGEXEC;
 867
 868     return
 869         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 870                       nosave ? 0 : REXEC_COPY_STR);
 871 }
 872 #endif
 873
 874
 875
 876 /* re_intuit_start():
 877  *
 878  * Based on some optimiser hints, try to find the earliest position in the
 879  * string where the regex could match.
 880  *
 881  *   rx:     the regex to match against
 882  *   sv:     the SV being matched: only used for utf8 flag; the string
 883  *           itself is accessed via the pointers below. Note that on
 884  *           something like an overloaded SV, SvPOK(sv) may be false
 885  *           and the string pointers may point to something unrelated to
 886  *           the SV itself.
 887  *   strbeg: real beginning of string
 888  *   strpos: the point in the string at which to begin matching
 889  *   strend: pointer to the byte following the last char of the string
 890  *   flags   currently unused; set to 0
 891  *   data:   currently unused; set to NULL
 892  *
 893  * The basic idea of re_intuit_start() is to use some known information
 894  * about the pattern, namely:
 895  *
 896  *   a) the longest known anchored substring (i.e. one that's at a
 897  *      constant offset from the beginning of the pattern; but not
 898  *      necessarily at a fixed offset from the beginning of the
 899  *      string);
 900  *   b) the longest floating substring (i.e. one that's not at a constant
 901  *      offset from the beginning of the pattern);
 902  *   c) Whether the pattern is anchored to the string; either
 903  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 904  *      or anchored to pos(): /\G/;
 905  *   d) A start class: a real or synthetic character class which
 906  *      represents which characters are legal at the start of the pattern;
 907  *
 908  * to either quickly reject the match, or to find the earliest position
 909  * within the string at which the pattern might match, thus avoiding
 910  * running the full NFA engine at those earlier locations, only to
 911  * eventually fail and retry further along.
 912  *
 913  * Returns NULL if the pattern can't match, or returns the address within
 914  * the string which is the earliest place the match could occur.
 915  *
 916  * The longest of the anchored and floating substrings is called 'check'
 917  * and is checked first. The other is called 'other' and is checked
 918  * second. The 'other' substring may not be present.  For example,
 919  *
 920  *    /(abc|xyz)ABC\d{0,3}DEFG/
 921  *
 922  * will have
 923  *
 924  *   check substr (float)    = "DEFG", offset 6..9 chars
 925  *   other substr (anchored) = "ABC",  offset 3..3 chars
 926  *   stclass = [ax]
 927  *
 928  * Be aware that during the course of this function, sometimes 'anchored'
 929  * refers to a substring being anchored relative to the start of the
 930  * pattern, and sometimes to the pattern itself being anchored relative to
 931  * the string. For example:
 932  *
 933  *   /\dabc/:   "abc" is anchored to the pattern;
 934  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 935  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 936  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 937  *                    but the pattern is anchored to the string.
 938  */
 939
 940 char *
 941 Perl_re_intuit_start(pTHX_
 942                     REGEXP * const rx,
 943                     SV *sv,
 944                     const char * const strbeg,
 945                     char *strpos,
 946                     char *strend,
 947                     const U32 flags,
 948                     re_scream_pos_data *data)
 949 {
 950     struct regexp *const prog = ReANY(rx);
 951     SSize_t start_shift = prog->check_offset_min;
 952     /* Should be nonnegative! */
 953     SSize_t end_shift   = 0;
 954     /* current lowest pos in string where the regex can start matching */
 955     char *rx_origin = strpos;
 956     SV *check;
 957     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 958     U8   other_ix = 1 - prog->substrs->check_ix;
 959     bool ml_anch = 0;
 960     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 961     char *check_at = NULL;              /* check substr found at this pos */
 962     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 963     RXi_GET_DECL(prog,progi);
 964     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 965     regmatch_info *const reginfo = &reginfo_buf;
 966     GET_RE_DEBUG_FLAGS_DECL;
 967
 968     PERL_ARGS_ASSERT_RE_INTUIT_START;
 969     PERL_UNUSED_ARG(flags);
 970     PERL_UNUSED_ARG(data);
 971
 972     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 973                 "Intuit: trying to determine minimum start position...\n"));
 974
 975     /* for now, assume that all substr offsets are positive. If at some point
 976      * in the future someone wants to do clever things with lookbehind and
 977      * -ve offsets, they'll need to fix up any code in this function
 978      * which uses these offsets. See the thread beginning
 979      * <20140113145929.GF27210@iabyn.com>
 980      */
 981     assert(prog->substrs->data[0].min_offset >= 0);
 982     assert(prog->substrs->data[0].max_offset >= 0);
 983     assert(prog->substrs->data[1].min_offset >= 0);
 984     assert(prog->substrs->data[1].max_offset >= 0);
 985     assert(prog->substrs->data[2].min_offset >= 0);
 986     assert(prog->substrs->data[2].max_offset >= 0);
 987
 988     /* for now, assume that if both present, that the floating substring
 989      * doesn't start before the anchored substring.
 990      * If you break this assumption (e.g. doing better optimisations
 991      * with lookahead/behind), then you'll need to audit the code in this
 992      * function carefully first
 993      */
 994     assert(
 995             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 996               && (prog->float_utf8    || prog->float_substr))
 997            || (prog->float_min_offset >= prog->anchored_offset));
 998
 999     /* byte rather than char calculation for efficiency. It fails
1000      * to quickly reject some cases that can't match, but will reject
1001      * them later after doing full char arithmetic */
1002     if (prog->minlen > strend - strpos) {
1003         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1004                               "  String too short...\n"));
1005         goto fail;
1006     }
1007
1008     RXp_MATCH_UTF8_set(prog, utf8_target);
1009     reginfo->is_utf8_target = cBOOL(utf8_target);
1010     reginfo->info_aux = NULL;
1011     reginfo->strbeg = strbeg;
1012     reginfo->strend = strend;
1013     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1014     reginfo->intuit = 1;
1015     /* not actually used within intuit, but zero for safety anyway */
1016     reginfo->poscache_maxiter = 0;
1017
1018     if (utf8_target) {
1019         if ((!prog->anchored_utf8 && prog->anchored_substr)
1020                 || (!prog->float_utf8 && prog->float_substr))
1021             to_utf8_substr(prog);
1022         check = prog->check_utf8;
1023     } else {
1024         if (!prog->check_substr && prog->check_utf8) {
1025             if (! to_byte_substr(prog)) {
1026                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1027             }
1028         }
1029         check = prog->check_substr;
1030     }
1031
1032     /* dump the various substring data */
1033     DEBUG_OPTIMISE_MORE_r({
1034         int i;
1035         for (i=0; i<=2; i++) {
1036             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1037                                   : prog->substrs->data[i].substr);
1038             if (!sv)
1039                 continue;
1040
1041             Perl_re_printf( aTHX_
1042                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1043                 " useful=%" IVdf " utf8=%d [%s]\n",
1044                 i,
1045                 (IV)prog->substrs->data[i].min_offset,
1046                 (IV)prog->substrs->data[i].max_offset,
1047                 (IV)prog->substrs->data[i].end_shift,
1048                 BmUSEFUL(sv),
1049                 utf8_target ? 1 : 0,
1050                 SvPEEK(sv));
1051         }
1052     });
1053
1054     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1055
1056         /* ml_anch: check after \n?
1057          *
1058          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1059          * with /.*.../, these flags will have been added by the
1060          * compiler:
1061          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1062          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1063          */
1064         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1065                    && !(prog->intflags & PREGf_IMPLICIT);
1066
1067         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1068             /* we are only allowed to match at BOS or \G */
1069
1070             /* trivially reject if there's a BOS anchor and we're not at BOS.
1071              *
1072              * Note that we don't try to do a similar quick reject for
1073              * \G, since generally the caller will have calculated strpos
1074              * based on pos() and gofs, so the string is already correctly
1075              * anchored by definition; and handling the exceptions would
1076              * be too fiddly (e.g. REXEC_IGNOREPOS).
1077              */
1078             if (   strpos != strbeg
1079                 && (prog->intflags & PREGf_ANCH_SBOL))
1080             {
1081                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1082                                 "  Not at start...\n"));
1083                 goto fail;
1084             }
1085
1086             /* in the presence of an anchor, the anchored (relative to the
1087              * start of the regex) substr must also be anchored relative
1088              * to strpos. So quickly reject if substr isn't found there.
1089              * This works for \G too, because the caller will already have
1090              * subtracted gofs from pos, and gofs is the offset from the
1091              * \G to the start of the regex. For example, in /.abc\Gdef/,
1092              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1093              * caller will have set strpos=pos()-4; we look for the substr
1094              * at position pos()-4+1, which lines up with the "a" */
1095
1096             if (prog->check_offset_min == prog->check_offset_max) {
1097                 /* Substring at constant offset from beg-of-str... */
1098                 SSize_t slen = SvCUR(check);
1099                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1100
1101                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1102                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1103                     (IV)prog->check_offset_min));
1104
1105                 if (SvTAIL(check)) {
1106                     /* In this case, the regex is anchored at the end too.
1107                      * Unless it's a multiline match, the lengths must match
1108                      * exactly, give or take a \n.  NB: slen >= 1 since
1109                      * the last char of check is \n */
1110                     if (!multiline
1111                         && (   strend - s > slen
1112                             || strend - s < slen - 1
1113                             || (strend - s == slen && strend[-1] != '\n')))
1114                     {
1115                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1116                                             "  String too long...\n"));
1117                         goto fail_finish;
1118                     }
1119                     /* Now should match s[0..slen-2] */
1120                     slen--;
1121                 }
1122                 if (slen && (strend - s < slen
1123                     || *SvPVX_const(check) != *s
1124                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1125                 {
1126                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1127                                     "  String not equal...\n"));
1128                     goto fail_finish;
1129                 }
1130
1131                 check_at = s;
1132                 goto success_at_start;
1133             }
1134         }
1135     }
1136
1137     end_shift = prog->check_end_shift;
1138
1139 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1140     if (end_shift < 0)
1141         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1142                    (IV)end_shift, RX_PRECOMP(rx));
1143 #endif
1144
1145   restart:
1146
1147     /* This is the (re)entry point of the main loop in this function.
1148      * The goal of this loop is to:
1149      * 1) find the "check" substring in the region rx_origin..strend
1150      *    (adjusted by start_shift / end_shift). If not found, reject
1151      *    immediately.
1152      * 2) If it exists, look for the "other" substr too if defined; for
1153      *    example, if the check substr maps to the anchored substr, then
1154      *    check the floating substr, and vice-versa. If not found, go
1155      *    back to (1) with rx_origin suitably incremented.
1156      * 3) If we find an rx_origin position that doesn't contradict
1157      *    either of the substrings, then check the possible additional
1158      *    constraints on rx_origin of /^.../m or a known start class.
1159      *    If these fail, then depending on which constraints fail, jump
1160      *    back to here, or to various other re-entry points further along
1161      *    that skip some of the first steps.
1162      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1163      *    substring. If the start position was determined to be at the
1164      *    beginning of the string  - so, not rejected, but not optimised,
1165      *    since we have to run regmatch from position 0 - decrement the
1166      *    BmUSEFUL() count. Otherwise increment it.
1167      */
1168
1169
1170     /* first, look for the 'check' substring */
1171
1172     {
1173         U8* start_point;
1174         U8* end_point;
1175
1176         DEBUG_OPTIMISE_MORE_r({
1177             Perl_re_printf( aTHX_
1178                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1179                 " Start shift: %" IVdf " End shift %" IVdf
1180                 " Real end Shift: %" IVdf "\n",
1181                 (IV)(rx_origin - strbeg),
1182                 (IV)prog->check_offset_min,
1183                 (IV)start_shift,
1184                 (IV)end_shift,
1185                 (IV)prog->check_end_shift);
1186         });
1187
1188         end_point = HOPBACK3(strend, end_shift, rx_origin);
1189         if (!end_point)
1190             goto fail_finish;
1191         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1192         if (!start_point)
1193             goto fail_finish;
1194
1195
1196         /* If the regex is absolutely anchored to either the start of the
1197          * string (SBOL) or to pos() (ANCH_GPOS), then
1198          * check_offset_max represents an upper bound on the string where
1199          * the substr could start. For the ANCH_GPOS case, we assume that
1200          * the caller of intuit will have already set strpos to
1201          * pos()-gofs, so in this case strpos + offset_max will still be
1202          * an upper bound on the substr.
1203          */
1204         if (!ml_anch
1205             && prog->intflags & PREGf_ANCH
1206             && prog->check_offset_max != SSize_t_MAX)
1207         {
1208             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1209             const char * const anchor =
1210                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1211             SSize_t targ_len = (char*)end_point - anchor;
1212
1213             if (check_len > targ_len) {
1214                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1215                               "Target string too short to match required substring...\n"));
1216                 goto fail_finish;
1217             }
1218
1219             /* do a bytes rather than chars comparison. It's conservative;
1220              * so it skips doing the HOP if the result can't possibly end
1221              * up earlier than the old value of end_point.
1222              */
1223             assert(anchor + check_len <= (char *)end_point);
1224             if (prog->check_offset_max + check_len < targ_len) {
1225                 end_point = HOP3lim((U8*)anchor,
1226                                 prog->check_offset_max,
1227                                 end_point - check_len
1228                             )
1229                             + check_len;
1230                 if (end_point < start_point)
1231                     goto fail_finish;
1232             }
1233         }
1234
1235         check_at = fbm_instr( start_point, end_point,
1236                       check, multiline ? FBMrf_MULTILINE : 0);
1237
1238         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1239             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1240             (IV)((char*)start_point - strbeg),
1241             (IV)((char*)end_point   - strbeg),
1242             (IV)(check_at ? check_at - strbeg : -1)
1243         ));
1244
1245         /* Update the count-of-usability, remove useless subpatterns,
1246             unshift s.  */
1247
1248         DEBUG_EXECUTE_r({
1249             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1250                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1251             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1252                               (check_at ? "Found" : "Did not find"),
1253                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1254                     ? "anchored" : "floating"),
1255                 quoted,
1256                 RE_SV_TAIL(check),
1257                 (check_at ? " at offset " : "...\n") );
1258         });
1259
1260         if (!check_at)
1261             goto fail_finish;
1262         /* set rx_origin to the minimum position where the regex could start
1263          * matching, given the constraint of the just-matched check substring.
1264          * But don't set it lower than previously.
1265          */
1266
1267         if (check_at - rx_origin > prog->check_offset_max)
1268             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1269         /* Finish the diagnostic message */
1270         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1271             "%ld (rx_origin now %" IVdf ")...\n",
1272             (long)(check_at - strbeg),
1273             (IV)(rx_origin - strbeg)
1274         ));
1275     }
1276
1277
1278     /* now look for the 'other' substring if defined */
1279
1280     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1281                     : prog->substrs->data[other_ix].substr)
1282     {
1283         /* Take into account the "other" substring. */
1284         char *last, *last1;
1285         char *s;
1286         SV* must;
1287         struct reg_substr_datum *other;
1288
1289       do_other_substr:
1290         other = &prog->substrs->data[other_ix];
1291
1292         /* if "other" is anchored:
1293          * we've previously found a floating substr starting at check_at.
1294          * This means that the regex origin must lie somewhere
1295          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1296          * and max:                 HOP3(check_at, -check_offset_min)
1297          * (except that min will be >= strpos)
1298          * So the fixed  substr must lie somewhere between
1299          *  HOP3(min, anchored_offset)
1300          *  HOP3(max, anchored_offset) + SvCUR(substr)
1301          */
1302
1303         /* if "other" is floating
1304          * Calculate last1, the absolute latest point where the
1305          * floating substr could start in the string, ignoring any
1306          * constraints from the earlier fixed match. It is calculated
1307          * as follows:
1308          *
1309          * strend - prog->minlen (in chars) is the absolute latest
1310          * position within the string where the origin of the regex
1311          * could appear. The latest start point for the floating
1312          * substr is float_min_offset(*) on from the start of the
1313          * regex.  last1 simply combines thee two offsets.
1314          *
1315          * (*) You might think the latest start point should be
1316          * float_max_offset from the regex origin, and technically
1317          * you'd be correct. However, consider
1318          *    /a\d{2,4}bcd\w/
1319          * Here, float min, max are 3,5 and minlen is 7.
1320          * This can match either
1321          *    /a\d\dbcd\w/
1322          *    /a\d\d\dbcd\w/
1323          *    /a\d\d\d\dbcd\w/
1324          * In the first case, the regex matches minlen chars; in the
1325          * second, minlen+1, in the third, minlen+2.
1326          * In the first case, the floating offset is 3 (which equals
1327          * float_min), in the second, 4, and in the third, 5 (which
1328          * equals float_max). In all cases, the floating string bcd
1329          * can never start more than 4 chars from the end of the
1330          * string, which equals minlen - float_min. As the substring
1331          * starts to match more than float_min from the start of the
1332          * regex, it makes the regex match more than minlen chars,
1333          * and the two cancel each other out. So we can always use
1334          * float_min - minlen, rather than float_max - minlen for the
1335          * latest position in the string.
1336          *
1337          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1338          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1339          */
1340
1341         assert(prog->minlen >= other->min_offset);
1342         last1 = HOP3c(strend,
1343                         other->min_offset - prog->minlen, strbeg);
1344
1345         if (other_ix) {/* i.e. if (other-is-float) */
1346             /* last is the latest point where the floating substr could
1347              * start, *given* any constraints from the earlier fixed
1348              * match. This constraint is that the floating string starts
1349              * <= float_max_offset chars from the regex origin (rx_origin).
1350              * If this value is less than last1, use it instead.
1351              */
1352             assert(rx_origin <= last1);
1353             last =
1354                 /* this condition handles the offset==infinity case, and
1355                  * is a short-cut otherwise. Although it's comparing a
1356                  * byte offset to a char length, it does so in a safe way,
1357                  * since 1 char always occupies 1 or more bytes,
1358                  * so if a string range is  (last1 - rx_origin) bytes,
1359                  * it will be less than or equal to  (last1 - rx_origin)
1360                  * chars; meaning it errs towards doing the accurate HOP3
1361                  * rather than just using last1 as a short-cut */
1362                 (last1 - rx_origin) < other->max_offset
1363                     ? last1
1364                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1365         }
1366         else {
1367             assert(strpos + start_shift <= check_at);
1368             last = HOP4c(check_at, other->min_offset - start_shift,
1369                         strbeg, strend);
1370         }
1371
1372         s = HOP3c(rx_origin, other->min_offset, strend);
1373         if (s < other_last)     /* These positions already checked */
1374             s = other_last;
1375
1376         must = utf8_target ? other->utf8_substr : other->substr;
1377         assert(SvPOK(must));
1378         {
1379             char *from = s;
1380             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1381
1382             if (to > strend)
1383                 to = strend;
1384             if (from > to) {
1385                 s = NULL;
1386                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1387                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1388                     (IV)(from - strbeg),
1389                     (IV)(to   - strbeg)
1390                 ));
1391             }
1392             else {
1393                 s = fbm_instr(
1394                     (unsigned char*)from,
1395                     (unsigned char*)to,
1396                     must,
1397                     multiline ? FBMrf_MULTILINE : 0
1398                 );
1399                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1400                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1401                     (IV)(from - strbeg),
1402                     (IV)(to   - strbeg),
1403                     (IV)(s ? s - strbeg : -1)
1404                 ));
1405             }
1406         }
1407
1408         DEBUG_EXECUTE_r({
1409             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1410                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1411             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1412                 s ? "Found" : "Contradicts",
1413                 other_ix ? "floating" : "anchored",
1414                 quoted, RE_SV_TAIL(must));
1415         });
1416
1417
1418         if (!s) {
1419             /* last1 is latest possible substr location. If we didn't
1420              * find it before there, we never will */
1421             if (last >= last1) {
1422                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1423                                         "; giving up...\n"));
1424                 goto fail_finish;
1425             }
1426
1427             /* try to find the check substr again at a later
1428              * position. Maybe next time we'll find the "other" substr
1429              * in range too */
1430             other_last = HOP3c(last, 1, strend) /* highest failure */;
1431             rx_origin =
1432                 other_ix /* i.e. if other-is-float */
1433                     ? HOP3c(rx_origin, 1, strend)
1434                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1435             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1436                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1437                 (other_ix ? "floating" : "anchored"),
1438                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1439                 (IV)(rx_origin - strbeg)
1440             ));
1441             goto restart;
1442         }
1443         else {
1444             if (other_ix) { /* if (other-is-float) */
1445                 /* other_last is set to s, not s+1, since its possible for
1446                  * a floating substr to fail first time, then succeed
1447                  * second time at the same floating position; e.g.:
1448                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1449                  * The first time round, anchored and float match at
1450                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1451                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1452                  */
1453                 other_last = s;
1454             }
1455             else {
1456                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1457                 other_last = HOP3c(s, 1, strend);
1458             }
1459             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1460                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1461                   (long)(s - strbeg),
1462                 (IV)(rx_origin - strbeg)
1463               ));
1464
1465         }
1466     }
1467     else {
1468         DEBUG_OPTIMISE_MORE_r(
1469             Perl_re_printf( aTHX_
1470                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1471                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1472                 " strend:%" IVdf "\n",
1473                 (IV)prog->check_offset_min,
1474                 (IV)prog->check_offset_max,
1475                 (IV)(check_at-strbeg),
1476                 (IV)(rx_origin-strbeg),
1477                 (IV)(rx_origin-check_at),
1478                 (IV)(strend-strbeg)
1479             )
1480         );
1481     }
1482
1483   postprocess_substr_matches:
1484
1485     /* handle the extra constraint of /^.../m if present */
1486
1487     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1488         char *s;
1489
1490         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1491                         "  looking for /^/m anchor"));
1492
1493         /* we have failed the constraint of a \n before rx_origin.
1494          * Find the next \n, if any, even if it's beyond the current
1495          * anchored and/or floating substrings. Whether we should be
1496          * scanning ahead for the next \n or the next substr is debatable.
1497          * On the one hand you'd expect rare substrings to appear less
1498          * often than \n's. On the other hand, searching for \n means
1499          * we're effectively flipping between check_substr and "\n" on each
1500          * iteration as the current "rarest" string candidate, which
1501          * means for example that we'll quickly reject the whole string if
1502          * hasn't got a \n, rather than trying every substr position
1503          * first
1504          */
1505
1506         s = HOP3c(strend, - prog->minlen, strpos);
1507         if (s <= rx_origin ||
1508             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1509         {
1510             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1511                             "  Did not find /%s^%s/m...\n",
1512                             PL_colors[0], PL_colors[1]));
1513             goto fail_finish;
1514         }
1515
1516         /* earliest possible origin is 1 char after the \n.
1517          * (since *rx_origin == '\n', it's safe to ++ here rather than
1518          * HOP(rx_origin, 1)) */
1519         rx_origin++;
1520
1521         if (prog->substrs->check_ix == 0  /* check is anchored */
1522             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1523         {
1524             /* Position contradicts check-string; either because
1525              * check was anchored (and thus has no wiggle room),
1526              * or check was float and rx_origin is above the float range */
1527             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1528                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1529                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1530             goto restart;
1531         }
1532
1533         /* if we get here, the check substr must have been float,
1534          * is in range, and we may or may not have had an anchored
1535          * "other" substr which still contradicts */
1536         assert(prog->substrs->check_ix); /* check is float */
1537
1538         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1539             /* whoops, the anchored "other" substr exists, so we still
1540              * contradict. On the other hand, the float "check" substr
1541              * didn't contradict, so just retry the anchored "other"
1542              * substr */
1543             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1544                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1545                 PL_colors[0], PL_colors[1],
1546                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1547                 (IV)(rx_origin - strbeg)
1548             ));
1549             goto do_other_substr;
1550         }
1551
1552         /* success: we don't contradict the found floating substring
1553          * (and there's no anchored substr). */
1554         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1555             "  Found /%s^%s/m with rx_origin %ld...\n",
1556             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1557     }
1558     else {
1559         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1560             "  (multiline anchor test skipped)\n"));
1561     }
1562
1563   success_at_start:
1564
1565
1566     /* if we have a starting character class, then test that extra constraint.
1567      * (trie stclasses are too expensive to use here, we are better off to
1568      * leave it to regmatch itself) */
1569
1570     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1571         const U8* const str = (U8*)STRING(progi->regstclass);
1572
1573         /* XXX this value could be pre-computed */
1574         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1575                     ?  (reginfo->is_utf8_pat
1576                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1577                         : STR_LEN(progi->regstclass))
1578                     : 1);
1579         char * endpos;
1580         char *s;
1581         /* latest pos that a matching float substr constrains rx start to */
1582         char *rx_max_float = NULL;
1583
1584         /* if the current rx_origin is anchored, either by satisfying an
1585          * anchored substring constraint, or a /^.../m constraint, then we
1586          * can reject the current origin if the start class isn't found
1587          * at the current position. If we have a float-only match, then
1588          * rx_origin is constrained to a range; so look for the start class
1589          * in that range. if neither, then look for the start class in the
1590          * whole rest of the string */
1591
1592         /* XXX DAPM it's not clear what the minlen test is for, and why
1593          * it's not used in the floating case. Nothing in the test suite
1594          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1595          * Here are some old comments, which may or may not be correct:
1596          *
1597          *   minlen == 0 is possible if regstclass is \b or \B,
1598          *   and the fixed substr is ''$.
1599          *   Since minlen is already taken into account, rx_origin+1 is
1600          *   before strend; accidentally, minlen >= 1 guaranties no false
1601          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1602          *   0) below assumes that regstclass does not come from lookahead...
1603          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1604          *   This leaves EXACTF-ish only, which are dealt with in
1605          *   find_byclass().
1606          */
1607
1608         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1609             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1610         else if (prog->float_substr || prog->float_utf8) {
1611             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1612             endpos = HOP3clim(rx_max_float, cl_l, strend);
1613         }
1614         else
1615             endpos= strend;
1616
1617         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1618             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1619             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1620               (IV)start_shift, (IV)(check_at - strbeg),
1621               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1622
1623         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1624                             reginfo);
1625         if (!s) {
1626             if (endpos == strend) {
1627                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1628                                 "  Could not match STCLASS...\n") );
1629                 goto fail;
1630             }
1631             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1632                                "  This position contradicts STCLASS...\n") );
1633             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1634                         && !(prog->intflags & PREGf_IMPLICIT))
1635                 goto fail;
1636
1637             /* Contradict one of substrings */
1638             if (prog->anchored_substr || prog->anchored_utf8) {
1639                 if (prog->substrs->check_ix == 1) { /* check is float */
1640                     /* Have both, check_string is floating */
1641                     assert(rx_origin + start_shift <= check_at);
1642                     if (rx_origin + start_shift != check_at) {
1643                         /* not at latest position float substr could match:
1644                          * Recheck anchored substring, but not floating.
1645                          * The condition above is in bytes rather than
1646                          * chars for efficiency. It's conservative, in
1647                          * that it errs on the side of doing 'goto
1648                          * do_other_substr'. In this case, at worst,
1649                          * an extra anchored search may get done, but in
1650                          * practice the extra fbm_instr() is likely to
1651                          * get skipped anyway. */
1652                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1653                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1654                             (long)(other_last - strbeg),
1655                             (IV)(rx_origin - strbeg)
1656                         ));
1657                         goto do_other_substr;
1658                     }
1659                 }
1660             }
1661             else {
1662                 /* float-only */
1663
1664                 if (ml_anch) {
1665                     /* In the presence of ml_anch, we might be able to
1666                      * find another \n without breaking the current float
1667                      * constraint. */
1668
1669                     /* strictly speaking this should be HOP3c(..., 1, ...),
1670                      * but since we goto a block of code that's going to
1671                      * search for the next \n if any, its safe here */
1672                     rx_origin++;
1673                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1674                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1675                               PL_colors[0], PL_colors[1],
1676                               (long)(rx_origin - strbeg)) );
1677                     goto postprocess_substr_matches;
1678                 }
1679
1680                 /* strictly speaking this can never be true; but might
1681                  * be if we ever allow intuit without substrings */
1682                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1683                     goto fail;
1684
1685                 rx_origin = rx_max_float;
1686             }
1687
1688             /* at this point, any matching substrings have been
1689              * contradicted. Start again... */
1690
1691             rx_origin = HOP3c(rx_origin, 1, strend);
1692
1693             /* uses bytes rather than char calculations for efficiency.
1694              * It's conservative: it errs on the side of doing 'goto restart',
1695              * where there is code that does a proper char-based test */
1696             if (rx_origin + start_shift + end_shift > strend) {
1697                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1698                                        "  Could not match STCLASS...\n") );
1699                 goto fail;
1700             }
1701             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1702                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1703                 (prog->substrs->check_ix ? "floating" : "anchored"),
1704                 (long)(rx_origin + start_shift - strbeg),
1705                 (IV)(rx_origin - strbeg)
1706             ));
1707             goto restart;
1708         }
1709
1710         /* Success !!! */
1711
1712         if (rx_origin != s) {
1713             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1714                         "  By STCLASS: moving %ld --> %ld\n",
1715                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1716                    );
1717         }
1718         else {
1719             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1720                                   "  Does not contradict STCLASS...\n");
1721                    );
1722         }
1723     }
1724
1725     /* Decide whether using the substrings helped */
1726
1727     if (rx_origin != strpos) {
1728         /* Fixed substring is found far enough so that the match
1729            cannot start at strpos. */
1730
1731         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1732         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1733     }
1734     else {
1735         /* The found rx_origin position does not prohibit matching at
1736          * strpos, so calling intuit didn't gain us anything. Decrement
1737          * the BmUSEFUL() count on the check substring, and if we reach
1738          * zero, free it.  */
1739         if (!(prog->intflags & PREGf_NAUGHTY)
1740             && (utf8_target ? (
1741                 prog->check_utf8                /* Could be deleted already */
1742                 && --BmUSEFUL(prog->check_utf8) < 0
1743                 && (prog->check_utf8 == prog->float_utf8)
1744             ) : (
1745                 prog->check_substr              /* Could be deleted already */
1746                 && --BmUSEFUL(prog->check_substr) < 0
1747                 && (prog->check_substr == prog->float_substr)
1748             )))
1749         {
1750             /* If flags & SOMETHING - do not do it many times on the same match */
1751             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1752             /* XXX Does the destruction order has to change with utf8_target? */
1753             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1754             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1755             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1756             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1757             check = NULL;                       /* abort */
1758             /* XXXX This is a remnant of the old implementation.  It
1759                     looks wasteful, since now INTUIT can use many
1760                     other heuristics. */
1761             prog->extflags &= ~RXf_USE_INTUIT;
1762         }
1763     }
1764
1765     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1766             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1767              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1768
1769     return rx_origin;
1770
1771   fail_finish:                          /* Substring not found */
1772     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1773         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1774   fail:
1775     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1776                           PL_colors[4], PL_colors[5]));
1777     return NULL;
1778 }
1779
1780
1781 #define DECL_TRIE_TYPE(scan) \
1782     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1783                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1784                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1785                     trie_type = ((scan->flags == EXACT)                             \
1786                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1787                                  : (scan->flags == EXACTL)                          \
1788                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1789                                     : (scan->flags == EXACTFAA)                     \
1790                                       ? (utf8_target                                \
1791                                          ? trie_utf8_exactfa_fold                   \
1792                                          : trie_latin_utf8_exactfa_fold)            \
1793                                       : (scan->flags == EXACTFLU8                   \
1794                                          ? (utf8_target                             \
1795                                            ? trie_flu8                              \
1796                                            : trie_flu8_latin)                       \
1797                                          : (utf8_target                             \
1798                                            ? trie_utf8_fold                         \
1799                                            : trie_latin_utf8_fold)))
1800
1801 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1802  * 'foldbuf+sizeof(foldbuf)' */
1803 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1804 STMT_START {                                                                        \
1805     STRLEN skiplen;                                                                 \
1806     U8 flags = FOLD_FLAGS_FULL;                                                     \
1807     switch (trie_type) {                                                            \
1808     case trie_flu8:                                                                 \
1809         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1810         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1811             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1812         }                                                                           \
1813         goto do_trie_utf8_fold;                                                     \
1814     case trie_utf8_exactfa_fold:                                                    \
1815         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1816         /* FALLTHROUGH */                                                           \
1817     case trie_utf8_fold:                                                            \
1818       do_trie_utf8_fold:                                                            \
1819         if ( foldlen>0 ) {                                                          \
1820             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1821             foldlen -= len;                                                         \
1822             uscan += len;                                                           \
1823             len=0;                                                                  \
1824         } else {                                                                    \
1825             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1826                                                                             flags); \
1827             len = UTF8SKIP(uc);                                                     \
1828             skiplen = UVCHR_SKIP( uvc );                                            \
1829             foldlen -= skiplen;                                                     \
1830             uscan = foldbuf + skiplen;                                              \
1831         }                                                                           \
1832         break;                                                                      \
1833     case trie_flu8_latin:                                                           \
1834         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1835         goto do_trie_latin_utf8_fold;                                               \
1836     case trie_latin_utf8_exactfa_fold:                                              \
1837         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1838         /* FALLTHROUGH */                                                           \
1839     case trie_latin_utf8_fold:                                                      \
1840       do_trie_latin_utf8_fold:                                                      \
1841         if ( foldlen>0 ) {                                                          \
1842             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1843             foldlen -= len;                                                         \
1844             uscan += len;                                                           \
1845             len=0;                                                                  \
1846         } else {                                                                    \
1847             len = 1;                                                                \
1848             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1849             skiplen = UVCHR_SKIP( uvc );                                            \
1850             foldlen -= skiplen;                                                     \
1851             uscan = foldbuf + skiplen;                                              \
1852         }                                                                           \
1853         break;                                                                      \
1854     case trie_utf8l:                                                                \
1855         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1856         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1857             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1858         }                                                                           \
1859         /* FALLTHROUGH */                                                           \
1860     case trie_utf8:                                                                 \
1861         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1862         break;                                                                      \
1863     case trie_plain:                                                                \
1864         uvc = (UV)*uc;                                                              \
1865         len = 1;                                                                    \
1866     }                                                                               \
1867     if (uvc < 256) {                                                                \
1868         charid = trie->charmap[ uvc ];                                              \
1869     }                                                                               \
1870     else {                                                                          \
1871         charid = 0;                                                                 \
1872         if (widecharmap) {                                                          \
1873             SV** const svpp = hv_fetch(widecharmap,                                 \
1874                         (char*)&uvc, sizeof(UV), 0);                                \
1875             if (svpp)                                                               \
1876                 charid = (U16)SvIV(*svpp);                                          \
1877         }                                                                           \
1878     }                                                                               \
1879 } STMT_END
1880
1881 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1882     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1883                 startpos, doutf8, depth)
1884
1885 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1886     STMT_START {                                            \
1887         while (s < strend) {                                \
1888             CODE                                            \
1889             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1890         }                                                   \
1891     } STMT_END
1892
1893 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1894     STMT_START {                                            \
1895         while (s < strend) {                                \
1896             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1897         }                                                   \
1898     } STMT_END
1899
1900 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1901     if (COND) {                                                \
1902         FBC_CHECK_AND_TRY                                      \
1903         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1904         previous_occurrence_end = s;                           \
1905     }                                                          \
1906     else {                                                     \
1907         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1908     }
1909
1910 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1911     if (utf8_target) {                                         \
1912         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1913     }                                                          \
1914     else {                                                     \
1915         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1916     }
1917
1918 /* We keep track of where the next character should start after an occurrence
1919  * of the one we're looking for.  Knowing that, we can see right away if the
1920  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1921  * don't accept the 2nd and succeeding adjacent occurrences */
1922 #define FBC_CHECK_AND_TRY                                      \
1923         if (   (   doevery                                     \
1924                 || s != previous_occurrence_end)               \
1925             && (reginfo->intuit || regtry(reginfo, &s)))       \
1926         {                                                      \
1927             goto got_it;                                       \
1928         }
1929
1930
1931 /* This differs from the above macros in that it calls a function which returns
1932  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1933  * there is no such occurrence. */
1934 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1935     while (s < strend) {                                    \
1936         s = (f);                                            \
1937         if (s >= strend) {                                  \
1938             break;                                          \
1939         }                                                   \
1940                                                             \
1941         FBC_CHECK_AND_TRY                                   \
1942         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1943         previous_occurrence_end = s;                        \
1944     }
1945
1946 /* The three macros below are slightly different versions of the same logic.
1947  *
1948  * The first is for /a and /aa when the target string is UTF-8.  This can only
1949  * match ascii, but it must advance based on UTF-8.   The other two handle the
1950  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1951  * for the boundary (or non-boundary) between a word and non-word character.
1952  * The utf8 and non-utf8 cases have the same logic, but the details must be
1953  * different.  Find the "wordness" of the character just prior to this one, and
1954  * compare it with the wordness of this one.  If they differ, we have a
1955  * boundary.  At the beginning of the string, pretend that the previous
1956  * character was a new-line.
1957  *
1958  * All these macros uncleanly have side-effects with each other and outside
1959  * variables.  So far it's been too much trouble to clean-up
1960  *
1961  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1962  *               a word character or not.
1963  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1964  *               word/non-word
1965  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1966  *
1967  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1968  * are looking for a boundary or for a non-boundary.  If we are looking for a
1969  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1970  * see if this tentative match actually works, and if so, to quit the loop
1971  * here.  And vice-versa if we are looking for a non-boundary.
1972  *
1973  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1974  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1975  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1976  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1977  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1978  * complement.  But in that branch we complement tmp, meaning that at the
1979  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1980  * which means at the top of the loop in the next iteration, it is
1981  * TEST_NON_UTF8(s-1) */
1982 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1983     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1984     tmp = TEST_NON_UTF8(tmp);                                                  \
1985     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1986         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1987             tmp = !tmp;                                                        \
1988             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1989         }                                                                      \
1990         else {                                                                 \
1991             IF_FAIL;                                                           \
1992         }                                                                      \
1993     );                                                                         \
1994
1995 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1996  * TEST_UTF8 is a macro that for the same input code points returns identically
1997  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1998 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1999     if (s == reginfo->strbeg) {                                                \
2000         tmp = '\n';                                                            \
2001     }                                                                          \
2002     else { /* Back-up to the start of the previous character */                \
2003         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
2004         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
2005                                                        0, UTF8_ALLOW_DEFAULT); \
2006     }                                                                          \
2007     tmp = TEST_UV(tmp);                                                        \
2008     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2009         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2010             tmp = !tmp;                                                        \
2011             IF_SUCCESS;                                                        \
2012         }                                                                      \
2013         else {                                                                 \
2014             IF_FAIL;                                                           \
2015         }                                                                      \
2016     );
2017
2018 /* Like the above two macros.  UTF8_CODE is the complete code for handling
2019  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2020  * macros below */
2021 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
2022     if (utf8_target) {                                                         \
2023         UTF8_CODE                                                              \
2024     }                                                                          \
2025     else {  /* Not utf8 */                                                     \
2026         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
2027         tmp = TEST_NON_UTF8(tmp);                                              \
2028         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
2029             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
2030                 IF_SUCCESS;                                                    \
2031                 tmp = !tmp;                                                    \
2032             }                                                                  \
2033             else {                                                             \
2034                 IF_FAIL;                                                       \
2035             }                                                                  \
2036         );                                                                     \
2037     }                                                                          \
2038     /* Here, things have been set up by the previous code so that tmp is the   \
2039      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
2040      * utf8ness of the target).  We also have to check if this matches against \
2041      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
2042      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
2043      * string */                                                               \
2044     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2045         IF_SUCCESS;                                                            \
2046     }                                                                          \
2047     else {                                                                     \
2048         IF_FAIL;                                                               \
2049     }
2050
2051 /* This is the macro to use when we want to see if something that looks like it
2052  * could match, actually does, and if so exits the loop */
2053 #define REXEC_FBC_TRYIT                            \
2054     if ((reginfo->intuit || regtry(reginfo, &s)))  \
2055         goto got_it
2056
2057 /* The only difference between the BOUND and NBOUND cases is that
2058  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2059  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2060  * with the other one being empty (PLACEHOLDER is defined as empty).
2061  *
2062  * The TEST_FOO parameters are for operating on different forms of input, but
2063  * all should be ones that return identically for the same underlying code
2064  * points */
2065 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
2066     FBC_BOUND_COMMON(                                                          \
2067           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
2068           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2069
2070 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2071     FBC_BOUND_COMMON(                                                          \
2072             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2073             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2074
2075 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2076     FBC_BOUND_COMMON(                                                          \
2077           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2078           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2079
2080 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2081     FBC_BOUND_COMMON(                                                          \
2082             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2083             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2084
2085 #ifdef DEBUGGING
2086 static IV
2087 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2088   IV cp_out = _invlist_search(invlist, cp_in);
2089   assert(cp_out >= 0);
2090   return cp_out;
2091 }
2092 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2093         invmap[S_get_break_val_cp_checked(invlist, cp)]
2094 #else
2095 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2096         invmap[_invlist_search(invlist, cp)]
2097 #endif
2098
2099 /* Takes a pointer to an inversion list, a pointer to its corresponding
2100  * inversion map, and a code point, and returns the code point's value
2101  * according to the two arrays.  It assumes that all code points have a value.
2102  * This is used as the base macro for macros for particular properties */
2103 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2104         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2105
2106 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2107  * of a code point, returning the value for the first code point in the string.
2108  * And it takes the particular macro name that finds the desired value given a
2109  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2110 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2111              (__ASSERT_(pos < strend)                                          \
2112                  /* Note assumes is valid UTF-8 */                             \
2113              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2114
2115 /* Returns the GCB value for the input code point */
2116 #define getGCB_VAL_CP(cp)                                                      \
2117           _generic_GET_BREAK_VAL_CP(                                           \
2118                                     PL_GCB_invlist,                            \
2119                                     _Perl_GCB_invmap,                          \
2120                                     (cp))
2121
2122 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2123  * bounded by pos and strend */
2124 #define getGCB_VAL_UTF8(pos, strend)                                           \
2125     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2126
2127 /* Returns the LB value for the input code point */
2128 #define getLB_VAL_CP(cp)                                                       \
2129           _generic_GET_BREAK_VAL_CP(                                           \
2130                                     PL_LB_invlist,                             \
2131                                     _Perl_LB_invmap,                           \
2132                                     (cp))
2133
2134 /* Returns the LB value for the first code point in the UTF-8 encoded string
2135  * bounded by pos and strend */
2136 #define getLB_VAL_UTF8(pos, strend)                                            \
2137     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2138
2139
2140 /* Returns the SB value for the input code point */
2141 #define getSB_VAL_CP(cp)                                                       \
2142           _generic_GET_BREAK_VAL_CP(                                           \
2143                                     PL_SB_invlist,                             \
2144                                     _Perl_SB_invmap,                     \
2145                                     (cp))
2146
2147 /* Returns the SB value for the first code point in the UTF-8 encoded string
2148  * bounded by pos and strend */
2149 #define getSB_VAL_UTF8(pos, strend)                                            \
2150     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2151
2152 /* Returns the WB value for the input code point */
2153 #define getWB_VAL_CP(cp)                                                       \
2154           _generic_GET_BREAK_VAL_CP(                                           \
2155                                     PL_WB_invlist,                             \
2156                                     _Perl_WB_invmap,                         \
2157                                     (cp))
2158
2159 /* Returns the WB value for the first code point in the UTF-8 encoded string
2160  * bounded by pos and strend */
2161 #define getWB_VAL_UTF8(pos, strend)                                            \
2162     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2163
2164 /* We know what class REx starts with.  Try to find this position... */
2165 /* if reginfo->intuit, its a dryrun */
2166 /* annoyingly all the vars in this routine have different names from their counterparts
2167    in regmatch. /grrr */
2168 STATIC char *
2169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2170     const char *strend, regmatch_info *reginfo)
2171 {
2172     dVAR;
2173
2174     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2175     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2176
2177     char *pat_string;   /* The pattern's exactish string */
2178     char *pat_end;          /* ptr to end char of pat_string */
2179     re_fold_t folder;   /* Function for computing non-utf8 folds */
2180     const U8 *fold_array;   /* array for folding ords < 256 */
2181     STRLEN ln;
2182     STRLEN lnc;
2183     U8 c1;
2184     U8 c2;
2185     char *e;
2186
2187     /* In some cases we accept only the first occurence of 'x' in a sequence of
2188      * them.  This variable points to just beyond the end of the previous
2189      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2190      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2191      * hop back.) */
2192     char * previous_occurrence_end = 0;
2193
2194     I32 tmp;            /* Scratch variable */
2195     const bool utf8_target = reginfo->is_utf8_target;
2196     UV utf8_fold_flags = 0;
2197     const bool is_utf8_pat = reginfo->is_utf8_pat;
2198     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2199                                    with a result inverts that result, as 0^1 =
2200                                    1 and 1^1 = 0 */
2201     _char_class_number classnum;
2202
2203     RXi_GET_DECL(prog,progi);
2204
2205     PERL_ARGS_ASSERT_FIND_BYCLASS;
2206
2207     /* We know what class it must start with. */
2208     switch (OP(c)) {
2209     case ANYOFPOSIXL:
2210     case ANYOFL:
2211         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2212
2213         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2214             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2215         }
2216
2217         /* FALLTHROUGH */
2218     case ANYOFD:
2219     case ANYOF:
2220         if (utf8_target) {
2221             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2222                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2223         }
2224         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2225             /* We know that s is in the bitmap range since the target isn't
2226              * UTF-8, so what happens for out-of-range values is not relevant,
2227              * so exclude that from the flags */
2228             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2229         }
2230         else {
2231             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2232         }
2233         break;
2234
2235     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2236         /* UTF-8ness doesn't matter, so use 0 */
2237         REXEC_FBC_FIND_NEXT_SCAN(0,
2238          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2239                                    (U8) ARG(c), FLAGS(c)));
2240         break;
2241
2242     case NANYOFM:
2243         REXEC_FBC_FIND_NEXT_SCAN(0,
2244          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2245                                    (U8) ARG(c), FLAGS(c)));
2246         break;
2247
2248     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2249         assert(! is_utf8_pat);
2250         /* FALLTHROUGH */
2251     case EXACTFAA:
2252         if (is_utf8_pat || utf8_target) {
2253             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2254             goto do_exactf_utf8;
2255         }
2256         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2257         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2258         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2259
2260     case EXACTF:   /* This node only generated for non-utf8 patterns */
2261         assert(! is_utf8_pat);
2262         if (utf8_target) {
2263             utf8_fold_flags = 0;
2264             goto do_exactf_utf8;
2265         }
2266         fold_array = PL_fold;
2267         folder = foldEQ;
2268         goto do_exactf_non_utf8;
2269
2270     case EXACTFL:
2271         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2272         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2273             utf8_fold_flags = FOLDEQ_LOCALE;
2274             goto do_exactf_utf8;
2275         }
2276         fold_array = PL_fold_locale;
2277         folder = foldEQ_locale;
2278         goto do_exactf_non_utf8;
2279
2280     case EXACTFU_SS:
2281         if (is_utf8_pat) {
2282             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2283         }
2284         goto do_exactf_utf8;
2285
2286     case EXACTFLU8:
2287             if (! utf8_target) {    /* All code points in this node require
2288                                        UTF-8 to express.  */
2289                 break;
2290             }
2291             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2292                                              | FOLDEQ_S2_FOLDS_SANE;
2293             goto do_exactf_utf8;
2294
2295     case EXACTFU_ONLY8:
2296         if (! utf8_target) {
2297             break;
2298         }
2299         assert(is_utf8_pat);
2300         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2301         goto do_exactf_utf8;
2302
2303     case EXACTFU:
2304         if (is_utf8_pat || utf8_target) {
2305             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2306             goto do_exactf_utf8;
2307         }
2308
2309         /* Any 'ss' in the pattern should have been replaced by regcomp,
2310          * so we don't have to worry here about this single special case
2311          * in the Latin1 range */
2312         fold_array = PL_fold_latin1;
2313         folder = foldEQ_latin1;
2314
2315         /* FALLTHROUGH */
2316
2317       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2318                            are no glitches with fold-length differences
2319                            between the target string and pattern */
2320
2321         /* The idea in the non-utf8 EXACTF* cases is to first find the
2322          * first character of the EXACTF* node and then, if necessary,
2323          * case-insensitively compare the full text of the node.  c1 is the
2324          * first character.  c2 is its fold.  This logic will not work for
2325          * Unicode semantics and the german sharp ss, which hence should
2326          * not be compiled into a node that gets here. */
2327         pat_string = STRING(c);
2328         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2329
2330         /* We know that we have to match at least 'ln' bytes (which is the
2331          * same as characters, since not utf8).  If we have to match 3
2332          * characters, and there are only 2 availabe, we know without
2333          * trying that it will fail; so don't start a match past the
2334          * required minimum number from the far end */
2335         e = HOP3c(strend, -((SSize_t)ln), s);
2336         if (e < s)
2337             break;
2338
2339         c1 = *pat_string;
2340         c2 = fold_array[c1];
2341         if (c1 == c2) { /* If char and fold are the same */
2342             while (s <= e) {
2343                 s = (char *) memchr(s, c1, e + 1 - s);
2344                 if (s == NULL) {
2345                     break;
2346                 }
2347
2348                 /* Check that the rest of the node matches */
2349                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2350                     && (reginfo->intuit || regtry(reginfo, &s)) )
2351                 {
2352                     goto got_it;
2353                 }
2354                 s++;
2355             }
2356         }
2357         else {
2358             U8 bits_differing = c1 ^ c2;
2359
2360             /* If the folds differ in one bit position only, we can mask to
2361              * match either of them, and can use this faster find method.  Both
2362              * ASCII and EBCDIC tend to have their case folds differ in only
2363              * one position, so this is very likely */
2364             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2365                 bits_differing = ~ bits_differing;
2366                 while (s <= e) {
2367                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2368                                         (c1 & bits_differing), bits_differing);
2369                     if (s > e) {
2370                         break;
2371                     }
2372
2373                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2374                         && (reginfo->intuit || regtry(reginfo, &s)) )
2375                     {
2376                         goto got_it;
2377                     }
2378                     s++;
2379                 }
2380             }
2381             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2382                        should actually happen only in EXACTFL nodes */
2383                 while (s <= e) {
2384                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2385                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2386                         && (reginfo->intuit || regtry(reginfo, &s)) )
2387                     {
2388                         goto got_it;
2389                     }
2390                     s++;
2391                 }
2392             }
2393         }
2394         break;
2395
2396       do_exactf_utf8:
2397       {
2398         unsigned expansion;
2399
2400         /* If one of the operands is in utf8, we can't use the simpler folding
2401          * above, due to the fact that many different characters can have the
2402          * same fold, or portion of a fold, or different- length fold */
2403         pat_string = STRING(c);
2404         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2405         pat_end = pat_string + ln;
2406         lnc = is_utf8_pat       /* length to match in characters */
2407                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2408                 : ln;
2409
2410         /* We have 'lnc' characters to match in the pattern, but because of
2411          * multi-character folding, each character in the target can match
2412          * up to 3 characters (Unicode guarantees it will never exceed
2413          * this) if it is utf8-encoded; and up to 2 if not (based on the
2414          * fact that the Latin 1 folds are already determined, and the
2415          * only multi-char fold in that range is the sharp-s folding to
2416          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2417          * string character.  Adjust lnc accordingly, rounding up, so that
2418          * if we need to match at least 4+1/3 chars, that really is 5. */
2419         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2420         lnc = (lnc + expansion - 1) / expansion;
2421
2422         /* As in the non-UTF8 case, if we have to match 3 characters, and
2423          * only 2 are left, it's guaranteed to fail, so don't start a
2424          * match that would require us to go beyond the end of the string
2425          */
2426         e = HOP3c(strend, -((SSize_t)lnc), s);
2427
2428         /* XXX Note that we could recalculate e to stop the loop earlier,
2429          * as the worst case expansion above will rarely be met, and as we
2430          * go along we would usually find that e moves further to the left.
2431          * This would happen only after we reached the point in the loop
2432          * where if there were no expansion we should fail.  Unclear if
2433          * worth the expense */
2434
2435         while (s <= e) {
2436             char *my_strend= (char *)strend;
2437             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2438                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2439                 && (reginfo->intuit || regtry(reginfo, &s)) )
2440             {
2441                 goto got_it;
2442             }
2443             s += (utf8_target) ? UTF8SKIP(s) : 1;
2444         }
2445         break;
2446     }
2447
2448     case BOUNDL:
2449         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2450         if (FLAGS(c) != TRADITIONAL_BOUND) {
2451             if (! IN_UTF8_CTYPE_LOCALE) {
2452                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2453                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2454             }
2455             goto do_boundu;
2456         }
2457
2458         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2459         break;
2460
2461     case NBOUNDL:
2462         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2463         if (FLAGS(c) != TRADITIONAL_BOUND) {
2464             if (! IN_UTF8_CTYPE_LOCALE) {
2465                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2466                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2467             }
2468             goto do_nboundu;
2469         }
2470
2471         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2472         break;
2473
2474     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2475                    meaning */
2476         assert(FLAGS(c) == TRADITIONAL_BOUND);
2477
2478         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2479         break;
2480
2481     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2482                    meaning */
2483         assert(FLAGS(c) == TRADITIONAL_BOUND);
2484
2485         FBC_BOUND_A(isWORDCHAR_A);
2486         break;
2487
2488     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2489                    meaning */
2490         assert(FLAGS(c) == TRADITIONAL_BOUND);
2491
2492         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2493         break;
2494
2495     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2496                    meaning */
2497         assert(FLAGS(c) == TRADITIONAL_BOUND);
2498
2499         FBC_NBOUND_A(isWORDCHAR_A);
2500         break;
2501
2502     case NBOUNDU:
2503         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2504             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2505             break;
2506         }
2507
2508       do_nboundu:
2509
2510         to_complement = 1;
2511         /* FALLTHROUGH */
2512
2513     case BOUNDU:
2514       do_boundu:
2515         switch((bound_type) FLAGS(c)) {
2516             case TRADITIONAL_BOUND:
2517                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2518                 break;
2519             case GCB_BOUND:
2520                 if (s == reginfo->strbeg) {
2521                     if (reginfo->intuit || regtry(reginfo, &s))
2522                     {
2523                         goto got_it;
2524                     }
2525
2526                     /* Didn't match.  Try at the next position (if there is one) */
2527                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2528                     if (UNLIKELY(s >= reginfo->strend)) {
2529                         break;
2530                     }
2531                 }
2532
2533                 if (utf8_target) {
2534                     GCB_enum before = getGCB_VAL_UTF8(
2535                                                reghop3((U8*)s, -1,
2536                                                        (U8*)(reginfo->strbeg)),
2537                                                (U8*) reginfo->strend);
2538                     while (s < strend) {
2539                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2540                                                         (U8*) reginfo->strend);
2541                         if (   (to_complement ^ isGCB(before,
2542                                                       after,
2543                                                       (U8*) reginfo->strbeg,
2544                                                       (U8*) s,
2545                                                       utf8_target))
2546                             && (reginfo->intuit || regtry(reginfo, &s)))
2547                         {
2548                             goto got_it;
2549                         }
2550                         before = after;
2551                         s += UTF8SKIP(s);
2552                     }
2553                 }
2554                 else {  /* Not utf8.  Everything is a GCB except between CR and
2555                            LF */
2556                     while (s < strend) {
2557                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2558                                               || UCHARAT(s) != '\n'))
2559                             && (reginfo->intuit || regtry(reginfo, &s)))
2560                         {
2561                             goto got_it;
2562                         }
2563                         s++;
2564                     }
2565                 }
2566
2567                 /* And, since this is a bound, it can match after the final
2568                  * character in the string */
2569                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2570                     goto got_it;
2571                 }
2572                 break;
2573
2574             case LB_BOUND:
2575                 if (s == reginfo->strbeg) {
2576                     if (reginfo->intuit || regtry(reginfo, &s)) {
2577                         goto got_it;
2578                     }
2579                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2580                     if (UNLIKELY(s >= reginfo->strend)) {
2581                         break;
2582                     }
2583                 }
2584
2585                 if (utf8_target) {
2586                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2587                                                                -1,
2588                                                                (U8*)(reginfo->strbeg)),
2589                                                        (U8*) reginfo->strend);
2590                     while (s < strend) {
2591                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2592                         if (to_complement ^ isLB(before,
2593                                                  after,
2594                                                  (U8*) reginfo->strbeg,
2595                                                  (U8*) s,
2596                                                  (U8*) reginfo->strend,
2597                                                  utf8_target)
2598                             && (reginfo->intuit || regtry(reginfo, &s)))
2599                         {
2600                             goto got_it;
2601                         }
2602                         before = after;
2603                         s += UTF8SKIP(s);
2604                     }
2605                 }
2606                 else {  /* Not utf8. */
2607                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2608                     while (s < strend) {
2609                         LB_enum after = getLB_VAL_CP((U8) *s);
2610                         if (to_complement ^ isLB(before,
2611                                                  after,
2612                                                  (U8*) reginfo->strbeg,
2613                                                  (U8*) s,
2614                                                  (U8*) reginfo->strend,
2615                                                  utf8_target)
2616                             && (reginfo->intuit || regtry(reginfo, &s)))
2617                         {
2618                             goto got_it;
2619                         }
2620                         before = after;
2621                         s++;
2622                     }
2623                 }
2624
2625                 if (reginfo->intuit || regtry(reginfo, &s)) {
2626                     goto got_it;
2627                 }
2628
2629                 break;
2630
2631             case SB_BOUND:
2632                 if (s == reginfo->strbeg) {
2633                     if (reginfo->intuit || regtry(reginfo, &s)) {
2634                         goto got_it;
2635                     }
2636                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2637                     if (UNLIKELY(s >= reginfo->strend)) {
2638                         break;
2639                     }
2640                 }
2641
2642                 if (utf8_target) {
2643                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2644                                                         -1,
2645                                                         (U8*)(reginfo->strbeg)),
2646                                                       (U8*) reginfo->strend);
2647                     while (s < strend) {
2648                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2649                                                          (U8*) reginfo->strend);
2650                         if ((to_complement ^ isSB(before,
2651                                                   after,
2652                                                   (U8*) reginfo->strbeg,
2653                                                   (U8*) s,
2654                                                   (U8*) reginfo->strend,
2655                                                   utf8_target))
2656                             && (reginfo->intuit || regtry(reginfo, &s)))
2657                         {
2658                             goto got_it;
2659                         }
2660                         before = after;
2661                         s += UTF8SKIP(s);
2662                     }
2663                 }
2664                 else {  /* Not utf8. */
2665                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2666                     while (s < strend) {
2667                         SB_enum after = getSB_VAL_CP((U8) *s);
2668                         if ((to_complement ^ isSB(before,
2669                                                   after,
2670                                                   (U8*) reginfo->strbeg,
2671                                                   (U8*) s,
2672                                                   (U8*) reginfo->strend,
2673                                                   utf8_target))
2674                             && (reginfo->intuit || regtry(reginfo, &s)))
2675                         {
2676                             goto got_it;
2677                         }
2678                         before = after;
2679                         s++;
2680                     }
2681                 }
2682
2683                 /* Here are at the final position in the target string.  The SB
2684                  * value is always true here, so matches, depending on other
2685                  * constraints */
2686                 if (reginfo->intuit || regtry(reginfo, &s)) {
2687                     goto got_it;
2688                 }
2689
2690                 break;
2691
2692             case WB_BOUND:
2693                 if (s == reginfo->strbeg) {
2694                     if (reginfo->intuit || regtry(reginfo, &s)) {
2695                         goto got_it;
2696                     }
2697                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2698                     if (UNLIKELY(s >= reginfo->strend)) {
2699                         break;
2700                     }
2701                 }
2702
2703                 if (utf8_target) {
2704                     /* We are at a boundary between char_sub_0 and char_sub_1.
2705                      * We also keep track of the value for char_sub_-1 as we
2706                      * loop through the line.   Context may be needed to make a
2707                      * determination, and if so, this can save having to
2708                      * recalculate it */
2709                     WB_enum previous = WB_UNKNOWN;
2710                     WB_enum before = getWB_VAL_UTF8(
2711                                               reghop3((U8*)s,
2712                                                       -1,
2713                                                       (U8*)(reginfo->strbeg)),
2714                                               (U8*) reginfo->strend);
2715                     while (s < strend) {
2716                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2717                                                         (U8*) reginfo->strend);
2718                         if ((to_complement ^ isWB(previous,
2719                                                   before,
2720                                                   after,
2721                                                   (U8*) reginfo->strbeg,
2722                                                   (U8*) s,
2723                                                   (U8*) reginfo->strend,
2724                                                   utf8_target))
2725                             && (reginfo->intuit || regtry(reginfo, &s)))
2726                         {
2727                             goto got_it;
2728                         }
2729                         previous = before;
2730                         before = after;
2731                         s += UTF8SKIP(s);
2732                     }
2733                 }
2734                 else {  /* Not utf8. */
2735                     WB_enum previous = WB_UNKNOWN;
2736                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2737                     while (s < strend) {
2738                         WB_enum after = getWB_VAL_CP((U8) *s);
2739                         if ((to_complement ^ isWB(previous,
2740                                                   before,
2741                                                   after,
2742                                                   (U8*) reginfo->strbeg,
2743                                                   (U8*) s,
2744                                                   (U8*) reginfo->strend,
2745                                                   utf8_target))
2746                             && (reginfo->intuit || regtry(reginfo, &s)))
2747                         {
2748                             goto got_it;
2749                         }
2750                         previous = before;
2751                         before = after;
2752                         s++;
2753                     }
2754                 }
2755
2756                 if (reginfo->intuit || regtry(reginfo, &s)) {
2757                     goto got_it;
2758                 }
2759         }
2760         break;
2761
2762     case LNBREAK:
2763         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2764                         is_LNBREAK_latin1_safe(s, strend)
2765         );
2766         break;
2767
2768     case ASCII:
2769         REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2770         break;
2771
2772     case NASCII:
2773         if (utf8_target) {
2774             REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2775                                                             utf8_target));
2776         }
2777         else {
2778             REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2779                                                             utf8_target));
2780         }
2781
2782         break;
2783
2784     /* The argument to all the POSIX node types is the class number to pass to
2785      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2786
2787     case NPOSIXL:
2788         to_complement = 1;
2789         /* FALLTHROUGH */
2790
2791     case POSIXL:
2792         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2793         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2794                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2795         break;
2796
2797     case NPOSIXD:
2798         to_complement = 1;
2799         /* FALLTHROUGH */
2800
2801     case POSIXD:
2802         if (utf8_target) {
2803             goto posix_utf8;
2804         }
2805         goto posixa;
2806
2807     case NPOSIXA:
2808         if (utf8_target) {
2809             /* The complement of something that matches only ASCII matches all
2810              * non-ASCII, plus everything in ASCII that isn't in the class. */
2811             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2812                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2813             break;
2814         }
2815
2816         to_complement = 1;
2817         goto posixa;
2818
2819     case POSIXA:
2820         /* Don't need to worry about utf8, as it can match only a single
2821          * byte invariant character.  But we do anyway for performance reasons,
2822          * as otherwise we would have to examine all the continuation
2823          * characters */
2824         if (utf8_target) {
2825             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2826             break;
2827         }
2828
2829       posixa:
2830         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2831                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2832         break;
2833
2834     case NPOSIXU:
2835         to_complement = 1;
2836         /* FALLTHROUGH */
2837
2838     case POSIXU:
2839         if (! utf8_target) {
2840             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2841                                  to_complement ^ cBOOL(_generic_isCC(*s,
2842                                                                     FLAGS(c))));
2843         }
2844         else {
2845
2846           posix_utf8:
2847             classnum = (_char_class_number) FLAGS(c);
2848             switch (classnum) {
2849                 default:
2850                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2851                         to_complement ^ cBOOL(_invlist_contains_cp(
2852                                               PL_XPosix_ptrs[classnum],
2853                                               utf8_to_uvchr_buf((U8 *) s,
2854                                                                 (U8 *) strend,
2855                                                                 NULL))));
2856                     break;
2857                 case _CC_ENUM_SPACE:
2858                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2859                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2860                     break;
2861
2862                 case _CC_ENUM_BLANK:
2863                     REXEC_FBC_CLASS_SCAN(1,
2864                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2865                     break;
2866
2867                 case _CC_ENUM_XDIGIT:
2868                     REXEC_FBC_CLASS_SCAN(1,
2869                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2870                     break;
2871
2872                 case _CC_ENUM_VERTSPACE:
2873                     REXEC_FBC_CLASS_SCAN(1,
2874                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2875                     break;
2876
2877                 case _CC_ENUM_CNTRL:
2878                     REXEC_FBC_CLASS_SCAN(1,
2879                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2880                     break;
2881             }
2882         }
2883         break;
2884
2885     case AHOCORASICKC:
2886     case AHOCORASICK:
2887         {
2888             DECL_TRIE_TYPE(c);
2889             /* what trie are we using right now */
2890             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2891             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2892             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2893
2894             const char *last_start = strend - trie->minlen;
2895 #ifdef DEBUGGING
2896             const char *real_start = s;
2897 #endif
2898             STRLEN maxlen = trie->maxlen;
2899             SV *sv_points;
2900             U8 **points; /* map of where we were in the input string
2901                             when reading a given char. For ASCII this
2902                             is unnecessary overhead as the relationship
2903                             is always 1:1, but for Unicode, especially
2904                             case folded Unicode this is not true. */
2905             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2906             U8 *bitmap=NULL;
2907
2908
2909             GET_RE_DEBUG_FLAGS_DECL;
2910
2911             /* We can't just allocate points here. We need to wrap it in
2912              * an SV so it gets freed properly if there is a croak while
2913              * running the match */
2914             ENTER;
2915             SAVETMPS;
2916             sv_points=newSV(maxlen * sizeof(U8 *));
2917             SvCUR_set(sv_points,
2918                 maxlen * sizeof(U8 *));
2919             SvPOK_on(sv_points);
2920             sv_2mortal(sv_points);
2921             points=(U8**)SvPV_nolen(sv_points );
2922             if ( trie_type != trie_utf8_fold
2923                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2924             {
2925                 if (trie->bitmap)
2926                     bitmap=(U8*)trie->bitmap;
2927                 else
2928                     bitmap=(U8*)ANYOF_BITMAP(c);
2929             }
2930             /* this is the Aho-Corasick algorithm modified a touch
2931                to include special handling for long "unknown char" sequences.
2932                The basic idea being that we use AC as long as we are dealing
2933                with a possible matching char, when we encounter an unknown char
2934                (and we have not encountered an accepting state) we scan forward
2935                until we find a legal starting char.
2936                AC matching is basically that of trie matching, except that when
2937                we encounter a failing transition, we fall back to the current
2938                states "fail state", and try the current char again, a process
2939                we repeat until we reach the root state, state 1, or a legal
2940                transition. If we fail on the root state then we can either
2941                terminate if we have reached an accepting state previously, or
2942                restart the entire process from the beginning if we have not.
2943
2944              */
2945             while (s <= last_start) {
2946                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2947                 U8 *uc = (U8*)s;
2948                 U16 charid = 0;
2949                 U32 base = 1;
2950                 U32 state = 1;
2951                 UV uvc = 0;
2952                 STRLEN len = 0;
2953                 STRLEN foldlen = 0;
2954                 U8 *uscan = (U8*)NULL;
2955                 U8 *leftmost = NULL;
2956 #ifdef DEBUGGING
2957                 U32 accepted_word= 0;
2958 #endif
2959                 U32 pointpos = 0;
2960
2961                 while ( state && uc <= (U8*)strend ) {
2962                     int failed=0;
2963                     U32 word = aho->states[ state ].wordnum;
2964
2965                     if( state==1 ) {
2966                         if ( bitmap ) {
2967                             DEBUG_TRIE_EXECUTE_r(
2968                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2969                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2970                                         (char *)uc, utf8_target, 0 );
2971                                     Perl_re_printf( aTHX_
2972                                         " Scanning for legal start char...\n");
2973                                 }
2974                             );
2975                             if (utf8_target) {
2976                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2977                                     uc += UTF8SKIP(uc);
2978                                 }
2979                             } else {
2980                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2981                                     uc++;
2982                                 }
2983                             }
2984                             s= (char *)uc;
2985                         }
2986                         if (uc >(U8*)last_start) break;
2987                     }
2988
2989                     if ( word ) {
2990                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2991                         if (!leftmost || lpos < leftmost) {
2992                             DEBUG_r(accepted_word=word);
2993                             leftmost= lpos;
2994                         }
2995                         if (base==0) break;
2996
2997                     }
2998                     points[pointpos++ % maxlen]= uc;
2999                     if (foldlen || uc < (U8*)strend) {
3000                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3001                                              (U8 *) strend, uscan, len, uvc,
3002                                              charid, foldlen, foldbuf,
3003                                              uniflags);
3004                         DEBUG_TRIE_EXECUTE_r({
3005                             dump_exec_pos( (char *)uc, c, strend,
3006                                         real_start, s, utf8_target, 0);
3007                             Perl_re_printf( aTHX_
3008                                 " Charid:%3u CP:%4" UVxf " ",
3009                                  charid, uvc);
3010                         });
3011                     }
3012                     else {
3013                         len = 0;
3014                         charid = 0;
3015                     }
3016
3017
3018                     do {
3019 #ifdef DEBUGGING
3020                         word = aho->states[ state ].wordnum;
3021 #endif
3022                         base = aho->states[ state ].trans.base;
3023
3024                         DEBUG_TRIE_EXECUTE_r({
3025                             if (failed)
3026                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3027                                     s,   utf8_target, 0 );
3028                             Perl_re_printf( aTHX_
3029                                 "%sState: %4" UVxf ", word=%" UVxf,
3030                                 failed ? " Fail transition to " : "",
3031                                 (UV)state, (UV)word);
3032                         });
3033                         if ( base ) {
3034                             U32 tmp;
3035                             I32 offset;
3036                             if (charid &&
3037                                  ( ((offset = base + charid
3038                                     - 1 - trie->uniquecharcount)) >= 0)
3039                                  && ((U32)offset < trie->lasttrans)
3040                                  && trie->trans[offset].check == state
3041                                  && (tmp=trie->trans[offset].next))
3042                             {
3043                                 DEBUG_TRIE_EXECUTE_r(
3044                                     Perl_re_printf( aTHX_ " - legal\n"));
3045                                 state = tmp;
3046                                 break;
3047                             }
3048                             else {
3049                                 DEBUG_TRIE_EXECUTE_r(
3050                                     Perl_re_printf( aTHX_ " - fail\n"));
3051                                 failed = 1;
3052                                 state = aho->fail[state];
3053                             }
3054                         }
3055                         else {
3056                             /* we must be accepting here */
3057                             DEBUG_TRIE_EXECUTE_r(
3058                                     Perl_re_printf( aTHX_ " - accepting\n"));
3059                             failed = 1;
3060                             break;
3061                         }
3062                     } while(state);
3063                     uc += len;
3064                     if (failed) {
3065                         if (leftmost)
3066                             break;
3067                         if (!state) state = 1;
3068                     }
3069                 }
3070                 if ( aho->states[ state ].wordnum ) {
3071                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3072                     if (!leftmost || lpos < leftmost) {
3073                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3074                         leftmost = lpos;
3075                     }
3076                 }
3077                 if (leftmost) {
3078                     s = (char*)leftmost;
3079                     DEBUG_TRIE_EXECUTE_r({
3080                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3081                             (UV)accepted_word, (IV)(s - real_start)
3082                         );
3083                     });
3084                     if (reginfo->intuit || regtry(reginfo, &s)) {
3085                         FREETMPS;
3086                         LEAVE;
3087                         goto got_it;
3088                     }
3089                     s = HOPc(s,1);
3090                     DEBUG_TRIE_EXECUTE_r({
3091                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3092                     });
3093                 } else {
3094                     DEBUG_TRIE_EXECUTE_r(
3095                         Perl_re_printf( aTHX_ "No match.\n"));
3096                     break;
3097                 }
3098             }
3099             FREETMPS;
3100             LEAVE;
3101         }
3102         break;
3103     default:
3104         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3105     }
3106     return 0;
3107   got_it:
3108     return s;
3109 }
3110
3111 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3112  * flags have same meanings as with regexec_flags() */
3113
3114 static void
3115 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3116                             char *strbeg,
3117                             char *strend,
3118                             SV *sv,
3119                             U32 flags,
3120                             bool utf8_target)
3121 {
3122     struct regexp *const prog = ReANY(rx);
3123
3124     if (flags & REXEC_COPY_STR) {
3125 #ifdef PERL_ANY_COW
3126         if (SvCANCOW(sv)) {
3127             DEBUG_C(Perl_re_printf( aTHX_
3128                               "Copy on write: regexp capture, type %d\n",
3129                                     (int) SvTYPE(sv)));
3130             /* Create a new COW SV to share the match string and store
3131              * in saved_copy, unless the current COW SV in saved_copy
3132              * is valid and suitable for our purpose */
3133             if ((   prog->saved_copy
3134                  && SvIsCOW(prog->saved_copy)
3135                  && SvPOKp(prog->saved_copy)
3136                  && SvIsCOW(sv)
3137                  && SvPOKp(sv)
3138                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3139             {
3140                 /* just reuse saved_copy SV */
3141                 if (RXp_MATCH_COPIED(prog)) {
3142                     Safefree(prog->subbeg);
3143                     RXp_MATCH_COPIED_off(prog);
3144                 }
3145             }
3146             else {
3147                 /* create new COW SV to share string */
3148                 RXp_MATCH_COPY_FREE(prog);
3149                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3150             }
3151             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3152             assert (SvPOKp(prog->saved_copy));
3153             prog->sublen  = strend - strbeg;
3154             prog->suboffset = 0;
3155             prog->subcoffset = 0;
3156         } else
3157 #endif
3158         {
3159             SSize_t min = 0;
3160             SSize_t max = strend - strbeg;
3161             SSize_t sublen;
3162
3163             if (    (flags & REXEC_COPY_SKIP_POST)
3164                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3165                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3166             ) { /* don't copy $' part of string */
3167                 U32 n = 0;
3168                 max = -1;
3169                 /* calculate the right-most part of the string covered
3170                  * by a capture. Due to lookahead, this may be to
3171                  * the right of $&, so we have to scan all captures */
3172                 while (n <= prog->lastparen) {
3173                     if (prog->offs[n].end > max)
3174                         max = prog->offs[n].end;
3175                     n++;
3176                 }
3177                 if (max == -1)
3178                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3179                             ? prog->offs[0].start
3180                             : 0;
3181                 assert(max >= 0 && max <= strend - strbeg);
3182             }
3183
3184             if (    (flags & REXEC_COPY_SKIP_PRE)
3185                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3186                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3187             ) { /* don't copy $` part of string */
3188                 U32 n = 0;
3189                 min = max;
3190                 /* calculate the left-most part of the string covered
3191                  * by a capture. Due to lookbehind, this may be to
3192                  * the left of $&, so we have to scan all captures */
3193                 while (min && n <= prog->lastparen) {
3194                     if (   prog->offs[n].start != -1
3195                         && prog->offs[n].start < min)
3196                     {
3197                         min = prog->offs[n].start;
3198                     }
3199                     n++;
3200                 }
3201                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3202                     && min >  prog->offs[0].end
3203                 )
3204                     min = prog->offs[0].end;
3205
3206             }
3207
3208             assert(min >= 0 && min <= max && min <= strend - strbeg);
3209             sublen = max - min;
3210
3211             if (RXp_MATCH_COPIED(prog)) {
3212                 if (sublen > prog->sublen)
3213                     prog->subbeg =
3214                             (char*)saferealloc(prog->subbeg, sublen+1);
3215             }
3216             else
3217                 prog->subbeg = (char*)safemalloc(sublen+1);
3218             Copy(strbeg + min, prog->subbeg, sublen, char);
3219             prog->subbeg[sublen] = '\0';
3220             prog->suboffset = min;
3221             prog->sublen = sublen;
3222             RXp_MATCH_COPIED_on(prog);
3223         }
3224         prog->subcoffset = prog->suboffset;
3225         if (prog->suboffset && utf8_target) {
3226             /* Convert byte offset to chars.
3227              * XXX ideally should only compute this if @-/@+
3228              * has been seen, a la PL_sawampersand ??? */
3229
3230             /* If there's a direct correspondence between the
3231              * string which we're matching and the original SV,
3232              * then we can use the utf8 len cache associated with
3233              * the SV. In particular, it means that under //g,
3234              * sv_pos_b2u() will use the previously cached
3235              * position to speed up working out the new length of
3236              * subcoffset, rather than counting from the start of
3237              * the string each time. This stops
3238              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3239              * from going quadratic */
3240             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3241                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3242                                                 SV_GMAGIC|SV_CONST_RETURN);
3243             else
3244                 prog->subcoffset = utf8_length((U8*)strbeg,
3245                                     (U8*)(strbeg+prog->suboffset));
3246         }
3247     }
3248     else {
3249         RXp_MATCH_COPY_FREE(prog);
3250         prog->subbeg = strbeg;
3251         prog->suboffset = 0;
3252         prog->subcoffset = 0;
3253         prog->sublen = strend - strbeg;
3254     }
3255 }
3256
3257
3258
3259
3260 /*
3261  - regexec_flags - match a regexp against a string
3262  */
3263 I32
3264 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3265               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3266 /* stringarg: the point in the string at which to begin matching */
3267 /* strend:    pointer to null at end of string */
3268 /* strbeg:    real beginning of string */
3269 /* minend:    end of match must be >= minend bytes after stringarg. */
3270 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3271  *            itself is accessed via the pointers above */
3272 /* data:      May be used for some additional optimizations.
3273               Currently unused. */
3274 /* flags:     For optimizations. See REXEC_* in regexp.h */
3275
3276 {
3277     struct regexp *const prog = ReANY(rx);
3278     char *s;
3279     regnode *c;
3280     char *startpos;
3281     SSize_t minlen;             /* must match at least this many chars */
3282     SSize_t dontbother = 0;     /* how many characters not to try at end */
3283     const bool utf8_target = cBOOL(DO_UTF8(sv));
3284     I32 multiline;
3285     RXi_GET_DECL(prog,progi);
3286     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3287     regmatch_info *const reginfo = &reginfo_buf;
3288     regexp_paren_pair *swap = NULL;
3289     I32 oldsave;
3290     GET_RE_DEBUG_FLAGS_DECL;
3291
3292     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3293     PERL_UNUSED_ARG(data);
3294
3295     /* Be paranoid... */
3296     if (prog == NULL) {
3297         Perl_croak(aTHX_ "NULL regexp parameter");
3298     }
3299
3300     DEBUG_EXECUTE_r(
3301         debug_start_match(rx, utf8_target, stringarg, strend,
3302         "Matching");
3303     );
3304
3305     startpos = stringarg;
3306
3307     /* set these early as they may be used by the HOP macros below */
3308     reginfo->strbeg = strbeg;
3309     reginfo->strend = strend;
3310     reginfo->is_utf8_target = cBOOL(utf8_target);
3311
3312     if (prog->intflags & PREGf_GPOS_SEEN) {
3313         MAGIC *mg;
3314
3315         /* set reginfo->ganch, the position where \G can match */
3316
3317         reginfo->ganch =
3318             (flags & REXEC_IGNOREPOS)
3319             ? stringarg /* use start pos rather than pos() */
3320             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3321               /* Defined pos(): */
3322             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3323             : strbeg; /* pos() not defined; use start of string */
3324
3325         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3326             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3327
3328         /* in the presence of \G, we may need to start looking earlier in
3329          * the string than the suggested start point of stringarg:
3330          * if prog->gofs is set, then that's a known, fixed minimum
3331          * offset, such as
3332          * /..\G/:   gofs = 2
3333          * /ab|c\G/: gofs = 1
3334          * or if the minimum offset isn't known, then we have to go back
3335          * to the start of the string, e.g. /w+\G/
3336          */
3337
3338         if (prog->intflags & PREGf_ANCH_GPOS) {
3339             if (prog->gofs) {
3340                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3341                 if (!startpos ||
3342                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3343                 {
3344                     DEBUG_r(Perl_re_printf( aTHX_
3345                             "fail: ganch-gofs before earliest possible start\n"));
3346                     return 0;
3347                 }
3348             }
3349             else
3350                 startpos = reginfo->ganch;
3351         }
3352         else if (prog->gofs) {
3353             startpos = HOPBACKc(startpos, prog->gofs);
3354             if (!startpos)
3355                 startpos = strbeg;
3356         }
3357         else if (prog->intflags & PREGf_GPOS_FLOAT)
3358             startpos = strbeg;
3359     }
3360
3361     minlen = prog->minlen;
3362     if ((startpos + minlen) > strend || startpos < strbeg) {
3363         DEBUG_r(Perl_re_printf( aTHX_
3364                     "Regex match can't succeed, so not even tried\n"));
3365         return 0;
3366     }
3367
3368     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3369      * which will call destuctors to reset PL_regmatch_state, free higher
3370      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3371      * regmatch_info_aux_eval */
3372
3373     oldsave = PL_savestack_ix;
3374
3375     s = startpos;
3376
3377     if ((prog->extflags & RXf_USE_INTUIT)
3378         && !(flags & REXEC_CHECKED))
3379     {
3380         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3381                                     flags, NULL);
3382         if (!s)
3383             return 0;
3384
3385         if (prog->extflags & RXf_CHECK_ALL) {
3386             /* we can match based purely on the result of INTUIT.
3387              * Set up captures etc just for $& and $-[0]
3388              * (an intuit-only match wont have $1,$2,..) */
3389             assert(!prog->nparens);
3390
3391             /* s/// doesn't like it if $& is earlier than where we asked it to
3392              * start searching (which can happen on something like /.\G/) */
3393             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3394                     && (s < stringarg))
3395             {
3396                 /* this should only be possible under \G */
3397                 assert(prog->intflags & PREGf_GPOS_SEEN);
3398                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3399                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3400                 goto phooey;
3401             }
3402
3403             /* match via INTUIT shouldn't have any captures.
3404              * Let @-, @+, $^N know */
3405             prog->lastparen = prog->lastcloseparen = 0;
3406             RXp_MATCH_UTF8_set(prog, utf8_target);
3407             prog->offs[0].start = s - strbeg;
3408             prog->offs[0].end = utf8_target
3409                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3410                 : s - strbeg + prog->minlenret;
3411             if ( !(flags & REXEC_NOT_FIRST) )
3412                 S_reg_set_capture_string(aTHX_ rx,
3413                                         strbeg, strend,
3414                                         sv, flags, utf8_target);
3415
3416             return 1;
3417         }
3418     }
3419
3420     multiline = prog->extflags & RXf_PMf_MULTILINE;
3421
3422     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3423         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3424                               "String too short [regexec_flags]...\n"));
3425         goto phooey;
3426     }
3427
3428     /* Check validity of program. */
3429     if (UCHARAT(progi->program) != REG_MAGIC) {
3430         Perl_croak(aTHX_ "corrupted regexp program");
3431     }
3432
3433     RXp_MATCH_TAINTED_off(prog);
3434     RXp_MATCH_UTF8_set(prog, utf8_target);
3435
3436     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3437     reginfo->intuit = 0;
3438     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3439     reginfo->warned = FALSE;
3440     reginfo->sv = sv;
3441     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3442     /* see how far we have to get to not match where we matched before */
3443     reginfo->till = stringarg + minend;
3444
3445     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3446         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3447            S_cleanup_regmatch_info_aux has executed (registered by
3448            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3449            magic belonging to this SV.
3450            Not newSVsv, either, as it does not COW.
3451         */
3452         reginfo->sv = newSV(0);
3453         SvSetSV_nosteal(reginfo->sv, sv);
3454         SAVEFREESV(reginfo->sv);
3455     }
3456
3457     /* reserve next 2 or 3 slots in PL_regmatch_state:
3458      * slot N+0: may currently be in use: skip it
3459      * slot N+1: use for regmatch_info_aux struct
3460      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3461      * slot N+3: ready for use by regmatch()
3462      */
3463
3464     {
3465         regmatch_state *old_regmatch_state;
3466         regmatch_slab  *old_regmatch_slab;
3467         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3468
3469         /* on first ever match, allocate first slab */
3470         if (!PL_regmatch_slab) {
3471             Newx(PL_regmatch_slab, 1, regmatch_slab);
3472             PL_regmatch_slab->prev = NULL;
3473             PL_regmatch_slab->next = NULL;
3474             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3475         }
3476
3477         old_regmatch_state = PL_regmatch_state;
3478         old_regmatch_slab  = PL_regmatch_slab;
3479
3480         for (i=0; i <= max; i++) {
3481             if (i == 1)
3482                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3483             else if (i ==2)
3484                 reginfo->info_aux_eval =
3485                 reginfo->info_aux->info_aux_eval =
3486                             &(PL_regmatch_state->u.info_aux_eval);
3487
3488             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3489                 PL_regmatch_state = S_push_slab(aTHX);
3490         }
3491
3492         /* note initial PL_regmatch_state position; at end of match we'll
3493          * pop back to there and free any higher slabs */
3494
3495         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3496         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3497         reginfo->info_aux->poscache = NULL;
3498
3499         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3500
3501         if ((prog->extflags & RXf_EVAL_SEEN))
3502             S_setup_eval_state(aTHX_ reginfo);
3503         else
3504             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3505     }
3506
3507     /* If there is a "must appear" string, look for it. */
3508
3509     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3510         /* We have to be careful. If the previous successful match
3511            was from this regex we don't want a subsequent partially
3512            successful match to clobber the old results.
3513            So when we detect this possibility we add a swap buffer
3514            to the re, and switch the buffer each match. If we fail,
3515            we switch it back; otherwise we leave it swapped.
3516         */
3517         swap = prog->offs;
3518         /* do we need a save destructor here for eval dies? */
3519         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3520         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3521             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3522             0,
3523             PTR2UV(prog),
3524             PTR2UV(swap),
3525             PTR2UV(prog->offs)
3526         ));
3527     }
3528
3529     if (prog->recurse_locinput)
3530         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3531
3532     /* Simplest case: anchored match need be tried only once, or with
3533      * MBOL, only at the beginning of each line.
3534      *
3535      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3536      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3537      * match at the start of the string then it won't match anywhere else
3538      * either; while with /.*.../, if it doesn't match at the beginning,
3539      * the earliest it could match is at the start of the next line */
3540
3541     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542         char *end;
3543
3544         if (regtry(reginfo, &s))
3545             goto got_it;
3546
3547         if (!(prog->intflags & PREGf_ANCH_MBOL))
3548             goto phooey;
3549
3550         /* didn't match at start, try at other newline positions */
3551
3552         if (minlen)
3553             dontbother = minlen - 1;
3554         end = HOP3c(strend, -dontbother, strbeg) - 1;
3555
3556         /* skip to next newline */
3557
3558         while (s <= end) { /* note it could be possible to match at the end of the string */
3559             /* NB: newlines are the same in unicode as they are in latin */
3560             if (*s++ != '\n')
3561                 continue;
3562             if (prog->check_substr || prog->check_utf8) {
3563             /* note that with PREGf_IMPLICIT, intuit can only fail
3564              * or return the start position, so it's of limited utility.
3565              * Nevertheless, I made the decision that the potential for
3566              * quick fail was still worth it - DAPM */
3567                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3568                 if (!s)
3569                     goto phooey;
3570             }
3571             if (regtry(reginfo, &s))
3572                 goto got_it;
3573         }
3574         goto phooey;
3575     } /* end anchored search */
3576
3577     if (prog->intflags & PREGf_ANCH_GPOS)
3578     {
3579         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3580         assert(prog->intflags & PREGf_GPOS_SEEN);
3581         /* For anchored \G, the only position it can match from is
3582          * (ganch-gofs); we already set startpos to this above; if intuit
3583          * moved us on from there, we can't possibly succeed */
3584         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3585         if (s == startpos && regtry(reginfo, &s))
3586             goto got_it;
3587         goto phooey;
3588     }
3589
3590     /* Messy cases:  unanchored match. */
3591     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3592         /* we have /x+whatever/ */
3593         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3594         char ch;
3595 #ifdef DEBUGGING
3596         int did_match = 0;
3597 #endif
3598         if (utf8_target) {
3599             if (! prog->anchored_utf8) {
3600                 to_utf8_substr(prog);
3601             }
3602             ch = SvPVX_const(prog->anchored_utf8)[0];
3603             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3604                 if (*s == ch) {
3605                     DEBUG_EXECUTE_r( did_match = 1 );
3606                     if (regtry(reginfo, &s)) goto got_it;
3607                     s += UTF8SKIP(s);
3608                     while (s < strend && *s == ch)
3609                         s += UTF8SKIP(s);
3610                 }
3611             );
3612
3613         }
3614         else {
3615             if (! prog->anchored_substr) {
3616                 if (! to_byte_substr(prog)) {
3617                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3618                 }
3619             }
3620             ch = SvPVX_const(prog->anchored_substr)[0];
3621             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3622                 if (*s == ch) {
3623                     DEBUG_EXECUTE_r( did_match = 1 );
3624                     if (regtry(reginfo, &s)) goto got_it;
3625                     s++;
3626                     while (s < strend && *s == ch)
3627                         s++;
3628                 }
3629             );
3630         }
3631         DEBUG_EXECUTE_r(if (!did_match)
3632                 Perl_re_printf( aTHX_
3633                                   "Did not find anchored character...\n")
3634                );
3635     }
3636     else if (prog->anchored_substr != NULL
3637               || prog->anchored_utf8 != NULL
3638               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3639                   && prog->float_max_offset < strend - s)) {
3640         SV *must;
3641         SSize_t back_max;
3642         SSize_t back_min;
3643         char *last;
3644         char *last1;            /* Last position checked before */
3645 #ifdef DEBUGGING
3646         int did_match = 0;
3647 #endif
3648         if (prog->anchored_substr || prog->anchored_utf8) {
3649             if (utf8_target) {
3650                 if (! prog->anchored_utf8) {
3651                     to_utf8_substr(prog);
3652                 }
3653                 must = prog->anchored_utf8;
3654             }
3655             else {
3656                 if (! prog->anchored_substr) {
3657                     if (! to_byte_substr(prog)) {
3658                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3659                     }
3660                 }
3661                 must = prog->anchored_substr;
3662             }
3663             back_max = back_min = prog->anchored_offset;
3664         } else {
3665             if (utf8_target) {
3666                 if (! prog->float_utf8) {
3667                     to_utf8_substr(prog);
3668                 }
3669                 must = prog->float_utf8;
3670             }
3671             else {
3672                 if (! prog->float_substr) {
3673                     if (! to_byte_substr(prog)) {
3674                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3675                     }
3676                 }
3677                 must = prog->float_substr;
3678             }
3679             back_max = prog->float_max_offset;
3680             back_min = prog->float_min_offset;
3681         }
3682
3683         if (back_min<0) {
3684             last = strend;
3685         } else {
3686             last = HOP3c(strend,        /* Cannot start after this */
3687                   -(SSize_t)(CHR_SVLEN(must)
3688                          - (SvTAIL(must) != 0) + back_min), strbeg);
3689         }
3690         if (s > reginfo->strbeg)
3691             last1 = HOPc(s, -1);
3692         else
3693             last1 = s - 1;      /* bogus */
3694
3695         /* XXXX check_substr already used to find "s", can optimize if
3696            check_substr==must. */
3697         dontbother = 0;
3698         strend = HOPc(strend, -dontbother);
3699         while ( (s <= last) &&
3700                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3701                                   (unsigned char*)strend, must,
3702                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3703             DEBUG_EXECUTE_r( did_match = 1 );
3704             if (HOPc(s, -back_max) > last1) {
3705                 last1 = HOPc(s, -back_min);
3706                 s = HOPc(s, -back_max);
3707             }
3708             else {
3709                 char * const t = (last1 >= reginfo->strbeg)
3710                                     ? HOPc(last1, 1) : last1 + 1;
3711
3712                 last1 = HOPc(s, -back_min);
3713                 s = t;
3714             }
3715             if (utf8_target) {
3716                 while (s <= last1) {
3717                     if (regtry(reginfo, &s))
3718                         goto got_it;
3719                     if (s >= last1) {
3720                         s++; /* to break out of outer loop */
3721                         break;
3722                     }
3723                     s += UTF8SKIP(s);
3724                 }
3725             }
3726             else {
3727                 while (s <= last1) {
3728                     if (regtry(reginfo, &s))
3729                         goto got_it;
3730                     s++;
3731                 }
3732             }
3733         }
3734         DEBUG_EXECUTE_r(if (!did_match) {
3735             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3736                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3737             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3738                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3739                                ? "anchored" : "floating"),
3740                 quoted, RE_SV_TAIL(must));
3741         });
3742         goto phooey;
3743     }
3744     else if ( (c = progi->regstclass) ) {
3745         if (minlen) {
3746             const OPCODE op = OP(progi->regstclass);
3747             /* don't bother with what can't match */
3748             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3749                 strend = HOPc(strend, -(minlen - 1));
3750         }
3751         DEBUG_EXECUTE_r({
3752             SV * const prop = sv_newmortal();
3753             regprop(prog, prop, c, reginfo, NULL);
3754             {
3755                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3756                     s,strend-s,PL_dump_re_max_len);
3757                 Perl_re_printf( aTHX_
3758                     "Matching stclass %.*s against %s (%d bytes)\n",
3759                     (int)SvCUR(prop), SvPVX_const(prop),
3760                      quoted, (int)(strend - s));
3761             }
3762         });
3763         if (find_byclass(prog, c, s, strend, reginfo))
3764             goto got_it;
3765         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3766     }
3767     else {
3768         dontbother = 0;
3769         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3770             /* Trim the end. */
3771             char *last= NULL;
3772             SV* float_real;
3773             STRLEN len;
3774             const char *little;
3775
3776             if (utf8_target) {
3777                 if (! prog->float_utf8) {
3778                     to_utf8_substr(prog);
3779                 }
3780                 float_real = prog->float_utf8;
3781             }
3782             else {
3783                 if (! prog->float_substr) {
3784                     if (! to_byte_substr(prog)) {
3785                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3786                     }
3787                 }
3788                 float_real = prog->float_substr;
3789             }
3790
3791             little = SvPV_const(float_real, len);
3792             if (SvTAIL(float_real)) {
3793                     /* This means that float_real contains an artificial \n on
3794                      * the end due to the presence of something like this:
3795                      * /foo$/ where we can match both "foo" and "foo\n" at the
3796                      * end of the string.  So we have to compare the end of the
3797                      * string first against the float_real without the \n and
3798                      * then against the full float_real with the string.  We
3799                      * have to watch out for cases where the string might be
3800                      * smaller than the float_real or the float_real without
3801                      * the \n. */
3802                     char *checkpos= strend - len;
3803                     DEBUG_OPTIMISE_r(
3804                         Perl_re_printf( aTHX_
3805                             "%sChecking for float_real.%s\n",
3806                             PL_colors[4], PL_colors[5]));
3807                     if (checkpos + 1 < strbeg) {
3808                         /* can't match, even if we remove the trailing \n
3809                          * string is too short to match */
3810                         DEBUG_EXECUTE_r(
3811                             Perl_re_printf( aTHX_
3812                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3813                                 PL_colors[4], PL_colors[5]));
3814                         goto phooey;
3815                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3816                         /* can match, the end of the string matches without the
3817                          * "\n" */
3818                         last = checkpos + 1;
3819                     } else if (checkpos < strbeg) {
3820                         /* cant match, string is too short when the "\n" is
3821                          * included */
3822                         DEBUG_EXECUTE_r(
3823                             Perl_re_printf( aTHX_
3824                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3825                                 PL_colors[4], PL_colors[5]));
3826                         goto phooey;
3827                     } else if (!multiline) {
3828                         /* non multiline match, so compare with the "\n" at the
3829                          * end of the string */
3830                         if (memEQ(checkpos, little, len)) {
3831                             last= checkpos;
3832                         } else {
3833                             DEBUG_EXECUTE_r(
3834                                 Perl_re_printf( aTHX_
3835                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3836                                     PL_colors[4], PL_colors[5]));
3837                             goto phooey;
3838                         }
3839                     } else {
3840                         /* multiline match, so we have to search for a place
3841                          * where the full string is located */
3842                         goto find_last;
3843                     }
3844             } else {
3845                   find_last:
3846                     if (len)
3847                         last = rninstr(s, strend, little, little + len);
3848                     else
3849                         last = strend;  /* matching "$" */
3850             }
3851             if (!last) {
3852                 /* at one point this block contained a comment which was
3853                  * probably incorrect, which said that this was a "should not
3854                  * happen" case.  Even if it was true when it was written I am
3855                  * pretty sure it is not anymore, so I have removed the comment
3856                  * and replaced it with this one. Yves */
3857                 DEBUG_EXECUTE_r(
3858                     Perl_re_printf( aTHX_
3859                         "%sString does not contain required substring, cannot match.%s\n",
3860                         PL_colors[4], PL_colors[5]
3861                     ));
3862                 goto phooey;
3863             }
3864             dontbother = strend - last + prog->float_min_offset;
3865         }
3866         if (minlen && (dontbother < minlen))
3867             dontbother = minlen - 1;
3868         strend -= dontbother;              /* this one's always in bytes! */
3869         /* We don't know much -- general case. */
3870         if (utf8_target) {
3871             for (;;) {
3872                 if (regtry(reginfo, &s))
3873                     goto got_it;
3874                 if (s >= strend)
3875                     break;
3876                 s += UTF8SKIP(s);
3877             };
3878         }
3879         else {
3880             do {
3881                 if (regtry(reginfo, &s))
3882                     goto got_it;
3883             } while (s++ < strend);
3884         }
3885     }
3886
3887     /* Failure. */
3888     goto phooey;
3889
3890   got_it:
3891     /* s/// doesn't like it if $& is earlier than where we asked it to
3892      * start searching (which can happen on something like /.\G/) */
3893     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3894             && (prog->offs[0].start < stringarg - strbeg))
3895     {
3896         /* this should only be possible under \G */
3897         assert(prog->intflags & PREGf_GPOS_SEEN);
3898         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3899             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3900         goto phooey;
3901     }
3902
3903     DEBUG_BUFFERS_r(
3904         if (swap)
3905             Perl_re_exec_indentf( aTHX_
3906                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3907                 0,
3908                 PTR2UV(prog),
3909                 PTR2UV(swap)
3910             );
3911     );
3912     Safefree(swap);
3913
3914     /* clean up; this will trigger destructors that will free all slabs
3915      * above the current one, and cleanup the regmatch_info_aux
3916      * and regmatch_info_aux_eval sructs */
3917
3918     LEAVE_SCOPE(oldsave);
3919
3920     if (RXp_PAREN_NAMES(prog))
3921         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3922
3923     /* make sure $`, $&, $', and $digit will work later */
3924     if ( !(flags & REXEC_NOT_FIRST) )
3925         S_reg_set_capture_string(aTHX_ rx,
3926                                     strbeg, reginfo->strend,
3927                                     sv, flags, utf8_target);
3928
3929     return 1;
3930
3931   phooey:
3932     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3933                           PL_colors[4], PL_colors[5]));
3934
3935     /* clean up; this will trigger destructors that will free all slabs
3936      * above the current one, and cleanup the regmatch_info_aux
3937      * and regmatch_info_aux_eval sructs */
3938
3939     LEAVE_SCOPE(oldsave);
3940
3941     if (swap) {
3942         /* we failed :-( roll it back */
3943         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3944             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3945             0,
3946             PTR2UV(prog),
3947             PTR2UV(prog->offs),
3948             PTR2UV(swap)
3949         ));
3950         Safefree(prog->offs);
3951         prog->offs = swap;
3952     }
3953     return 0;
3954 }
3955
3956
3957 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3958  * Do inc before dec, in case old and new rex are the same */
3959 #define SET_reg_curpm(Re2)                          \
3960     if (reginfo->info_aux_eval) {                   \
3961         (void)ReREFCNT_inc(Re2);                    \
3962         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3963         PM_SETRE((PL_reg_curpm), (Re2));            \
3964     }
3965
3966
3967 /*
3968  - regtry - try match at specific point
3969  */
3970 STATIC bool                     /* 0 failure, 1 success */
3971 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3972 {
3973     CHECKPOINT lastcp;
3974     REGEXP *const rx = reginfo->prog;
3975     regexp *const prog = ReANY(rx);
3976     SSize_t result;
3977 #ifdef DEBUGGING
3978     U32 depth = 0; /* used by REGCP_SET */
3979 #endif
3980     RXi_GET_DECL(prog,progi);
3981     GET_RE_DEBUG_FLAGS_DECL;
3982
3983     PERL_ARGS_ASSERT_REGTRY;
3984
3985     reginfo->cutpoint=NULL;
3986
3987     prog->offs[0].start = *startposp - reginfo->strbeg;
3988     prog->lastparen = 0;
3989     prog->lastcloseparen = 0;
3990
3991     /* XXXX What this code is doing here?!!!  There should be no need
3992        to do this again and again, prog->lastparen should take care of
3993        this!  --ilya*/
3994
3995     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3996      * Actually, the code in regcppop() (which Ilya may be meaning by
3997      * prog->lastparen), is not needed at all by the test suite
3998      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3999      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4000      * Meanwhile, this code *is* needed for the
4001      * above-mentioned test suite tests to succeed.  The common theme
4002      * on those tests seems to be returning null fields from matches.
4003      * --jhi updated by dapm */
4004
4005     /* After encountering a variant of the issue mentioned above I think
4006      * the point Ilya was making is that if we properly unwind whenever
4007      * we set lastparen to a smaller value then we should not need to do
4008      * this every time, only when needed. So if we have tests that fail if
4009      * we remove this, then it suggests somewhere else we are improperly
4010      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4011      * places it is called, and related regcp() routines. - Yves */
4012 #if 1
4013     if (prog->nparens) {
4014         regexp_paren_pair *pp = prog->offs;
4015         I32 i;
4016         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4017             ++pp;
4018             pp->start = -1;
4019             pp->end = -1;
4020         }
4021     }
4022 #endif
4023     REGCP_SET(lastcp);
4024     result = regmatch(reginfo, *startposp, progi->program + 1);
4025     if (result != -1) {
4026         prog->offs[0].end = result;
4027         return 1;
4028     }
4029     if (reginfo->cutpoint)
4030         *startposp= reginfo->cutpoint;
4031     REGCP_UNWIND(lastcp);
4032     return 0;
4033 }
4034
4035
4036 #define sayYES goto yes
4037 #define sayNO goto no
4038 #define sayNO_SILENT goto no_silent
4039
4040 /* we dont use STMT_START/END here because it leads to
4041    "unreachable code" warnings, which are bogus, but distracting. */
4042 #define CACHEsayNO \
4043     if (ST.cache_mask) \
4044        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4045     sayNO
4046
4047 /* this is used to determine how far from the left messages like
4048    'failed...' are printed in regexec.c. It should be set such that
4049    messages are inline with the regop output that created them.
4050 */
4051 #define REPORT_CODE_OFF 29
4052 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4053 #ifdef DEBUGGING
4054 int
4055 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4056 {
4057     va_list ap;
4058     int result;
4059     PerlIO *f= Perl_debug_log;
4060     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4061     va_start(ap, depth);
4062     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4063     result = PerlIO_vprintf(f, fmt, ap);
4064     va_end(ap);
4065     return result;
4066 }
4067 #endif /* DEBUGGING */
4068
4069
4070 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4071 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4072 #define CHRTEST_NOT_A_CP_1 -999
4073 #define CHRTEST_NOT_A_CP_2 -998
4074
4075 /* grab a new slab and return the first slot in it */
4076
4077 STATIC regmatch_state *
4078 S_push_slab(pTHX)
4079 {
4080     regmatch_slab *s = PL_regmatch_slab->next;
4081     if (!s) {
4082         Newx(s, 1, regmatch_slab);
4083         s->prev = PL_regmatch_slab;
4084         s->next = NULL;
4085         PL_regmatch_slab->next = s;
4086     }
4087     PL_regmatch_slab = s;
4088     return SLAB_FIRST(s);
4089 }
4090
4091
4092 /* push a new state then goto it */
4093
4094 #define PUSH_STATE_GOTO(state, node, input) \
4095     pushinput = input; \
4096     scan = node; \
4097     st->resume_state = state; \
4098     goto push_state;
4099
4100 /* push a new state with success backtracking, then goto it */
4101
4102 #define PUSH_YES_STATE_GOTO(state, node, input) \
4103     pushinput = input; \
4104     scan = node; \
4105     st->resume_state = state; \
4106     goto push_yes_state;
4107
4108
4109
4110
4111 /*
4112
4113 regmatch() - main matching routine
4114
4115 This is basically one big switch statement in a loop. We execute an op,
4116 set 'next' to point the next op, and continue. If we come to a point which
4117 we may need to backtrack to on failure such as (A|B|C), we push a
4118 backtrack state onto the backtrack stack. On failure, we pop the top
4119 state, and re-enter the loop at the state indicated. If there are no more
4120 states to pop, we return failure.
4121
4122 Sometimes we also need to backtrack on success; for example /A+/, where
4123 after successfully matching one A, we need to go back and try to
4124 match another one; similarly for lookahead assertions: if the assertion
4125 completes successfully, we backtrack to the state just before the assertion
4126 and then carry on.  In these cases, the pushed state is marked as
4127 'backtrack on success too'. This marking is in fact done by a chain of
4128 pointers, each pointing to the previous 'yes' state. On success, we pop to
4129 the nearest yes state, discarding any intermediate failure-only states.
4130 Sometimes a yes state is pushed just to force some cleanup code to be
4131 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4132 it to free the inner regex.
4133
4134 Note that failure backtracking rewinds the cursor position, while
4135 success backtracking leaves it alone.
4136
4137 A pattern is complete when the END op is executed, while a subpattern
4138 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4139 ops trigger the "pop to last yes state if any, otherwise return true"
4140 behaviour.
4141
4142 A common convention in this function is to use A and B to refer to the two
4143 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4144 the subpattern to be matched possibly multiple times, while B is the entire
4145 rest of the pattern. Variable and state names reflect this convention.
4146
4147 The states in the main switch are the union of ops and failure/success of
4148 substates associated with with that op.  For example, IFMATCH is the op
4149 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4150 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4151 successfully matched A and IFMATCH_A_fail is a state saying that we have
4152 just failed to match A. Resume states always come in pairs. The backtrack
4153 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4154 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4155 on success or failure.
4156
4157 The struct that holds a backtracking state is actually a big union, with
4158 one variant for each major type of op. The variable st points to the
4159 top-most backtrack struct. To make the code clearer, within each
4160 block of code we #define ST to alias the relevant union.
4161
4162 Here's a concrete example of a (vastly oversimplified) IFMATCH
4163 implementation:
4164
4165     switch (state) {
4166     ....
4167
4168 #define ST st->u.ifmatch
4169
4170     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4171         ST.foo = ...; // some state we wish to save
4172         ...
4173         // push a yes backtrack state with a resume value of
4174         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4175         // first node of A:
4176         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4177         // NOTREACHED
4178
4179     case IFMATCH_A: // we have successfully executed A; now continue with B
4180         next = B;
4181         bar = ST.foo; // do something with the preserved value
4182         break;
4183
4184     case IFMATCH_A_fail: // A failed, so the assertion failed
4185         ...;   // do some housekeeping, then ...
4186         sayNO; // propagate the failure
4187
4188 #undef ST
4189
4190     ...
4191     }
4192
4193 For any old-timers reading this who are familiar with the old recursive
4194 approach, the code above is equivalent to:
4195
4196     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4197     {
4198         int foo = ...
4199         ...
4200         if (regmatch(A)) {
4201             next = B;
4202             bar = foo;
4203             break;
4204         }
4205         ...;   // do some housekeeping, then ...
4206         sayNO; // propagate the failure
4207     }
4208
4209 The topmost backtrack state, pointed to by st, is usually free. If you
4210 want to claim it, populate any ST.foo fields in it with values you wish to
4211 save, then do one of
4212
4213         PUSH_STATE_GOTO(resume_state, node, newinput);
4214         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4215
4216 which sets that backtrack state's resume value to 'resume_state', pushes a
4217 new free entry to the top of the backtrack stack, then goes to 'node'.
4218 On backtracking, the free slot is popped, and the saved state becomes the
4219 new free state. An ST.foo field in this new top state can be temporarily
4220 accessed to retrieve values, but once the main loop is re-entered, it
4221 becomes available for reuse.
4222
4223 Note that the depth of the backtrack stack constantly increases during the
4224 left-to-right execution of the pattern, rather than going up and down with
4225 the pattern nesting. For example the stack is at its maximum at Z at the
4226 end of the pattern, rather than at X in the following:
4227
4228     /(((X)+)+)+....(Y)+....Z/
4229
4230 The only exceptions to this are lookahead/behind assertions and the cut,
4231 (?>A), which pop all the backtrack states associated with A before
4232 continuing.
4233
4234 Backtrack state structs are allocated in slabs of about 4K in size.
4235 PL_regmatch_state and st always point to the currently active state,
4236 and PL_regmatch_slab points to the slab currently containing
4237 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4238 allocated, and is never freed until interpreter destruction. When the slab
4239 is full, a new one is allocated and chained to the end. At exit from
4240 regmatch(), slabs allocated since entry are freed.
4241
4242 */
4243
4244
4245 #define DEBUG_STATE_pp(pp)                                  \
4246     DEBUG_STATE_r({                                         \
4247         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4248         Perl_re_printf( aTHX_                                           \
4249             "%*s" pp " %s%s%s%s%s\n",                       \
4250             INDENT_CHARS(depth), "",                        \
4251             PL_reg_name[st->resume_state],                  \
4252             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4253             ((st==yes_state) ? "Y" : ""),                   \
4254             ((st==mark_state) ? "M" : ""),                  \
4255             ((st==yes_state||st==mark_state) ? "]" : "")    \
4256         );                                                  \
4257     });
4258
4259
4260 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4261
4262 #ifdef DEBUGGING
4263
4264 STATIC void
4265 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4266     const char *start, const char *end, const char *blurb)
4267 {
4268     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4269
4270     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4271
4272     if (!PL_colorset)
4273             reginitcolors();
4274     {
4275         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4276             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4277
4278         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4279             start, end - start, PL_dump_re_max_len);
4280
4281         Perl_re_printf( aTHX_
4282             "%s%s REx%s %s against %s\n",
4283                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4284
4285         if (utf8_target||utf8_pat)
4286             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4287                 utf8_pat ? "pattern" : "",
4288                 utf8_pat && utf8_target ? " and " : "",
4289                 utf8_target ? "string" : ""
4290             );
4291     }
4292 }
4293
4294 STATIC void
4295 S_dump_exec_pos(pTHX_ const char *locinput,
4296                       const regnode *scan,
4297                       const char *loc_regeol,
4298                       const char *loc_bostr,
4299                       const char *loc_reg_starttry,
4300                       const bool utf8_target,
4301                       const U32 depth
4302                 )
4303 {
4304     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4305     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4306     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4307     /* The part of the string before starttry has one color
4308        (pref0_len chars), between starttry and current
4309        position another one (pref_len - pref0_len chars),
4310        after the current position the third one.
4311        We assume that pref0_len <= pref_len, otherwise we
4312        decrease pref0_len.  */
4313     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4314         ? (5 + taill) - l : locinput - loc_bostr;
4315     int pref0_len;
4316
4317     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4318
4319     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4320         pref_len++;
4321     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4322     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4323         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4324               ? (5 + taill) - pref_len : loc_regeol - locinput);
4325     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4326         l--;
4327     if (pref0_len < 0)
4328         pref0_len = 0;
4329     if (pref0_len > pref_len)
4330         pref0_len = pref_len;
4331     {
4332         const int is_uni = utf8_target ? 1 : 0;
4333
4334         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4335             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4336
4337         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4338                     (locinput - pref_len + pref0_len),
4339                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4340
4341         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4342                     locinput, loc_regeol - locinput, 10, 0, 1);
4343
4344         const STRLEN tlen=len0+len1+len2;
4345         Perl_re_printf( aTHX_
4346                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4347                     (IV)(locinput - loc_bostr),
4348                     len0, s0,
4349                     len1, s1,
4350                     (docolor ? "" : "> <"),
4351                     len2, s2,
4352                     (int)(tlen > 19 ? 0 :  19 - tlen),
4353                     "",
4354                     depth);
4355     }
4356 }
4357
4358 #endif
4359
4360 /* reg_check_named_buff_matched()
4361  * Checks to see if a named buffer has matched. The data array of
4362  * buffer numbers corresponding to the buffer is expected to reside
4363  * in the regexp->data->data array in the slot stored in the ARG() of
4364  * node involved. Note that this routine doesn't actually care about the
4365  * name, that information is not preserved from compilation to execution.
4366  * Returns the index of the leftmost defined buffer with the given name
4367  * or 0 if non of the buffers matched.
4368  */
4369 STATIC I32
4370 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4371 {
4372     I32 n;
4373     RXi_GET_DECL(rex,rexi);
4374     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4375     I32 *nums=(I32*)SvPVX(sv_dat);
4376
4377     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4378
4379     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4380         if ((I32)rex->lastparen >= nums[n] &&
4381             rex->offs[nums[n]].end != -1)
4382         {
4383             return nums[n];
4384         }
4385     }
4386     return 0;
4387 }
4388
4389
4390 static bool
4391 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4392         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4393 {
4394     /* This function determines if there are one or two characters that match
4395      * the first character of the passed-in EXACTish node <text_node>, and if
4396      * so, returns them in the passed-in pointers.
4397      *
4398      * If it determines that no possible character in the target string can
4399      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4400      * the first character in <text_node> requires UTF-8 to represent, and the
4401      * target string isn't in UTF-8.)
4402      *
4403      * If there are more than two characters that could match the beginning of
4404      * <text_node>, or if more context is required to determine a match or not,
4405      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4406      *
4407      * The motiviation behind this function is to allow the caller to set up
4408      * tight loops for matching.  If <text_node> is of type EXACT, there is
4409      * only one possible character that can match its first character, and so
4410      * the situation is quite simple.  But things get much more complicated if
4411      * folding is involved.  It may be that the first character of an EXACTFish
4412      * node doesn't participate in any possible fold, e.g., punctuation, so it
4413      * can be matched only by itself.  The vast majority of characters that are
4414      * in folds match just two things, their lower and upper-case equivalents.
4415      * But not all are like that; some have multiple possible matches, or match
4416      * sequences of more than one character.  This function sorts all that out.
4417      *
4418      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4419      * loop of trying to match A*, we know we can't exit where the thing
4420      * following it isn't a B.  And something can't be a B unless it is the
4421      * beginning of B.  By putting a quick test for that beginning in a tight
4422      * loop, we can rule out things that can't possibly be B without having to
4423      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4424      * character, we can make a tight loop matching A*, using the outputs of
4425      * this function.
4426      *
4427      * If the target string to match isn't in UTF-8, and there aren't
4428      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4429      * the one or two possible octets (which are characters in this situation)
4430      * that can match.  In all cases, if there is only one character that can
4431      * match, *<c1p> and *<c2p> will be identical.
4432      *
4433      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4434      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4435      * can match the beginning of <text_node>.  They should be declared with at
4436      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4437      * undefined what these contain.)  If one or both of the buffers are
4438      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4439      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4440      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4441      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4442      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4443
4444     const bool utf8_target = reginfo->is_utf8_target;
4445
4446     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4447     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4448     bool use_chrtest_void = FALSE;
4449     const bool is_utf8_pat = reginfo->is_utf8_pat;
4450
4451     /* Used when we have both utf8 input and utf8 output, to avoid converting
4452      * to/from code points */
4453     bool utf8_has_been_setup = FALSE;
4454
4455     dVAR;
4456
4457     U8 *pat = (U8*)STRING(text_node);
4458     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4459
4460     if (   OP(text_node) == EXACT
4461         || OP(text_node) == EXACT_ONLY8
4462         || OP(text_node) == EXACTL)
4463     {
4464
4465         /* In an exact node, only one thing can be matched, that first
4466          * character.  If both the pat and the target are UTF-8, we can just
4467          * copy the input to the output, avoiding finding the code point of
4468          * that character */
4469         if (!is_utf8_pat) {
4470             c2 = c1 = *pat;
4471         }
4472         else if (utf8_target) {
4473             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4474             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4475             utf8_has_been_setup = TRUE;
4476         }
4477         else {
4478             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4479         }
4480     }
4481     else { /* an EXACTFish node */
4482         U8 *pat_end = pat + STR_LEN(text_node);
4483
4484         /* An EXACTFL node has at least some characters unfolded, because what
4485          * they match is not known until now.  So, now is the time to fold
4486          * the first few of them, as many as are needed to determine 'c1' and
4487          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4488          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4489          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4490          * need to fold as many characters as a single character can fold to,
4491          * so that later we can check if the first ones are such a multi-char
4492          * fold.  But, in such a pattern only locale-problematic characters
4493          * aren't folded, so we can skip this completely if the first character
4494          * in the node isn't one of the tricky ones */
4495         if (OP(text_node) == EXACTFL) {
4496
4497             if (! is_utf8_pat) {
4498                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4499                 {
4500                     folded[0] = folded[1] = 's';
4501                     pat = folded;
4502                     pat_end = folded + 2;
4503                 }
4504             }
4505             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4506                 U8 *s = pat;
4507                 U8 *d = folded;
4508                 int i;
4509
4510                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4511                     if (isASCII(*s)) {
4512                         *(d++) = (U8) toFOLD_LC(*s);
4513                         s++;
4514                     }
4515                     else {
4516                         STRLEN len;
4517                         _toFOLD_utf8_flags(s,
4518                                            pat_end,
4519                                            d,
4520                                            &len,
4521                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4522                         d += len;
4523                         s += UTF8SKIP(s);
4524                     }
4525                 }
4526
4527                 pat = folded;
4528                 pat_end = d;
4529             }
4530         }
4531
4532         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4533              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4534         {
4535             /* Multi-character folds require more context to sort out.  Also
4536              * PL_utf8_foldclosures used below doesn't handle them, so have to
4537              * be handled outside this routine */
4538             use_chrtest_void = TRUE;
4539         }
4540         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4541             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4542             if (c1 > 255) {
4543                 const unsigned int * remaining_folds_to_list;
4544                 unsigned int first_folds_to;
4545
4546                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4547                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4548                 Size_t folds_to_count = _inverse_folds(c1,
4549                                                      &first_folds_to,
4550                                                      &remaining_folds_to_list);
4551                 if (folds_to_count == 0) {
4552                     c2 = c1;    /* there is only a single character that could
4553                                    match */
4554                 }
4555                 else if (folds_to_count != 1) {
4556                     /* If there aren't exactly two folds to this (itself and
4557                      * another), it is outside the scope of this function */
4558                     use_chrtest_void = TRUE;
4559                 }
4560                 else {  /* There are two.  We already have one, get the other */
4561                     c2 = first_folds_to;
4562
4563                     /* Folds that cross the 255/256 boundary are forbidden if
4564                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4565                      * ASCIII.  The only other match to c1 is c2, and since c1
4566                      * is above 255, c2 better be as well under these
4567                      * circumstances.  If it isn't, it means the only legal
4568                      * match of c1 is itself. */
4569                     if (    c2 < 256
4570                         && (   (   OP(text_node) == EXACTFL
4571                                 && ! IN_UTF8_CTYPE_LOCALE)
4572                             || ((     OP(text_node) == EXACTFAA
4573                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4574                                 && (isASCII(c1) || isASCII(c2)))))
4575                     {
4576                         c2 = c1;
4577                     }
4578                 }
4579             }
4580             else /* Here, c1 is <= 255 */
4581                 if (utf8_target
4582                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4583                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4584                     && ((OP(text_node) != EXACTFAA
4585                         && OP(text_node) != EXACTFAA_NO_TRIE)
4586                         || ! isASCII(c1)))
4587             {
4588                 /* Here, there could be something above Latin1 in the target
4589                  * which folds to this character in the pattern.  All such
4590                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4591                  * than two characters involved in their folds, so are outside
4592                  * the scope of this function */
4593                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4594                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4595                 }
4596                 else {
4597                     use_chrtest_void = TRUE;
4598                 }
4599             }
4600             else { /* Here nothing above Latin1 can fold to the pattern
4601                       character */
4602                 switch (OP(text_node)) {
4603
4604                     case EXACTFL:   /* /l rules */
4605                         c2 = PL_fold_locale[c1];
4606                         break;
4607
4608                     case EXACTF:   /* This node only generated for non-utf8
4609                                     patterns */
4610                         assert(! is_utf8_pat);
4611                         if (! utf8_target) {    /* /d rules */
4612                             c2 = PL_fold[c1];
4613                             break;
4614                         }
4615                         /* FALLTHROUGH */
4616                         /* /u rules for all these.  This happens to work for
4617                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4618                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4619                                                 non-utf8 patterns */
4620                         assert(! is_utf8_pat);
4621                         /* FALLTHROUGH */
4622                     case EXACTFAA:
4623                     case EXACTFU_SS:
4624                     case EXACTFU:
4625                         c2 = PL_fold_latin1[c1];
4626                         break;
4627
4628                     default:
4629                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4630                         NOT_REACHED; /* NOTREACHED */
4631                 }
4632             }
4633         }
4634     }
4635
4636     /* Here have figured things out.  Set up the returns */
4637     if (use_chrtest_void) {
4638         *c2p = *c1p = CHRTEST_VOID;
4639     }
4640     else if (utf8_target) {
4641         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4642             uvchr_to_utf8(c1_utf8, c1);
4643             uvchr_to_utf8(c2_utf8, c2);
4644         }
4645
4646         /* Invariants are stored in both the utf8 and byte outputs; Use
4647          * negative numbers otherwise for the byte ones.  Make sure that the
4648          * byte ones are the same iff the utf8 ones are the same */
4649         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4650         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4651                 ? *c2_utf8
4652                 : (c1 == c2)
4653                   ? CHRTEST_NOT_A_CP_1
4654                   : CHRTEST_NOT_A_CP_2;
4655     }
4656     else if (c1 > 255) {
4657        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4658                            can represent */
4659            return FALSE;
4660        }
4661
4662        *c1p = *c2p = c2;    /* c2 is the only representable value */
4663     }
4664     else {  /* c1 is representable; see about c2 */
4665        *c1p = c1;
4666        *c2p = (c2 < 256) ? c2 : c1;
4667     }
4668
4669     return TRUE;
4670 }
4671
4672 STATIC bool
4673 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4674 {
4675     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4676      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4677
4678     PERL_ARGS_ASSERT_ISGCB;
4679
4680     switch (GCB_table[before][after]) {
4681         case GCB_BREAKABLE:
4682             return TRUE;
4683
4684         case GCB_NOBREAK:
4685             return FALSE;
4686
4687         case GCB_RI_then_RI:
4688             {
4689                 int RI_count = 1;
4690                 U8 * temp_pos = (U8 *) curpos;
4691
4692                 /* Do not break within emoji flag sequences. That is, do not
4693                  * break between regional indicator (RI) symbols if there is an
4694                  * odd number of RI characters before the break point.
4695                  *  GB12   sot (RI RI)* RI × RI
4696                  *  GB13 [^RI] (RI RI)* RI × RI */
4697
4698                 while (backup_one_GCB(strbeg,
4699                                     &temp_pos,
4700                                     utf8_target) == GCB_Regional_Indicator)
4701                 {
4702                     RI_count++;
4703                 }
4704
4705                 return RI_count % 2 != 1;
4706             }
4707
4708         case GCB_EX_then_EM:
4709
4710             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4711             {
4712                 U8 * temp_pos = (U8 *) curpos;
4713                 GCB_enum prev;
4714
4715                 do {
4716                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4717                 }
4718                 while (prev == GCB_Extend);
4719
4720                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4721             }
4722
4723         case GCB_Maybe_Emoji_NonBreak:
4724
4725             {
4726
4727             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4728               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4729               */
4730                 U8 * temp_pos = (U8 *) curpos;
4731                 GCB_enum prev;
4732
4733                 do {
4734                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4735                 }
4736                 while (prev == GCB_Extend);
4737
4738                 return prev != GCB_XPG_XX;
4739             }
4740
4741         default:
4742             break;
4743     }
4744
4745 #ifdef DEBUGGING
4746     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4747                                   before, after, GCB_table[before][after]);
4748     assert(0);
4749 #endif
4750     return TRUE;
4751 }
4752
4753 STATIC GCB_enum
4754 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4755 {
4756     GCB_enum gcb;
4757
4758     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4759
4760     if (*curpos < strbeg) {
4761         return GCB_EDGE;
4762     }
4763
4764     if (utf8_target) {
4765         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4766         U8 * prev_prev_char_pos;
4767
4768         if (! prev_char_pos) {
4769             return GCB_EDGE;
4770         }
4771
4772         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4773             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4774             *curpos = prev_char_pos;
4775             prev_char_pos = prev_prev_char_pos;
4776         }
4777         else {
4778             *curpos = (U8 *) strbeg;
4779             return GCB_EDGE;
4780         }
4781     }
4782     else {
4783         if (*curpos - 2 < strbeg) {
4784             *curpos = (U8 *) strbeg;
4785             return GCB_EDGE;
4786         }
4787         (*curpos)--;
4788         gcb = getGCB_VAL_CP(*(*curpos - 1));
4789     }
4790
4791     return gcb;
4792 }
4793
4794 /* Combining marks attach to most classes that precede them, but this defines
4795  * the exceptions (from TR14) */
4796 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4797                                      || prev == LB_Mandatory_Break      \
4798                                      || prev == LB_Carriage_Return      \
4799                                      || prev == LB_Line_Feed            \
4800                                      || prev == LB_Next_Line            \
4801                                      || prev == LB_Space                \
4802                                      || prev == LB_ZWSpace))
4803
4804 STATIC bool
4805 S_isLB(pTHX_ LB_enum before,
4806              LB_enum after,
4807              const U8 * const strbeg,
4808              const U8 * const curpos,
4809              const U8 * const strend,
4810              const bool utf8_target)
4811 {
4812     U8 * temp_pos = (U8 *) curpos;
4813     LB_enum prev = before;
4814
4815     /* Is the boundary between 'before' and 'after' line-breakable?
4816      * Most of this is just a table lookup of a generated table from Unicode
4817      * rules.  But some rules require context to decide, and so have to be
4818      * implemented in code */
4819
4820     PERL_ARGS_ASSERT_ISLB;
4821
4822     /* Rule numbers in the comments below are as of Unicode 9.0 */
4823
4824   redo:
4825     before = prev;
4826     switch (LB_table[before][after]) {
4827         case LB_BREAKABLE:
4828             return TRUE;
4829
4830         case LB_NOBREAK:
4831         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4832             return FALSE;
4833
4834         case LB_SP_foo + LB_BREAKABLE:
4835         case LB_SP_foo + LB_NOBREAK:
4836         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4837
4838             /* When we have something following a SP, we have to look at the
4839              * context in order to know what to do.
4840              *
4841              * SP SP should not reach here because LB7: Do not break before
4842              * spaces.  (For two spaces in a row there is nothing that
4843              * overrides that) */
4844             assert(after != LB_Space);
4845
4846             /* Here we have a space followed by a non-space.  Mostly this is a
4847              * case of LB18: "Break after spaces".  But there are complications
4848              * as the handling of spaces is somewhat tricky.  They are in a
4849              * number of rules, which have to be applied in priority order, but
4850              * something earlier in the string can cause a rule to be skipped
4851              * and a lower priority rule invoked.  A prime example is LB7 which
4852              * says don't break before a space.  But rule LB8 (lower priority)
4853              * says that the first break opportunity after a ZW is after any
4854              * span of spaces immediately after it.  If a ZW comes before a SP
4855              * in the input, rule LB8 applies, and not LB7.  Other such rules
4856              * involve combining marks which are rules 9 and 10, but they may
4857              * override higher priority rules if they come earlier in the
4858              * string.  Since we're doing random access into the middle of the
4859              * string, we have to look for rules that should get applied based
4860              * on both string position and priority.  Combining marks do not
4861              * attach to either ZW nor SP, so we don't have to consider them
4862              * until later.
4863              *
4864              * To check for LB8, we have to find the first non-space character
4865              * before this span of spaces */
4866             do {
4867                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4868             }
4869             while (prev == LB_Space);
4870
4871             /* LB8 Break before any character following a zero-width space,
4872              * even if one or more spaces intervene.
4873              *      ZW SP* ÷
4874              * So if we have a ZW just before this span, and to get here this
4875              * is the final space in the span. */
4876             if (prev == LB_ZWSpace) {
4877                 return TRUE;
4878             }
4879
4880             /* Here, not ZW SP+.  There are several rules that have higher
4881              * priority than LB18 and can be resolved now, as they don't depend
4882              * on anything earlier in the string (except ZW, which we have
4883              * already handled).  One of these rules is LB11 Do not break
4884              * before Word joiner, but we have specially encoded that in the
4885              * lookup table so it is caught by the single test below which
4886              * catches the other ones. */
4887             if (LB_table[LB_Space][after] - LB_SP_foo
4888                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4889             {
4890                 return FALSE;
4891             }
4892
4893             /* If we get here, we have to XXX consider combining marks. */
4894             if (prev == LB_Combining_Mark) {
4895
4896                 /* What happens with these depends on the character they
4897                  * follow.  */
4898                 do {
4899                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4900                 }
4901                 while (prev == LB_Combining_Mark);
4902
4903                 /* Most times these attach to and inherit the characteristics
4904                  * of that character, but not always, and when not, they are to
4905                  * be treated as AL by rule LB10. */
4906                 if (! LB_CM_ATTACHES_TO(prev)) {
4907                     prev = LB_Alphabetic;
4908                 }
4909             }
4910
4911             /* Here, we have the character preceding the span of spaces all set
4912              * up.  We follow LB18: "Break after spaces" unless the table shows
4913              * that is overriden */
4914             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4915
4916         case LB_CM_ZWJ_foo:
4917
4918             /* We don't know how to treat the CM except by looking at the first
4919              * non-CM character preceding it.  ZWJ is treated as CM */
4920             do {
4921                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4922             }
4923             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4924
4925             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4926              * attatches to it, then it inherits the behavior of 'prev'.  If it
4927              * doesn't attach, it is to be treated as an AL */
4928             if (! LB_CM_ATTACHES_TO(prev)) {
4929                 prev = LB_Alphabetic;
4930             }
4931
4932             goto redo;
4933
4934         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4935         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4936
4937             /* LB21a Don't break after Hebrew + Hyphen.
4938              * HL (HY | BA) × */
4939
4940             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4941                                                           == LB_Hebrew_Letter)
4942             {
4943                 return FALSE;
4944             }
4945
4946             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4947
4948         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4949         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4950
4951             /* LB25a (PR | PO) × ( OP | HY )? NU */
4952             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4953                 return FALSE;
4954             }
4955
4956             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4957                                                                 == LB_BREAKABLE;
4958
4959         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4960         case LB_SY_or_IS_then_various + LB_NOBREAK:
4961         {
4962             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4963
4964             LB_enum temp = prev;
4965             do {
4966                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4967             }
4968             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4969             if (temp == LB_Numeric) {
4970                 return FALSE;
4971             }
4972
4973             return LB_table[prev][after] - LB_SY_or_IS_then_various
4974                                                                == LB_BREAKABLE;
4975         }
4976
4977         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4978         case LB_various_then_PO_or_PR + LB_NOBREAK:
4979         {
4980             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4981
4982             LB_enum temp = prev;
4983             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4984             {
4985                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4986             }
4987             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4988                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4989             }
4990             if (temp == LB_Numeric) {
4991                 return FALSE;
4992             }
4993             return LB_various_then_PO_or_PR;
4994         }
4995
4996         case LB_RI_then_RI + LB_NOBREAK:
4997         case LB_RI_then_RI + LB_BREAKABLE:
4998             {
4999                 int RI_count = 1;
5000
5001                 /* LB30a Break between two regional indicator symbols if and
5002                  * only if there are an even number of regional indicators
5003                  * preceding the position of the break.
5004                  *
5005                  *    sot (RI RI)* RI × RI
5006                  *  [^RI] (RI RI)* RI × RI */
5007
5008                 while (backup_one_LB(strbeg,
5009                                      &temp_pos,
5010                                      utf8_target) == LB_Regional_Indicator)
5011                 {
5012                     RI_count++;
5013                 }
5014
5015                 return RI_count % 2 == 0;
5016             }
5017
5018         default:
5019             break;
5020     }
5021
5022 #ifdef DEBUGGING
5023     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5024                                   before, after, LB_table[before][after]);
5025     assert(0);
5026 #endif
5027     return TRUE;
5028 }
5029
5030 STATIC LB_enum
5031 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5032 {
5033     LB_enum lb;
5034
5035     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5036
5037     if (*curpos >= strend) {
5038         return LB_EDGE;
5039     }
5040
5041     if (utf8_target) {
5042         *curpos += UTF8SKIP(*curpos);
5043         if (*curpos >= strend) {
5044             return LB_EDGE;
5045         }
5046         lb = getLB_VAL_UTF8(*curpos, strend);
5047     }
5048     else {
5049         (*curpos)++;
5050         if (*curpos >= strend) {
5051             return LB_EDGE;
5052         }
5053         lb = getLB_VAL_CP(**curpos);
5054     }
5055
5056     return lb;
5057 }
5058
5059 STATIC LB_enum
5060 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5061 {
5062     LB_enum lb;
5063
5064     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5065
5066     if (*curpos < strbeg) {
5067         return LB_EDGE;
5068     }
5069
5070     if (utf8_target) {
5071         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5072         U8 * prev_prev_char_pos;
5073
5074         if (! prev_char_pos) {
5075             return LB_EDGE;
5076         }
5077
5078         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5079             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5080             *curpos = prev_char_pos;
5081             prev_char_pos = prev_prev_char_pos;
5082         }
5083         else {
5084             *curpos = (U8 *) strbeg;
5085             return LB_EDGE;
5086         }
5087     }
5088     else {
5089         if (*curpos - 2 < strbeg) {
5090             *curpos = (U8 *) strbeg;
5091             return LB_EDGE;
5092         }
5093         (*curpos)--;
5094         lb = getLB_VAL_CP(*(*curpos - 1));
5095     }
5096
5097     return lb;
5098 }
5099
5100 STATIC bool
5101 S_isSB(pTHX_ SB_enum before,
5102              SB_enum after,
5103              const U8 * const strbeg,
5104              const U8 * const curpos,
5105              const U8 * const strend,
5106              const bool utf8_target)
5107 {
5108     /* returns a boolean indicating if there is a Sentence Boundary Break
5109      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
5110
5111     U8 * lpos = (U8 *) curpos;
5112     bool has_para_sep = FALSE;
5113     bool has_sp = FALSE;
5114
5115     PERL_ARGS_ASSERT_ISSB;
5116
5117     /* Break at the start and end of text.
5118         SB1.  sot  ÷
5119         SB2.  ÷  eot
5120       But unstated in Unicode is don't break if the text is empty */
5121     if (before == SB_EDGE || after == SB_EDGE) {
5122         return before != after;
5123     }
5124
5125     /* SB 3: Do not break within CRLF. */
5126     if (before == SB_CR && after == SB_LF) {
5127         return FALSE;
5128     }
5129
5130     /* Break after paragraph separators.  CR and LF are considered
5131      * so because Unicode views text as like word processing text where there
5132      * are no newlines except between paragraphs, and the word processor takes
5133      * care of wrapping without there being hard line-breaks in the text *./
5134        SB4.  Sep | CR | LF  ÷ */
5135     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5136         return TRUE;
5137     }
5138
5139     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5140      * (See Section 6.2, Replacing Ignore Rules.)
5141         SB5.  X (Extend | Format)*  →  X */
5142     if (after == SB_Extend || after == SB_Format) {
5143
5144         /* Implied is that the these characters attach to everything
5145          * immediately prior to them except for those separator-type
5146          * characters.  And the rules earlier have already handled the case
5147          * when one of those immediately precedes the extend char */
5148         return FALSE;
5149     }
5150
5151     if (before == SB_Extend || before == SB_Format) {
5152         U8 * temp_pos = lpos;
5153         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5154         if (   backup != SB_EDGE
5155             && backup != SB_Sep
5156             && backup != SB_CR
5157             && backup != SB_LF)
5158         {
5159             before = backup;
5160             lpos = temp_pos;
5161         }
5162
5163         /* Here, both 'before' and 'backup' are these types; implied is that we
5164          * don't break between them */
5165         if (backup == SB_Extend || backup == SB_Format) {
5166             return FALSE;
5167         }
5168     }
5169
5170     /* Do not break after ambiguous terminators like period, if they are
5171      * immediately followed by a number or lowercase letter, if they are
5172      * between uppercase letters, if the first following letter (optionally
5173      * after certain punctuation) is lowercase, or if they are followed by
5174      * "continuation" punctuation such as comma, colon, or semicolon. For
5175      * example, a period may be an abbreviation or numeric period, and thus may
5176      * not mark the end of a sentence.
5177
5178      * SB6. ATerm  ×  Numeric */
5179     if (before == SB_ATerm && after == SB_Numeric) {
5180         return FALSE;
5181     }
5182
5183     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5184     if (before == SB_ATerm && after == SB_Upper) {
5185         U8 * temp_pos = lpos;
5186         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5187         if (backup == SB_Upper || backup == SB_Lower) {
5188             return FALSE;
5189         }
5190     }
5191
5192     /* The remaining rules that aren't the final one, all require an STerm or
5193      * an ATerm after having backed up over some Close* Sp*, and in one case an
5194      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5195      * So do that backup now, setting flags if either Sp or a paragraph
5196      * separator are found */
5197
5198     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5199         has_para_sep = TRUE;
5200         before = backup_one_SB(strbeg, &lpos, utf8_target);
5201     }
5202
5203     if (before == SB_Sp) {
5204         has_sp = TRUE;
5205         do {
5206             before = backup_one_SB(strbeg, &lpos, utf8_target);
5207         }
5208         while (before == SB_Sp);
5209     }
5210
5211     while (before == SB_Close) {
5212         before = backup_one_SB(strbeg, &lpos, utf8_target);
5213     }
5214
5215     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5216      * most cases an STerm */
5217     if (before == SB_STerm || before == SB_ATerm) {
5218
5219         /* So, here the lhs matches
5220          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5221          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5222          * The rules that apply here are:
5223          *
5224          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5225                                            | LF | STerm | ATerm) )* Lower
5226            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5227            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5228            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5229            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5230          */
5231
5232         /* And all but SB11 forbid having seen a paragraph separator */
5233         if (! has_para_sep) {
5234             if (before == SB_ATerm) {          /* SB8 */
5235                 U8 * rpos = (U8 *) curpos;
5236                 SB_enum later = after;
5237
5238                 while (    later != SB_OLetter
5239                         && later != SB_Upper
5240                         && later != SB_Lower
5241                         && later != SB_Sep
5242                         && later != SB_CR
5243                         && later != SB_LF
5244                         && later != SB_STerm
5245                         && later != SB_ATerm
5246                         && later != SB_EDGE)
5247                 {
5248                     later = advance_one_SB(&rpos, strend, utf8_target);
5249                 }
5250                 if (later == SB_Lower) {
5251                     return FALSE;
5252                 }
5253             }
5254
5255             if (   after == SB_SContinue    /* SB8a */
5256                 || after == SB_STerm
5257                 || after == SB_ATerm)
5258             {
5259                 return FALSE;
5260             }
5261
5262             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5263                 if (   after == SB_Close
5264                     || after == SB_Sp
5265                     || after == SB_Sep
5266                     || after == SB_CR
5267                     || after == SB_LF)
5268                 {
5269                     return FALSE;
5270                 }
5271             }
5272
5273             /* SB10.  This and SB9 could probably be combined some way, but khw
5274              * has decided to follow the Unicode rule book precisely for
5275              * simplified maintenance */
5276             if (   after == SB_Sp
5277                 || after == SB_Sep
5278                 || after == SB_CR
5279                 || after == SB_LF)
5280             {
5281                 return FALSE;
5282             }
5283         }
5284
5285         /* SB11.  */
5286         return TRUE;
5287     }
5288
5289     /* Otherwise, do not break.
5290     SB12.  Any  ×  Any */
5291
5292     return FALSE;
5293 }
5294
5295 STATIC SB_enum
5296 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5297 {
5298     SB_enum sb;
5299
5300     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5301
5302     if (*curpos >= strend) {
5303         return SB_EDGE;
5304     }
5305
5306     if (utf8_target) {
5307         do {
5308             *curpos += UTF8SKIP(*curpos);
5309             if (*curpos >= strend) {
5310                 return SB_EDGE;
5311             }
5312             sb = getSB_VAL_UTF8(*curpos, strend);
5313         } while (sb == SB_Extend || sb == SB_Format);
5314     }
5315     else {
5316         do {
5317             (*curpos)++;
5318             if (*curpos >= strend) {
5319                 return SB_EDGE;
5320             }
5321             sb = getSB_VAL_CP(**curpos);
5322         } while (sb == SB_Extend || sb == SB_Format);
5323     }
5324
5325     return sb;
5326 }
5327
5328 STATIC SB_enum
5329 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5330 {
5331     SB_enum sb;
5332
5333     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5334
5335     if (*curpos < strbeg) {
5336         return SB_EDGE;
5337     }
5338
5339     if (utf8_target) {
5340         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5341         if (! prev_char_pos) {
5342             return SB_EDGE;
5343         }
5344
5345         /* Back up over Extend and Format.  curpos is always just to the right
5346          * of the characater whose value we are getting */
5347         do {
5348             U8 * prev_prev_char_pos;
5349             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5350                                                                       strbeg)))
5351             {
5352                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5353                 *curpos = prev_char_pos;
5354                 prev_char_pos = prev_prev_char_pos;
5355             }
5356             else {
5357                 *curpos = (U8 *) strbeg;
5358                 return SB_EDGE;
5359             }
5360         } while (sb == SB_Extend || sb == SB_Format);
5361     }
5362     else {
5363         do {
5364             if (*curpos - 2 < strbeg) {
5365                 *curpos = (U8 *) strbeg;
5366                 return SB_EDGE;
5367             }
5368             (*curpos)--;
5369             sb = getSB_VAL_CP(*(*curpos - 1));
5370         } while (sb == SB_Extend || sb == SB_Format);
5371     }
5372
5373     return sb;
5374 }
5375
5376 STATIC bool
5377 S_isWB(pTHX_ WB_enum previous,
5378              WB_enum before,
5379              WB_enum after,
5380              const U8 * const strbeg,
5381              const U8 * const curpos,
5382              const U8 * const strend,
5383              const bool utf8_target)
5384 {
5385     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5386      *  a Unicode word break, using their published algorithm, but tailored for
5387      *  Perl by treating spans of white space as one unit.  Context may be
5388      *  needed to make this determination.  If the value for the character
5389      *  before 'before' is known, it is passed as 'previous'; otherwise that
5390      *  should be set to WB_UNKNOWN.  The other input parameters give the
5391      *  boundaries and current position in the matching of the string.  That
5392      *  is, 'curpos' marks the position where the character whose wb value is
5393      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5394
5395     U8 * before_pos = (U8 *) curpos;
5396     U8 * after_pos = (U8 *) curpos;
5397     WB_enum prev = before;
5398     WB_enum next;
5399
5400     PERL_ARGS_ASSERT_ISWB;
5401
5402     /* Rule numbers in the comments below are as of Unicode 9.0 */
5403
5404   redo:
5405     before = prev;
5406     switch (WB_table[before][after]) {
5407         case WB_BREAKABLE:
5408             return TRUE;
5409
5410         case WB_NOBREAK:
5411             return FALSE;
5412
5413         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5414             next = advance_one_WB(&after_pos, strend, utf8_target,
5415                                  FALSE /* Don't skip Extend nor Format */ );
5416             /* A space immediately preceeding an Extend or Format is attached
5417              * to by them, and hence gets separated from previous spaces.
5418              * Otherwise don't break between horizontal white space */
5419             return next == WB_Extend || next == WB_Format;
5420
5421         /* WB4 Ignore Format and Extend characters, except when they appear at
5422          * the beginning of a region of text.  This code currently isn't
5423          * general purpose, but it works as the rules are currently and likely
5424          * to be laid out.  The reason it works is that when 'they appear at
5425          * the beginning of a region of text', the rule is to break before
5426          * them, just like any other character.  Therefore, the default rule
5427          * applies and we don't have to look in more depth.  Should this ever
5428          * change, we would have to have 2 'case' statements, like in the rules
5429          * below, and backup a single character (not spacing over the extend
5430          * ones) and then see if that is one of the region-end characters and
5431          * go from there */
5432         case WB_Ex_or_FO_or_ZWJ_then_foo:
5433             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5434             goto redo;
5435
5436         case WB_DQ_then_HL + WB_BREAKABLE:
5437         case WB_DQ_then_HL + WB_NOBREAK:
5438
5439             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5440
5441             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5442                                                             == WB_Hebrew_Letter)
5443             {
5444                 return FALSE;
5445             }
5446
5447              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5448
5449         case WB_HL_then_DQ + WB_BREAKABLE:
5450         case WB_HL_then_DQ + WB_NOBREAK:
5451
5452             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5453
5454             if (advance_one_WB(&after_pos, strend, utf8_target,
5455                                        TRUE /* Do skip Extend and Format */ )
5456                                                             == WB_Hebrew_Letter)
5457             {
5458                 return FALSE;
5459             }
5460
5461             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5462
5463         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5464         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5465
5466             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5467              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5468
5469             next = advance_one_WB(&after_pos, strend, utf8_target,
5470                                        TRUE /* Do skip Extend and Format */ );
5471
5472             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5473             {
5474                 return FALSE;
5475             }
5476
5477             return WB_table[before][after]
5478                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5479
5480         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5481         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5482
5483             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5484              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5485
5486             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5487             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5488             {
5489                 return FALSE;
5490             }
5491
5492             return WB_table[before][after]
5493                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5494
5495         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5496         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5497
5498             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5499              * */
5500
5501             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5502                                                             == WB_Numeric)
5503             {
5504                 return FALSE;
5505             }
5506
5507             return WB_table[before][after]
5508                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5509
5510         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5511         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5512
5513             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5514
5515             if (advance_one_WB(&after_pos, strend, utf8_target,
5516                                        TRUE /* Do skip Extend and Format */ )
5517                                                             == WB_Numeric)
5518             {
5519                 return FALSE;
5520             }
5521
5522             return WB_table[before][after]
5523                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5524
5525         case WB_RI_then_RI + WB_NOBREAK:
5526         case WB_RI_then_RI + WB_BREAKABLE:
5527             {
5528                 int RI_count = 1;
5529
5530                 /* Do not break within emoji flag sequences. That is, do not
5531                  * break between regional indicator (RI) symbols if there is an
5532                  * odd number of RI characters before the potential break
5533                  * point.
5534                  *
5535                  * WB15   sot (RI RI)* RI × RI
5536                  * WB16 [^RI] (RI RI)* RI × RI */
5537
5538                 while (backup_one_WB(&previous,
5539                                      strbeg,
5540                                      &before_pos,
5541                                      utf8_target) == WB_Regional_Indicator)
5542                 {
5543                     RI_count++;
5544                 }
5545
5546                 return RI_count % 2 != 1;
5547             }
5548
5549         default:
5550             break;
5551     }
5552
5553 #ifdef DEBUGGING
5554     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5555                                   before, after, WB_table[before][after]);
5556     assert(0);
5557 #endif
5558     return TRUE;
5559 }
5560
5561 STATIC WB_enum
5562 S_advance_one_WB(pTHX_ U8 ** curpos,
5563                        const U8 * const strend,
5564                        const bool utf8_target,
5565                        const bool skip_Extend_Format)
5566 {
5567     WB_enum wb;
5568
5569     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5570
5571     if (*curpos >= strend) {
5572         return WB_EDGE;
5573     }
5574
5575     if (utf8_target) {
5576
5577         /* Advance over Extend and Format */
5578         do {
5579             *curpos += UTF8SKIP(*curpos);
5580             if (*curpos >= strend) {
5581                 return WB_EDGE;
5582             }
5583             wb = getWB_VAL_UTF8(*curpos, strend);
5584         } while (    skip_Extend_Format
5585                  && (wb == WB_Extend || wb == WB_Format));
5586     }
5587     else {
5588         do {
5589             (*curpos)++;
5590             if (*curpos >= strend) {
5591                 return WB_EDGE;
5592             }
5593             wb = getWB_VAL_CP(**curpos);
5594         } while (    skip_Extend_Format
5595                  && (wb == WB_Extend || wb == WB_Format));
5596     }
5597
5598     return wb;
5599 }
5600
5601 STATIC WB_enum
5602 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5603 {
5604     WB_enum wb;
5605
5606     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5607
5608     /* If we know what the previous character's break value is, don't have
5609         * to look it up */
5610     if (*previous != WB_UNKNOWN) {
5611         wb = *previous;
5612
5613         /* But we need to move backwards by one */
5614         if (utf8_target) {
5615             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5616             if (! *curpos) {
5617                 *previous = WB_EDGE;
5618                 *curpos = (U8 *) strbeg;
5619             }
5620             else {
5621                 *previous = WB_UNKNOWN;
5622             }
5623         }
5624         else {
5625             (*curpos)--;
5626             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5627         }
5628
5629         /* And we always back up over these three types */
5630         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5631             return wb;
5632         }
5633     }
5634
5635     if (*curpos < strbeg) {
5636         return WB_EDGE;
5637     }
5638
5639     if (utf8_target) {
5640         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5641         if (! prev_char_pos) {
5642             return WB_EDGE;
5643         }
5644
5645         /* Back up over Extend and Format.  curpos is always just to the right
5646          * of the characater whose value we are getting */
5647         do {
5648             U8 * prev_prev_char_pos;
5649             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5650                                                    -1,
5651                                                    strbeg)))
5652             {
5653                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5654                 *curpos = prev_char_pos;
5655                 prev_char_pos = prev_prev_char_pos;
5656             }
5657             else {
5658                 *curpos = (U8 *) strbeg;
5659                 return WB_EDGE;
5660             }
5661         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5662     }
5663     else {
5664         do {
5665             if (*curpos - 2 < strbeg) {
5666                 *curpos = (U8 *) strbeg;
5667                 return WB_EDGE;
5668             }
5669             (*curpos)--;
5670             wb = getWB_VAL_CP(*(*curpos - 1));
5671         } while (wb == WB_Extend || wb == WB_Format);
5672     }
5673
5674     return wb;
5675 }
5676
5677 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5678 (                                                           \
5679     (   ( st )                                         ) && \
5680     (   ( st )->u.eval.close_paren                     ) && \
5681     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5682 )
5683
5684 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5685 (                                                           \
5686     (   ( st )                                         ) && \
5687     (   ( st )->u.eval.close_paren                     ) && \
5688     (   ( expr )                                       ) && \
5689     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5690 )
5691
5692
5693 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5694     (st)->u.eval.close_paren = ( (expr) + 1 )
5695
5696 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5697     (st)->u.eval.close_paren = 0
5698
5699 /* returns -1 on failure, $+[0] on success */
5700 STATIC SSize_t
5701 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5702 {
5703     dVAR;
5704     const bool utf8_target = reginfo->is_utf8_target;
5705     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5706     REGEXP *rex_sv = reginfo->prog;
5707     regexp *rex = ReANY(rex_sv);
5708     RXi_GET_DECL(rex,rexi);
5709     /* the current state. This is a cached copy of PL_regmatch_state */
5710     regmatch_state *st;
5711     /* cache heavy used fields of st in registers */
5712     regnode *scan;
5713     regnode *next;
5714     U32 n = 0;  /* general value; init to avoid compiler warning */
5715     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5716     SSize_t endref = 0; /* offset of end of backref when ln is start */
5717     char *locinput = startpos;
5718     char *pushinput; /* where to continue after a PUSH */
5719     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5720
5721     bool result = 0;        /* return value of S_regmatch */
5722     U32 depth = 0;            /* depth of backtrack stack */
5723     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5724     const U32 max_nochange_depth =
5725         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5726         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5727     regmatch_state *yes_state = NULL; /* state to pop to on success of
5728                                                             subpattern */
5729     /* mark_state piggy backs on the yes_state logic so that when we unwind
5730        the stack on success we can update the mark_state as we go */
5731     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5732     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5733     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5734     U32 state_num;
5735     bool no_final = 0;      /* prevent failure from backtracking? */
5736     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5737     char *startpoint = locinput;
5738     SV *popmark = NULL;     /* are we looking for a mark? */
5739     SV *sv_commit = NULL;   /* last mark name seen in failure */
5740     SV *sv_yes_mark = NULL; /* last mark name we have seen
5741                                during a successful match */
5742     U32 lastopen = 0;       /* last open we saw */
5743     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5744     SV* const oreplsv = GvSVn(PL_replgv);
5745     /* these three flags are set by various ops to signal information to
5746      * the very next op. They have a useful lifetime of exactly one loop
5747      * iteration, and are not preserved or restored by state pushes/pops
5748      */
5749     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5750     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5751     int logical = 0;        /* the following EVAL is:
5752                                 0: (?{...})
5753                                 1: (?(?{...})X|Y)
5754                                 2: (??{...})
5755                                or the following IFMATCH/UNLESSM is:
5756                                 false: plain (?=foo)
5757                                 true:  used as a condition: (?(?=foo))
5758                             */
5759     PAD* last_pad = NULL;
5760     dMULTICALL;
5761     U8 gimme = G_SCALAR;
5762     CV *caller_cv = NULL;       /* who called us */
5763     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5764     U32 maxopenparen = 0;       /* max '(' index seen so far */
5765     int to_complement;  /* Invert the result? */
5766     _char_class_number classnum;
5767     bool is_utf8_pat = reginfo->is_utf8_pat;
5768     bool match = FALSE;
5769     I32 orig_savestack_ix = PL_savestack_ix;
5770     U8 * script_run_begin = NULL;
5771
5772 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5773 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5774 #  define SOLARIS_BAD_OPTIMIZER
5775     const U32 *pl_charclass_dup = PL_charclass;
5776 #  define PL_charclass pl_charclass_dup
5777 #endif
5778
5779 #ifdef DEBUGGING
5780     GET_RE_DEBUG_FLAGS_DECL;
5781 #endif
5782
5783     /* protect against undef(*^R) */
5784     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5785
5786     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5787     multicall_oldcatch = 0;
5788     PERL_UNUSED_VAR(multicall_cop);
5789
5790     PERL_ARGS_ASSERT_REGMATCH;
5791
5792     st = PL_regmatch_state;
5793
5794     /* Note that nextchr is a byte even in UTF */
5795     SET_nextchr;
5796     scan = prog;
5797
5798     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5799             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5800             Perl_re_printf( aTHX_ "regmatch start\n" );
5801     }));
5802
5803     while (scan != NULL) {
5804         next = scan + NEXT_OFF(scan);
5805         if (next == scan)
5806             next = NULL;
5807         state_num = OP(scan);
5808
5809       reenter_switch:
5810         DEBUG_EXECUTE_r(
5811             if (state_num <= REGNODE_MAX) {
5812                 SV * const prop = sv_newmortal();
5813                 regnode *rnext = regnext(scan);
5814
5815                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5816                 regprop(rex, prop, scan, reginfo, NULL);
5817                 Perl_re_printf( aTHX_
5818                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5819                     INDENT_CHARS(depth), "",
5820                     (IV)(scan - rexi->program),
5821                     SvPVX_const(prop),
5822                     (PL_regkind[OP(scan)] == END || !rnext) ?
5823                         0 : (IV)(rnext - rexi->program));
5824             }
5825         );
5826
5827         to_complement = 0;
5828
5829         SET_nextchr;
5830         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5831
5832         switch (state_num) {
5833         case SBOL: /*  /^../ and /\A../  */
5834             if (locinput == reginfo->strbeg)
5835                 break;
5836             sayNO;
5837
5838         case MBOL: /*  /^../m  */
5839             if (locinput == reginfo->strbeg ||
5840                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5841             {
5842                 break;
5843             }
5844             sayNO;
5845
5846         case GPOS: /*  \G  */
5847             if (locinput == reginfo->ganch)
5848                 break;
5849             sayNO;
5850
5851         case KEEPS: /*   \K  */
5852             /* update the startpoint */
5853             st->u.keeper.val = rex->offs[0].start;
5854             rex->offs[0].start = locinput - reginfo->strbeg;
5855             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5856             NOT_REACHED; /* NOTREACHED */
5857
5858         case KEEPS_next_fail:
5859             /* rollback the start point change */
5860             rex->offs[0].start = st->u.keeper.val;
5861             sayNO_SILENT;
5862             NOT_REACHED; /* NOTREACHED */
5863
5864         case MEOL: /* /..$/m  */
5865             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5866                 sayNO;
5867             break;
5868
5869         case SEOL: /* /..$/  */
5870             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5871                 sayNO;
5872             if (reginfo->strend - locinput > 1)
5873                 sayNO;
5874             break;
5875
5876         case EOS: /*  \z  */
5877             if (!NEXTCHR_IS_EOS)
5878                 sayNO;
5879             break;
5880
5881         case SANY: /*  /./s  */
5882             if (NEXTCHR_IS_EOS)
5883                 sayNO;
5884             goto increment_locinput;
5885
5886         case REG_ANY: /*  /./  */
5887             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5888                 sayNO;
5889             goto increment_locinput;
5890
5891
5892 #undef  ST
5893 #define ST st->u.trie
5894         case TRIEC: /* (ab|cd) with known charclass */
5895             /* In this case the charclass data is available inline so
5896                we can fail fast without a lot of extra overhead.
5897              */
5898             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5899                 DEBUG_EXECUTE_r(
5900                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5901                               depth, PL_colors[4], PL_colors[5])
5902                 );
5903                 sayNO_SILENT;
5904                 NOT_REACHED; /* NOTREACHED */
5905             }
5906             /* FALLTHROUGH */
5907         case TRIE:  /* (ab|cd)  */
5908             /* the basic plan of execution of the trie is:
5909              * At the beginning, run though all the states, and
5910              * find the longest-matching word. Also remember the position
5911              * of the shortest matching word. For example, this pattern:
5912              *    1  2 3 4    5
5913              *    ab|a|x|abcd|abc
5914              * when matched against the string "abcde", will generate
5915              * accept states for all words except 3, with the longest
5916              * matching word being 4, and the shortest being 2 (with
5917              * the position being after char 1 of the string).
5918              *
5919              * Then for each matching word, in word order (i.e. 1,2,4,5),
5920              * we run the remainder of the pattern; on each try setting
5921              * the current position to the character following the word,
5922              * returning to try the next word on failure.
5923              *
5924              * We avoid having to build a list of words at runtime by
5925              * using a compile-time structure, wordinfo[].prev, which
5926              * gives, for each word, the previous accepting word (if any).
5927              * In the case above it would contain the mappings 1->2, 2->0,
5928              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5929              * the longest word (4 above), a list of all words, by
5930              * following the list of prev pointers; this gives us the
5931              * unordered list 4,5,1,2. Then given the current word we have
5932              * just tried, we can go through the list and find the
5933              * next-biggest word to try (so if we just failed on word 2,
5934              * the next in the list is 4).
5935              *
5936              * Since at runtime we don't record the matching position in
5937              * the string for each word, we have to work that out for
5938              * each word we're about to process. The wordinfo table holds
5939              * the character length of each word; given that we recorded
5940              * at the start: the position of the shortest word and its
5941              * length in chars, we just need to move the pointer the
5942              * difference between the two char lengths. Depending on
5943              * Unicode status and folding, that's cheap or expensive.
5944              *
5945              * This algorithm is optimised for the case where are only a
5946              * small number of accept states, i.e. 0,1, or maybe 2.
5947              * With lots of accepts states, and having to try all of them,
5948              * it becomes quadratic on number of accept states to find all
5949              * the next words.
5950              */
5951
5952             {
5953                 /* what type of TRIE am I? (utf8 makes this contextual) */
5954                 DECL_TRIE_TYPE(scan);
5955
5956                 /* what trie are we using right now */
5957                 reg_trie_data * const trie
5958                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5959                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5960                 U32 state = trie->startstate;
5961
5962                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5963                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5964                     if (utf8_target
5965                         && ! NEXTCHR_IS_EOS
5966                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5967                         && scan->flags == EXACTL)
5968                     {
5969                         /* We only output for EXACTL, as we let the folder
5970                          * output this message for EXACTFLU8 to avoid
5971                          * duplication */
5972                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5973                                                                reginfo->strend);
5974                     }
5975                 }
5976                 if (   trie->bitmap
5977                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5978                 {
5979                     if (trie->states[ state ].wordnum) {
5980                          DEBUG_EXECUTE_r(
5981                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5982                                           depth, PL_colors[4], PL_colors[5])
5983                         );
5984                         if (!trie->jump)
5985                             break;
5986                     } else {
5987                         DEBUG_EXECUTE_r(
5988                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5989                                           depth, PL_colors[4], PL_colors[5])
5990                         );
5991                         sayNO_SILENT;
5992                    }
5993                 }
5994
5995             {
5996                 U8 *uc = ( U8* )locinput;
5997
5998                 STRLEN len = 0;
5999                 STRLEN foldlen = 0;
6000                 U8 *uscan = (U8*)NULL;
6001                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6002                 U32 charcount = 0; /* how many input chars we have matched */
6003                 U32 accepted = 0; /* have we seen any accepting states? */
6004
6005                 ST.jump = trie->jump;
6006                 ST.me = scan;
6007                 ST.firstpos = NULL;
6008                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6009                 ST.nextword = 0;
6010
6011                 /* fully traverse the TRIE; note the position of the
6012                    shortest accept state and the wordnum of the longest
6013                    accept state */
6014
6015                 while ( state && uc <= (U8*)(reginfo->strend) ) {
6016                     U32 base = trie->states[ state ].trans.base;
6017                     UV uvc = 0;
6018                     U16 charid = 0;
6019                     U16 wordnum;
6020                     wordnum = trie->states[ state ].wordnum;
6021
6022                     if (wordnum) { /* it's an accept state */
6023                         if (!accepted) {
6024                             accepted = 1;
6025                             /* record first match position */
6026                             if (ST.longfold) {
6027                                 ST.firstpos = (U8*)locinput;
6028                                 ST.firstchars = 0;
6029                             }
6030                             else {
6031                                 ST.firstpos = uc;
6032                                 ST.firstchars = charcount;
6033                             }
6034                         }
6035                         if (!ST.nextword || wordnum < ST.nextword)
6036                             ST.nextword = wordnum;
6037                         ST.topword = wordnum;
6038                     }
6039
6040                     DEBUG_TRIE_EXECUTE_r({
6041                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6042                                 /* HERE */
6043                                 PerlIO_printf( Perl_debug_log,
6044                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6045                                     INDENT_CHARS(depth), "", PL_colors[4],
6046                                     (UV)state, (accepted ? 'Y' : 'N'));
6047                     });
6048
6049                     /* read a char and goto next state */
6050                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6051                         I32 offset;
6052                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6053                                              (U8 *) reginfo->strend, uscan,
6054                                              len, uvc, charid, foldlen,
6055                                              foldbuf, uniflags);
6056                         charcount++;
6057                         if (foldlen>0)
6058                             ST.longfold = TRUE;
6059                         if (charid &&
6060                              ( ((offset =
6061                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6062
6063                              && ((U32)offset < trie->lasttrans)
6064                              && trie->trans[offset].check == state)
6065                         {
6066                             state = trie->trans[offset].next;
6067                         }
6068                         else {
6069                             state = 0;
6070                         }
6071                         uc += len;
6072
6073                     }
6074                     else {
6075                         state = 0;
6076                     }
6077                     DEBUG_TRIE_EXECUTE_r(
6078                         Perl_re_printf( aTHX_
6079                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6080                             charid, uvc, (UV)state, PL_colors[5] );
6081                     );
6082                 }
6083                 if (!accepted)
6084                    sayNO;
6085
6086                 /* calculate total number of accept states */
6087                 {
6088                     U16 w = ST.topword;
6089                     accepted = 0;
6090                     while (w) {
6091                         w = trie->wordinfo[w].prev;
6092                         accepted++;
6093                     }
6094                     ST.accepted = accepted;
6095                 }
6096
6097                 DEBUG_EXECUTE_r(
6098                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6099                         depth,
6100                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6101                 );
6102                 goto trie_first_try; /* jump into the fail handler */
6103             }}
6104             NOT_REACHED; /* NOTREACHED */
6105
6106         case TRIE_next_fail: /* we failed - try next alternative */
6107         {
6108             U8 *uc;
6109             if ( ST.jump ) {
6110                 /* undo any captures done in the tail part of a branch,
6111                  * e.g.
6112                  *    /(?:X(.)(.)|Y(.)).../
6113                  * where the trie just matches X then calls out to do the
6114                  * rest of the branch */
6115                 REGCP_UNWIND(ST.cp);
6116                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6117             }
6118             if (!--ST.accepted) {
6119                 DEBUG_EXECUTE_r({
6120                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6121                         depth,
6122                         PL_colors[4],
6123                         PL_colors[5] );
6124                 });
6125                 sayNO_SILENT;
6126             }
6127             {
6128                 /* Find next-highest word to process.  Note that this code
6129                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6130                 U16 min = 0;
6131                 U16 word;
6132                 U16 const nextword = ST.nextword;
6133                 reg_trie_wordinfo * const wordinfo
6134                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6135                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6136                     if (word > nextword && (!min || word < min))
6137                         min = word;
6138                 }
6139                 ST.nextword = min;
6140             }
6141
6142           trie_first_try:
6143             if (do_cutgroup) {
6144                 do_cutgroup = 0;
6145                 no_final = 0;
6146             }
6147
6148             if ( ST.jump ) {
6149                 ST.lastparen = rex->lastparen;
6150                 ST.lastcloseparen = rex->lastcloseparen;
6151                 REGCP_SET(ST.cp);
6152             }
6153
6154             /* find start char of end of current word */
6155             {
6156                 U32 chars; /* how many chars to skip */
6157                 reg_trie_data * const trie
6158                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6159
6160                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6161                             >=  ST.firstchars);
6162                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6163                             - ST.firstchars;
6164                 uc = ST.firstpos;
6165
6166                 if (ST.longfold) {
6167                     /* the hard option - fold each char in turn and find
6168                      * its folded length (which may be different */
6169                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6170                     STRLEN foldlen;
6171                     STRLEN len;
6172                     UV uvc;
6173                     U8 *uscan;
6174
6175                     while (chars) {
6176                         if (utf8_target) {
6177                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6178                                                     uniflags);
6179                             uc += len;
6180                         }
6181                         else {
6182                             uvc = *uc;
6183                             uc++;
6184                         }
6185                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6186                         uscan = foldbuf;
6187                         while (foldlen) {
6188                             if (!--chars)
6189                                 break;
6190                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6191                                                  uniflags);
6192                             uscan += len;
6193                             foldlen -= len;
6194                         }
6195                     }
6196                 }
6197                 else {
6198                     if (utf8_target)
6199                         while (chars--)
6200                             uc += UTF8SKIP(uc);
6201                     else
6202                         uc += chars;
6203                 }
6204             }
6205
6206             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6207                             ? ST.jump[ST.nextword]
6208                             : NEXT_OFF(ST.me));
6209
6210             DEBUG_EXECUTE_r({
6211                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6212                     depth,
6213                     PL_colors[4],
6214                     ST.nextword,
6215                     PL_colors[5]
6216                     );
6217             });
6218
6219             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6220                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6221                 NOT_REACHED; /* NOTREACHED */
6222             }
6223             /* only one choice left - just continue */
6224             DEBUG_EXECUTE_r({
6225                 AV *const trie_words
6226                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6227                 SV ** const tmp = trie_words
6228                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6229                 SV *sv= tmp ? sv_newmortal() : NULL;
6230
6231                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6232                     depth, PL_colors[4],
6233                     ST.nextword,
6234                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6235                             PL_colors[0], PL_colors[1],
6236                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6237                         )
6238                     : "not compiled under -Dr",
6239                     PL_colors[5] );
6240             });
6241
6242             locinput = (char*)uc;
6243             continue; /* execute rest of RE */
6244             /* NOTREACHED */
6245         }
6246 #undef  ST
6247
6248         case EXACTL:             /*  /abc/l       */
6249             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6250
6251             /* Complete checking would involve going through every character
6252              * matched by the string to see if any is above latin1.  But the
6253              * comparision otherwise might very well be a fast assembly
6254              * language routine, and I (khw) don't think slowing things down
6255              * just to check for this warning is worth it.  So this just checks
6256              * the first character */
6257             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6258                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6259             }
6260             goto do_exact;
6261         case EXACT_ONLY8:
6262             if (! utf8_target) {
6263                 sayNO;
6264             }
6265             /* FALLTHROUGH */
6266         case EXACT: {            /*  /abc/        */
6267             char *s;
6268           do_exact:
6269             s = STRING(scan);
6270             ln = STR_LEN(scan);
6271             if (utf8_target != is_utf8_pat) {
6272                 /* The target and the pattern have differing utf8ness. */
6273                 char *l = locinput;
6274                 const char * const e = s + ln;
6275
6276                 if (utf8_target) {
6277                     /* The target is utf8, the pattern is not utf8.
6278                      * Above-Latin1 code points can't match the pattern;
6279                      * invariants match exactly, and the other Latin1 ones need
6280                      * to be downgraded to a single byte in order to do the
6281                      * comparison.  (If we could be confident that the target
6282                      * is not malformed, this could be refactored to have fewer
6283                      * tests by just assuming that if the first bytes match, it
6284                      * is an invariant, but there are tests in the test suite
6285                      * dealing with (??{...}) which violate this) */
6286                     while (s < e) {
6287                         if (l >= reginfo->strend
6288                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6289                         {
6290                             sayNO;
6291                         }
6292                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6293                             if (*l != *s) {
6294                                 sayNO;
6295                             }
6296                             l++;
6297                         }
6298                         else {
6299                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6300                             {
6301                                 sayNO;
6302                             }
6303                             l += 2;
6304                         }
6305                         s++;
6306                     }
6307                 }
6308                 else {
6309                     /* The target is not utf8, the pattern is utf8. */
6310                     while (s < e) {
6311                         if (l >= reginfo->strend
6312                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6313                         {
6314                             sayNO;
6315                         }
6316                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6317                             if (*s != *l) {
6318                                 sayNO;
6319                             }
6320                             s++;
6321                         }
6322                         else {
6323                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6324                             {
6325                                 sayNO;
6326                             }
6327                             s += 2;
6328                         }
6329                         l++;
6330                     }
6331                 }
6332                 locinput = l;
6333             }
6334             else {
6335                 /* The target and the pattern have the same utf8ness. */
6336                 /* Inline the first character, for speed. */
6337                 if (reginfo->strend - locinput < ln
6338                     || UCHARAT(s) != nextchr
6339                     || (ln > 1 && memNE(s, locinput, ln)))
6340                 {
6341                     sayNO;
6342                 }
6343                 locinput += ln;
6344             }
6345             break;
6346             }
6347
6348         case EXACTFL: {          /*  /abc/il      */
6349             re_fold_t folder;
6350             const U8 * fold_array;
6351             const char * s;
6352             U32 fold_utf8_flags;
6353
6354             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6355             folder = foldEQ_locale;
6356             fold_array = PL_fold_locale;
6357             fold_utf8_flags = FOLDEQ_LOCALE;
6358             goto do_exactf;
6359
6360         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6361                                       is effectively /u; hence to match, target
6362                                       must be UTF-8. */
6363             if (! utf8_target) {
6364                 sayNO;
6365             }
6366             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6367                                              | FOLDEQ_S1_FOLDS_SANE;
6368             folder = foldEQ_latin1;
6369             fold_array = PL_fold_latin1;
6370             goto do_exactf;
6371
6372         case EXACTFU_ONLY8:      /* /abc/iu with something in /abc/ > 255 */
6373             if (! utf8_target) {
6374                 sayNO;
6375             }
6376             assert(is_utf8_pat);
6377             fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED;
6378             goto do_exactf;
6379
6380         case EXACTFU_SS:         /*  /\x{df}/iu   */
6381         case EXACTFU:            /*  /abc/iu      */
6382             folder = foldEQ_latin1;
6383             fold_array = PL_fold_latin1;
6384             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6385             goto do_exactf;
6386
6387         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6388                                    patterns */
6389             assert(! is_utf8_pat);
6390             /* FALLTHROUGH */
6391         case EXACTFAA:            /*  /abc/iaa     */
6392             folder = foldEQ_latin1;
6393             fold_array = PL_fold_latin1;
6394             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6395             goto do_exactf;
6396
6397         case EXACTF:             /*  /abc/i    This node only generated for
6398                                                non-utf8 patterns */
6399             assert(! is_utf8_pat);
6400             folder = foldEQ;
6401             fold_array = PL_fold;
6402             fold_utf8_flags = 0;
6403
6404           do_exactf:
6405             s = STRING(scan);
6406             ln = STR_LEN(scan);
6407
6408             if (utf8_target
6409                 || is_utf8_pat
6410                 || state_num == EXACTFU_SS
6411                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6412             {
6413               /* Either target or the pattern are utf8, or has the issue where
6414                * the fold lengths may differ. */
6415                 const char * const l = locinput;
6416                 char *e = reginfo->strend;
6417
6418                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6419                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6420                 {
6421                     sayNO;
6422                 }
6423                 locinput = e;
6424                 break;
6425             }
6426
6427             /* Neither the target nor the pattern are utf8 */
6428             if (UCHARAT(s) != nextchr
6429                 && !NEXTCHR_IS_EOS
6430                 && UCHARAT(s) != fold_array[nextchr])
6431             {
6432                 sayNO;
6433             }
6434             if (reginfo->strend - locinput < ln)
6435                 sayNO;
6436             if (ln > 1 && ! folder(s, locinput, ln))
6437                 sayNO;
6438             locinput += ln;
6439             break;
6440         }
6441
6442         case NBOUNDL: /*  /\B/l  */
6443             to_complement = 1;
6444             /* FALLTHROUGH */
6445
6446         case BOUNDL:  /*  /\b/l  */
6447         {
6448             bool b1, b2;
6449             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6450
6451             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6452                 if (! IN_UTF8_CTYPE_LOCALE) {
6453                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6454                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6455                 }
6456                 goto boundu;
6457             }
6458
6459             if (utf8_target) {
6460                 if (locinput == reginfo->strbeg)
6461                     b1 = isWORDCHAR_LC('\n');
6462                 else {
6463                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6464                                                         (U8*)(reginfo->strbeg)),
6465                                                  (U8*)(reginfo->strend));
6466                 }
6467                 b2 = (NEXTCHR_IS_EOS)
6468                     ? isWORDCHAR_LC('\n')
6469                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6470                                               (U8*) reginfo->strend);
6471             }
6472             else { /* Here the string isn't utf8 */
6473                 b1 = (locinput == reginfo->strbeg)
6474                      ? isWORDCHAR_LC('\n')
6475                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6476                 b2 = (NEXTCHR_IS_EOS)
6477                     ? isWORDCHAR_LC('\n')
6478                     : isWORDCHAR_LC(nextchr);
6479             }
6480             if (to_complement ^ (b1 == b2)) {
6481                 sayNO;
6482             }
6483             break;
6484         }
6485
6486         case NBOUND:  /*  /\B/   */
6487             to_complement = 1;
6488             /* FALLTHROUGH */
6489
6490         case BOUND:   /*  /\b/   */
6491             if (utf8_target) {
6492                 goto bound_utf8;
6493             }
6494             goto bound_ascii_match_only;
6495
6496         case NBOUNDA: /*  /\B/a  */
6497             to_complement = 1;
6498             /* FALLTHROUGH */
6499
6500         case BOUNDA:  /*  /\b/a  */
6501         {
6502             bool b1, b2;
6503
6504           bound_ascii_match_only:
6505             /* Here the string isn't utf8, or is utf8 and only ascii characters
6506              * are to match \w.  In the latter case looking at the byte just
6507              * prior to the current one may be just the final byte of a
6508              * multi-byte character.  This is ok.  There are two cases:
6509              * 1) it is a single byte character, and then the test is doing
6510              *    just what it's supposed to.
6511              * 2) it is a multi-byte character, in which case the final byte is
6512              *    never mistakable for ASCII, and so the test will say it is
6513              *    not a word character, which is the correct answer. */
6514             b1 = (locinput == reginfo->strbeg)
6515                  ? isWORDCHAR_A('\n')
6516                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6517             b2 = (NEXTCHR_IS_EOS)
6518                 ? isWORDCHAR_A('\n')
6519                 : isWORDCHAR_A(nextchr);
6520             if (to_complement ^ (b1 == b2)) {
6521                 sayNO;
6522             }
6523             break;
6524         }
6525
6526         case NBOUNDU: /*  /\B/u  */
6527             to_complement = 1;
6528             /* FALLTHROUGH */
6529
6530         case BOUNDU:  /*  /\b/u  */
6531
6532           boundu:
6533             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6534                 match = FALSE;
6535             }
6536             else if (utf8_target) {
6537               bound_utf8:
6538                 switch((bound_type) FLAGS(scan)) {
6539                     case TRADITIONAL_BOUND:
6540                     {
6541                         bool b1, b2;
6542                         b1 = (locinput == reginfo->strbeg)
6543                              ? 0 /* isWORDCHAR_L1('\n') */
6544                              : isWORDCHAR_utf8_safe(
6545                                                reghop3((U8*)locinput,
6546                                                        -1,
6547                                                        (U8*)(reginfo->strbeg)),
6548                                                     (U8*) reginfo->strend);
6549                         b2 = (NEXTCHR_IS_EOS)
6550                             ? 0 /* isWORDCHAR_L1('\n') */
6551                             : isWORDCHAR_utf8_safe((U8*)locinput,
6552                                                    (U8*) reginfo->strend);
6553                         match = cBOOL(b1 != b2);
6554                         break;
6555                     }
6556                     case GCB_BOUND:
6557                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6558                             match = TRUE; /* GCB always matches at begin and
6559                                              end */
6560                         }
6561                         else {
6562                             /* Find the gcb values of previous and current
6563                              * chars, then see if is a break point */
6564                             match = isGCB(getGCB_VAL_UTF8(
6565                                                 reghop3((U8*)locinput,
6566                                                         -1,
6567                                                         (U8*)(reginfo->strbeg)),
6568                                                 (U8*) reginfo->strend),
6569                                           getGCB_VAL_UTF8((U8*) locinput,
6570                                                         (U8*) reginfo->strend),
6571                                           (U8*) reginfo->strbeg,
6572                                           (U8*) locinput,
6573                                           utf8_target);
6574                         }
6575                         break;
6576
6577                     case LB_BOUND:
6578                         if (locinput == reginfo->strbeg) {
6579                             match = FALSE;
6580                         }
6581                         else if (NEXTCHR_IS_EOS) {
6582                             match = TRUE;
6583                         }
6584                         else {
6585                             match = isLB(getLB_VAL_UTF8(
6586                                                 reghop3((U8*)locinput,
6587                                                         -1,
6588                                                         (U8*)(reginfo->strbeg)),
6589                                                 (U8*) reginfo->strend),
6590                                           getLB_VAL_UTF8((U8*) locinput,
6591                                                         (U8*) reginfo->strend),
6592                                           (U8*) reginfo->strbeg,
6593                                           (U8*) locinput,
6594                                           (U8*) reginfo->strend,
6595                                           utf8_target);
6596                         }
6597                         break;
6598
6599                     case SB_BOUND: /* Always matches at begin and end */
6600                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6601                             match = TRUE;
6602                         }
6603                         else {
6604                             match = isSB(getSB_VAL_UTF8(
6605                                                 reghop3((U8*)locinput,
6606                                                         -1,
6607                                                         (U8*)(reginfo->strbeg)),
6608                                                 (U8*) reginfo->strend),
6609                                           getSB_VAL_UTF8((U8*) locinput,
6610                                                         (U8*) reginfo->strend),
6611                                           (U8*) reginfo->strbeg,
6612                                           (U8*) locinput,
6613                                           (U8*) reginfo->strend,
6614                                           utf8_target);
6615                         }
6616                         break;
6617
6618                     case WB_BOUND:
6619                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6620                             match = TRUE;
6621                         }
6622                         else {
6623                             match = isWB(WB_UNKNOWN,
6624                                          getWB_VAL_UTF8(
6625                                                 reghop3((U8*)locinput,
6626                                                         -1,
6627                                                         (U8*)(reginfo->strbeg)),
6628                                                 (U8*) reginfo->strend),
6629                                           getWB_VAL_UTF8((U8*) locinput,
6630                                                         (U8*) reginfo->strend),
6631                                           (U8*) reginfo->strbeg,
6632                                           (U8*) locinput,
6633                                           (U8*) reginfo->strend,
6634                                           utf8_target);
6635                         }
6636                         break;
6637                 }
6638             }
6639             else {  /* Not utf8 target */
6640                 switch((bound_type) FLAGS(scan)) {
6641                     case TRADITIONAL_BOUND:
6642                     {
6643                         bool b1, b2;
6644                         b1 = (locinput == reginfo->strbeg)
6645                             ? 0 /* isWORDCHAR_L1('\n') */
6646                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6647                         b2 = (NEXTCHR_IS_EOS)
6648                             ? 0 /* isWORDCHAR_L1('\n') */
6649                             : isWORDCHAR_L1(nextchr);
6650                         match = cBOOL(b1 != b2);
6651                         break;
6652                     }
6653
6654                     case GCB_BOUND:
6655                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6656                             match = TRUE; /* GCB always matches at begin and
6657                                              end */
6658                         }
6659                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6660                                    range */
6661                             match =    UCHARAT(locinput - 1) != '\r'
6662                                     || UCHARAT(locinput) != '\n';
6663                         }
6664                         break;
6665
6666                     case LB_BOUND:
6667                         if (locinput == reginfo->strbeg) {
6668                             match = FALSE;
6669                         }
6670                         else if (NEXTCHR_IS_EOS) {
6671                             match = TRUE;
6672                         }
6673                         else {
6674                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6675                                          getLB_VAL_CP(UCHARAT(locinput)),
6676                                          (U8*) reginfo->strbeg,
6677                                          (U8*) locinput,
6678                                          (U8*) reginfo->strend,
6679                                          utf8_target);
6680                         }
6681                         break;
6682
6683                     case SB_BOUND: /* Always matches at begin and end */
6684                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6685                             match = TRUE;
6686                         }
6687                         else {
6688                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6689                                          getSB_VAL_CP(UCHARAT(locinput)),
6690                                          (U8*) reginfo->strbeg,
6691                                          (U8*) locinput,
6692                                          (U8*) reginfo->strend,
6693                                          utf8_target);
6694                         }
6695                         break;
6696
6697                     case WB_BOUND:
6698                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6699                             match = TRUE;
6700                         }
6701                         else {
6702                             match = isWB(WB_UNKNOWN,
6703                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6704                                          getWB_VAL_CP(UCHARAT(locinput)),
6705                                          (U8*) reginfo->strbeg,
6706                                          (U8*) locinput,
6707                                          (U8*) reginfo->strend,
6708                                          utf8_target);
6709                         }
6710                         break;
6711                 }
6712             }
6713
6714             if (to_complement ^ ! match) {
6715                 sayNO;
6716             }
6717             break;
6718
6719         case ANYOFPOSIXL:
6720         case ANYOFL:  /*  /[abc]/l      */
6721             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6722
6723             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6724             {
6725               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6726             }
6727             /* FALLTHROUGH */
6728         case ANYOFD:  /*   /[abc]/d       */
6729         case ANYOF:  /*   /[abc]/       */
6730             if (NEXTCHR_IS_EOS)
6731                 sayNO;
6732             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6733                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6734             {
6735                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6736                     sayNO;
6737                 }
6738                 locinput++;
6739             }
6740             else {
6741                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6742                                                                    utf8_target))
6743                 {
6744                     sayNO;
6745                 }
6746                 goto increment_locinput;
6747             }
6748             break;
6749
6750         case ANYOFM:
6751             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6752                 sayNO;
6753             }
6754             locinput++; /* ANYOFM is always single byte */
6755             break;
6756
6757         case NANYOFM:
6758             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)) {
6759                 sayNO;
6760             }
6761             goto increment_locinput;
6762             break;
6763
6764         case ASCII:
6765             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6766                 sayNO;
6767             }
6768
6769             locinput++;     /* ASCII is always single byte */
6770             break;
6771
6772         case NASCII:
6773             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6774                 sayNO;
6775             }
6776
6777             goto increment_locinput;
6778             break;
6779
6780         /* The argument (FLAGS) to all the POSIX node types is the class number
6781          * */
6782
6783         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6784             to_complement = 1;
6785             /* FALLTHROUGH */
6786
6787         case POSIXL:    /* \w or [:punct:] etc. under /l */
6788             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6789             if (NEXTCHR_IS_EOS)
6790                 sayNO;
6791
6792             /* Use isFOO_lc() for characters within Latin1.  (Note that
6793              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6794              * wouldn't be invariant) */
6795             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6796                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6797                     sayNO;
6798                 }
6799
6800                 locinput++;
6801                 break;
6802             }
6803
6804             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6805                 /* An above Latin-1 code point, or malformed */
6806                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6807                                                        reginfo->strend);
6808                 goto utf8_posix_above_latin1;
6809             }
6810
6811             /* Here is a UTF-8 variant code point below 256 and the target is
6812              * UTF-8 */
6813             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6814                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6815                                             *(locinput + 1))))))
6816             {
6817                 sayNO;
6818             }
6819
6820             goto increment_locinput;
6821
6822         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6823             to_complement = 1;
6824             /* FALLTHROUGH */
6825
6826         case POSIXD:    /* \w or [:punct:] etc. under /d */
6827             if (utf8_target) {
6828                 goto utf8_posix;
6829             }
6830             goto posixa;
6831
6832         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6833
6834             if (NEXTCHR_IS_EOS) {
6835                 sayNO;
6836             }
6837
6838             /* All UTF-8 variants match */
6839             if (! UTF8_IS_INVARIANT(nextchr)) {
6840                 goto increment_locinput;
6841             }
6842
6843             to_complement = 1;
6844             goto join_nposixa;
6845
6846         case POSIXA:    /* \w or [:punct:] etc. under /a */
6847
6848           posixa:
6849             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6850              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6851              * character is a single byte */
6852
6853             if (NEXTCHR_IS_EOS) {
6854                 sayNO;
6855             }
6856
6857           join_nposixa:
6858
6859             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6860                                                                 FLAGS(scan)))))
6861             {
6862                 sayNO;
6863             }
6864
6865             /* Here we are either not in utf8, or we matched a utf8-invariant,
6866              * so the next char is the next byte */
6867             locinput++;
6868             break;
6869
6870         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6871             to_complement = 1;
6872             /* FALLTHROUGH */
6873
6874         case POSIXU:    /* \w or [:punct:] etc. under /u */
6875           utf8_posix:
6876             if (NEXTCHR_IS_EOS) {
6877                 sayNO;
6878             }
6879
6880             /* Use _generic_isCC() for characters within Latin1.  (Note that
6881              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6882              * wouldn't be invariant) */
6883             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6884                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6885                                                            FLAGS(scan)))))
6886                 {
6887                     sayNO;
6888                 }
6889                 locinput++;
6890             }
6891             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6892                 if (! (to_complement
6893                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6894                                                                *(locinput + 1)),
6895                                              FLAGS(scan)))))
6896                 {
6897                     sayNO;
6898                 }
6899                 locinput += 2;
6900             }
6901             else {  /* Handle above Latin-1 code points */
6902               utf8_posix_above_latin1:
6903                 classnum = (_char_class_number) FLAGS(scan);
6904                 switch (classnum) {
6905                     default:
6906                         if (! (to_complement
6907                            ^ cBOOL(_invlist_contains_cp(
6908                                       PL_XPosix_ptrs[classnum],
6909                                       utf8_to_uvchr_buf((U8 *) locinput,
6910                                                         (U8 *) reginfo->strend,
6911                                                         NULL)))))
6912                         {
6913                             sayNO;
6914                         }
6915                         break;
6916                     case _CC_ENUM_SPACE:
6917                         if (! (to_complement
6918                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6919                         {
6920                             sayNO;
6921                         }
6922                         break;
6923                     case _CC_ENUM_BLANK:
6924                         if (! (to_complement
6925                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6926                         {
6927                             sayNO;
6928                         }
6929                         break;
6930                     case _CC_ENUM_XDIGIT:
6931                         if (! (to_complement
6932                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6933                         {
6934                             sayNO;
6935                         }
6936                         break;
6937                     case _CC_ENUM_VERTSPACE:
6938                         if (! (to_complement
6939                                         ^ cBOOL(is_VERTWS_high(locinput))))
6940                         {
6941                             sayNO;
6942                         }
6943                         break;
6944                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6945                     case _CC_ENUM_ASCII:
6946                         if (! to_complement) {
6947                             sayNO;
6948                         }
6949                         break;
6950                 }
6951                 locinput += UTF8SKIP(locinput);
6952             }
6953             break;
6954
6955         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6956                        a Unicode extended Grapheme Cluster */
6957             if (NEXTCHR_IS_EOS)
6958                 sayNO;
6959             if  (! utf8_target) {
6960
6961                 /* Match either CR LF  or '.', as all the other possibilities
6962                  * require utf8 */
6963                 locinput++;         /* Match the . or CR */
6964                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6965                                        match the LF */
6966                     && locinput < reginfo->strend
6967                     && UCHARAT(locinput) == '\n')
6968                 {
6969                     locinput++;
6970                 }
6971             }
6972             else {
6973
6974                 /* Get the gcb type for the current character */
6975                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6976                                                        (U8*) reginfo->strend);
6977
6978                 /* Then scan through the input until we get to the first
6979                  * character whose type is supposed to be a gcb with the
6980                  * current character.  (There is always a break at the
6981                  * end-of-input) */
6982                 locinput += UTF8SKIP(locinput);
6983                 while (locinput < reginfo->strend) {
6984                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6985                                                          (U8*) reginfo->strend);
6986                     if (isGCB(prev_gcb, cur_gcb,
6987                               (U8*) reginfo->strbeg, (U8*) locinput,
6988                               utf8_target))
6989                     {
6990                         break;
6991                     }
6992
6993                     prev_gcb = cur_gcb;
6994                     locinput += UTF8SKIP(locinput);
6995                 }
6996
6997
6998             }
6999             break;
7000
7001         case NREFFL:  /*  /\g{name}/il  */
7002         {   /* The capture buffer cases.  The ones beginning with N for the
7003                named buffers just convert to the equivalent numbered and
7004                pretend they were called as the corresponding numbered buffer
7005                op.  */
7006             /* don't initialize these in the declaration, it makes C++
7007                unhappy */
7008             const char *s;
7009             char type;
7010             re_fold_t folder;
7011             const U8 *fold_array;
7012             UV utf8_fold_flags;
7013
7014             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7015             folder = foldEQ_locale;
7016             fold_array = PL_fold_locale;
7017             type = REFFL;
7018             utf8_fold_flags = FOLDEQ_LOCALE;
7019             goto do_nref;
7020
7021         case NREFFA:  /*  /\g{name}/iaa  */
7022             folder = foldEQ_latin1;
7023             fold_array = PL_fold_latin1;
7024             type = REFFA;
7025             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7026             goto do_nref;
7027
7028         case NREFFU:  /*  /\g{name}/iu  */
7029             folder = foldEQ_latin1;
7030             fold_array = PL_fold_latin1;
7031             type = REFFU;
7032             utf8_fold_flags = 0;
7033             goto do_nref;
7034
7035         case NREFF:  /*  /\g{name}/i  */
7036             folder = foldEQ;
7037             fold_array = PL_fold;
7038             type = REFF;
7039             utf8_fold_flags = 0;
7040             goto do_nref;
7041
7042         case NREF:  /*  /\g{name}/   */
7043             type = REF;
7044             folder = NULL;
7045             fold_array = NULL;
7046             utf8_fold_flags = 0;
7047           do_nref:
7048
7049             /* For the named back references, find the corresponding buffer
7050              * number */
7051             n = reg_check_named_buff_matched(rex,scan);
7052
7053             if ( ! n ) {
7054                 sayNO;
7055             }
7056             goto do_nref_ref_common;
7057
7058         case REFFL:  /*  /\1/il  */
7059             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7060             folder = foldEQ_locale;
7061             fold_array = PL_fold_locale;
7062             utf8_fold_flags = FOLDEQ_LOCALE;
7063             goto do_ref;
7064
7065         case REFFA:  /*  /\1/iaa  */
7066             folder = foldEQ_latin1;
7067             fold_array = PL_fold_latin1;
7068             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7069             goto do_ref;
7070
7071         case REFFU:  /*  /\1/iu  */
7072             folder = foldEQ_latin1;
7073             fold_array = PL_fold_latin1;
7074             utf8_fold_flags = 0;
7075             goto do_ref;
7076
7077         case REFF:  /*  /\1/i  */
7078             folder = foldEQ;
7079             fold_array = PL_fold;
7080             utf8_fold_flags = 0;
7081             goto do_ref;
7082
7083         case REF:  /*  /\1/    */
7084             folder = NULL;
7085             fold_array = NULL;
7086             utf8_fold_flags = 0;
7087
7088           do_ref:
7089             type = OP(scan);
7090             n = ARG(scan);  /* which paren pair */
7091
7092           do_nref_ref_common:
7093             ln = rex->offs[n].start;
7094             endref = rex->offs[n].end;
7095             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7096             if (rex->lastparen < n || ln == -1 || endref == -1)
7097                 sayNO;                  /* Do not match unless seen CLOSEn. */
7098             if (ln == endref)
7099                 break;
7100
7101             s = reginfo->strbeg + ln;
7102             if (type != REF     /* REF can do byte comparison */
7103                 && (utf8_target || type == REFFU || type == REFFL))
7104             {
7105                 char * limit = reginfo->strend;
7106
7107                 /* This call case insensitively compares the entire buffer
7108                     * at s, with the current input starting at locinput, but
7109                     * not going off the end given by reginfo->strend, and
7110                     * returns in <limit> upon success, how much of the
7111                     * current input was matched */
7112                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7113                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7114                 {
7115                     sayNO;
7116                 }
7117                 locinput = limit;
7118                 break;
7119             }
7120
7121             /* Not utf8:  Inline the first character, for speed. */
7122             if (!NEXTCHR_IS_EOS &&
7123                 UCHARAT(s) != nextchr &&
7124                 (type == REF ||
7125                  UCHARAT(s) != fold_array[nextchr]))
7126                 sayNO;
7127             ln = endref - ln;
7128             if (locinput + ln > reginfo->strend)
7129                 sayNO;
7130             if (ln > 1 && (type == REF
7131                            ? memNE(s, locinput, ln)
7132                            : ! folder(s, locinput, ln)))
7133                 sayNO;
7134             locinput += ln;
7135             break;
7136         }
7137
7138         case NOTHING: /* null op; e.g. the 'nothing' following
7139                        * the '*' in m{(a+|b)*}' */
7140             break;
7141         case TAIL: /* placeholder while compiling (A|B|C) */
7142             break;
7143
7144 #undef  ST
7145 #define ST st->u.eval
7146 #define CUR_EVAL cur_eval->u.eval
7147
7148         {
7149             SV *ret;
7150             REGEXP *re_sv;
7151             regexp *re;
7152             regexp_internal *rei;
7153             regnode *startpoint;
7154             U32 arg;
7155
7156         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7157             arg= (U32)ARG(scan);
7158             if (cur_eval && cur_eval->locinput == locinput) {
7159                 if ( ++nochange_depth > max_nochange_depth )
7160                     Perl_croak(aTHX_
7161                         "Pattern subroutine nesting without pos change"
7162                         " exceeded limit in regex");
7163             } else {
7164                 nochange_depth = 0;
7165             }
7166             re_sv = rex_sv;
7167             re = rex;
7168             rei = rexi;
7169             startpoint = scan + ARG2L(scan);
7170             EVAL_CLOSE_PAREN_SET( st, arg );
7171             /* Detect infinite recursion
7172              *
7173              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7174              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7175              * So we track the position in the string we are at each time
7176              * we recurse and if we try to enter the same routine twice from
7177              * the same position we throw an error.
7178              */
7179             if ( rex->recurse_locinput[arg] == locinput ) {
7180                 /* FIXME: we should show the regop that is failing as part
7181                  * of the error message. */
7182                 Perl_croak(aTHX_ "Infinite recursion in regex");
7183             } else {
7184                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7185                 rex->recurse_locinput[arg]= locinput;
7186
7187                 DEBUG_r({
7188                     GET_RE_DEBUG_FLAGS_DECL;
7189                     DEBUG_STACK_r({
7190                         Perl_re_exec_indentf( aTHX_
7191                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7192                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7193                         );
7194                     });
7195                 });
7196             }
7197
7198             /* Save all the positions seen so far. */
7199             ST.cp = regcppush(rex, 0, maxopenparen);
7200             REGCP_SET(ST.lastcp);
7201
7202             /* and then jump to the code we share with EVAL */
7203             goto eval_recurse_doit;
7204             /* NOTREACHED */
7205
7206         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7207             if (cur_eval && cur_eval->locinput==locinput) {
7208                 if ( ++nochange_depth > max_nochange_depth )
7209                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7210             } else {
7211                 nochange_depth = 0;
7212             }
7213             {
7214                 /* execute the code in the {...} */
7215
7216                 dSP;
7217                 IV before;
7218                 OP * const oop = PL_op;
7219                 COP * const ocurcop = PL_curcop;
7220                 OP *nop;
7221                 CV *newcv;
7222
7223                 /* save *all* paren positions */
7224                 regcppush(rex, 0, maxopenparen);
7225                 REGCP_SET(ST.lastcp);
7226
7227                 if (!caller_cv)
7228                     caller_cv = find_runcv(NULL);
7229
7230                 n = ARG(scan);
7231
7232                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7233                     newcv = (ReANY(
7234                                     (REGEXP*)(rexi->data->data[n])
7235                             ))->qr_anoncv;
7236                     nop = (OP*)rexi->data->data[n+1];
7237                 }
7238                 else if (rexi->data->what[n] == 'l') { /* literal code */
7239                     newcv = caller_cv;
7240                     nop = (OP*)rexi->data->data[n];
7241                     assert(CvDEPTH(newcv));
7242                 }
7243                 else {
7244                     /* literal with own CV */
7245                     assert(rexi->data->what[n] == 'L');
7246                     newcv = rex->qr_anoncv;
7247                     nop = (OP*)rexi->data->data[n];
7248                 }
7249
7250                 /* Some notes about MULTICALL and the context and save stacks.
7251                  *
7252                  * In something like
7253                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7254                  * since codeblocks don't introduce a new scope (so that
7255                  * local() etc accumulate), at the end of a successful
7256                  * match there will be a SAVEt_CLEARSV on the savestack
7257                  * for each of $x, $y, $z. If the three code blocks above
7258                  * happen to have come from different CVs (e.g. via
7259                  * embedded qr//s), then we must ensure that during any
7260                  * savestack unwinding, PL_comppad always points to the
7261                  * right pad at each moment. We achieve this by
7262                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7263                  * there is a change of pad.
7264                  * In theory whenever we call a code block, we should
7265                  * push a CXt_SUB context, then pop it on return from
7266                  * that code block. This causes a bit of an issue in that
7267                  * normally popping a context also clears the savestack
7268                  * back to cx->blk_oldsaveix, but here we specifically
7269                  * don't want to clear the save stack on exit from the
7270                  * code block.
7271                  * Also for efficiency we don't want to keep pushing and
7272                  * popping the single SUB context as we backtrack etc.
7273                  * So instead, we push a single context the first time
7274                  * we need, it, then hang onto it until the end of this
7275                  * function. Whenever we encounter a new code block, we
7276                  * update the CV etc if that's changed. During the times
7277                  * in this function where we're not executing a code
7278                  * block, having the SUB context still there is a bit
7279                  * naughty - but we hope that no-one notices.
7280                  * When the SUB context is initially pushed, we fake up
7281                  * cx->blk_oldsaveix to be as if we'd pushed this context
7282                  * on first entry to S_regmatch rather than at some random
7283                  * point during the regexe execution. That way if we
7284                  * croak, popping the context stack will ensure that
7285                  * *everything* SAVEd by this function is undone and then
7286                  * the context popped, rather than e.g., popping the
7287                  * context (and restoring the original PL_comppad) then
7288                  * popping more of the savestack and restoring a bad
7289                  * PL_comppad.
7290                  */
7291
7292                 /* If this is the first EVAL, push a MULTICALL. On
7293                  * subsequent calls, if we're executing a different CV, or
7294                  * if PL_comppad has got messed up from backtracking
7295                  * through SAVECOMPPADs, then refresh the context.
7296                  */
7297                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7298                 {
7299                     U8 flags = (CXp_SUB_RE |
7300                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7301                     SAVECOMPPAD();
7302                     if (last_pushed_cv) {
7303                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7304                     }
7305                     else {
7306                         PUSH_MULTICALL_FLAGS(newcv, flags);
7307                     }
7308                     /* see notes above */
7309                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7310
7311                     last_pushed_cv = newcv;
7312                 }
7313                 else {
7314                     /* these assignments are just to silence compiler
7315                      * warnings */
7316                     multicall_cop = NULL;
7317                 }
7318                 last_pad = PL_comppad;
7319
7320                 /* the initial nextstate you would normally execute
7321                  * at the start of an eval (which would cause error
7322                  * messages to come from the eval), may be optimised
7323                  * away from the execution path in the regex code blocks;
7324                  * so manually set PL_curcop to it initially */
7325                 {
7326                     OP *o = cUNOPx(nop)->op_first;
7327                     assert(o->op_type == OP_NULL);
7328                     if (o->op_targ == OP_SCOPE) {
7329                         o = cUNOPo->op_first;
7330                     }
7331                     else {
7332                         assert(o->op_targ == OP_LEAVE);
7333                         o = cUNOPo->op_first;
7334                         assert(o->op_type == OP_ENTER);
7335                         o = OpSIBLING(o);
7336                     }
7337
7338                     if (o->op_type != OP_STUB) {
7339                         assert(    o->op_type == OP_NEXTSTATE
7340                                 || o->op_type == OP_DBSTATE
7341                                 || (o->op_type == OP_NULL
7342                                     &&  (  o->op_targ == OP_NEXTSTATE
7343                                         || o->op_targ == OP_DBSTATE
7344                                         )
7345                                     )
7346                         );
7347                         PL_curcop = (COP*)o;
7348                     }
7349                 }
7350                 nop = nop->op_next;
7351
7352                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7353                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7354
7355                 rex->offs[0].end = locinput - reginfo->strbeg;
7356                 if (reginfo->info_aux_eval->pos_magic)
7357                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7358                                   reginfo->sv, reginfo->strbeg,
7359                                   locinput - reginfo->strbeg);
7360
7361                 if (sv_yes_mark) {
7362                     SV *sv_mrk = get_sv("REGMARK", 1);
7363                     sv_setsv(sv_mrk, sv_yes_mark);
7364                 }
7365
7366                 /* we don't use MULTICALL here as we want to call the
7367                  * first op of the block of interest, rather than the
7368                  * first op of the sub. Also, we don't want to free
7369                  * the savestack frame */
7370                 before = (IV)(SP-PL_stack_base);
7371                 PL_op = nop;
7372                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7373                 SPAGAIN;
7374                 if ((IV)(SP-PL_stack_base) == before)
7375                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7376                 else {
7377                     ret = POPs;
7378                     PUTBACK;
7379                 }
7380
7381                 /* before restoring everything, evaluate the returned
7382                  * value, so that 'uninit' warnings don't use the wrong
7383                  * PL_op or pad. Also need to process any magic vars
7384                  * (e.g. $1) *before* parentheses are restored */
7385
7386                 PL_op = NULL;
7387
7388                 re_sv = NULL;
7389                 if (logical == 0)        /*   (?{})/   */
7390                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7391                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7392                     sw = cBOOL(SvTRUE_NN(ret));
7393                     logical = 0;
7394                 }
7395                 else {                   /*  /(??{})  */
7396                     /*  if its overloaded, let the regex compiler handle
7397                      *  it; otherwise extract regex, or stringify  */
7398                     if (SvGMAGICAL(ret))
7399                         ret = sv_mortalcopy(ret);
7400                     if (!SvAMAGIC(ret)) {
7401                         SV *sv = ret;
7402                         if (SvROK(sv))
7403                             sv = SvRV(sv);
7404                         if (SvTYPE(sv) == SVt_REGEXP)
7405                             re_sv = (REGEXP*) sv;
7406                         else if (SvSMAGICAL(ret)) {
7407                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7408                             if (mg)
7409                                 re_sv = (REGEXP *) mg->mg_obj;
7410                         }
7411
7412                         /* force any undef warnings here */
7413                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7414                             ret = sv_mortalcopy(ret);
7415                             (void) SvPV_force_nolen(ret);
7416                         }
7417                     }
7418
7419                 }
7420
7421                 /* *** Note that at this point we don't restore
7422                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7423                  * be used again soon. This is safe as long as nothing
7424                  * in the regexp code uses the pad ! */
7425                 PL_op = oop;
7426                 PL_curcop = ocurcop;
7427                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7428                 PL_curpm_under = PL_curpm;
7429                 PL_curpm = PL_reg_curpm;
7430
7431                 if (logical != 2) {
7432                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7433                     /* NOTREACHED */
7434                 }
7435             }
7436
7437                 /* only /(??{})/  from now on */
7438                 logical = 0;
7439                 {
7440                     /* extract RE object from returned value; compiling if
7441                      * necessary */
7442
7443                     if (re_sv) {
7444                         re_sv = reg_temp_copy(NULL, re_sv);
7445                     }
7446                     else {
7447                         U32 pm_flags = 0;
7448
7449                         if (SvUTF8(ret) && IN_BYTES) {
7450                             /* In use 'bytes': make a copy of the octet
7451                              * sequence, but without the flag on */
7452                             STRLEN len;
7453                             const char *const p = SvPV(ret, len);
7454                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7455                         }
7456                         if (rex->intflags & PREGf_USE_RE_EVAL)
7457                             pm_flags |= PMf_USE_RE_EVAL;
7458
7459                         /* if we got here, it should be an engine which
7460                          * supports compiling code blocks and stuff */
7461                         assert(rex->engine && rex->engine->op_comp);
7462                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7463                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7464                                     rex->engine, NULL, NULL,
7465                                     /* copy /msixn etc to inner pattern */
7466                                     ARG2L(scan),
7467                                     pm_flags);
7468
7469                         if (!(SvFLAGS(ret)
7470                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7471                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7472                             /* This isn't a first class regexp. Instead, it's
7473                                caching a regexp onto an existing, Perl visible
7474                                scalar.  */
7475                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7476                         }
7477                     }
7478                     SAVEFREESV(re_sv);
7479                     re = ReANY(re_sv);
7480                 }
7481                 RXp_MATCH_COPIED_off(re);
7482                 re->subbeg = rex->subbeg;
7483                 re->sublen = rex->sublen;
7484                 re->suboffset = rex->suboffset;
7485                 re->subcoffset = rex->subcoffset;
7486                 re->lastparen = 0;
7487                 re->lastcloseparen = 0;
7488                 rei = RXi_GET(re);
7489                 DEBUG_EXECUTE_r(
7490                     debug_start_match(re_sv, utf8_target, locinput,
7491                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7492                 );
7493                 startpoint = rei->program + 1;
7494                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7495                                              * close_paren only for GOSUB */
7496                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7497                 /* Save all the seen positions so far. */
7498                 ST.cp = regcppush(rex, 0, maxopenparen);
7499                 REGCP_SET(ST.lastcp);
7500                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7501                 maxopenparen = 0;
7502                 /* run the pattern returned from (??{...}) */
7503
7504               eval_recurse_doit: /* Share code with GOSUB below this line
7505                             * At this point we expect the stack context to be
7506                             * set up correctly */
7507
7508                 /* invalidate the S-L poscache. We're now executing a
7509                  * different set of WHILEM ops (and their associated
7510                  * indexes) against the same string, so the bits in the
7511                  * cache are meaningless. Setting maxiter to zero forces
7512                  * the cache to be invalidated and zeroed before reuse.
7513                  * XXX This is too dramatic a measure. Ideally we should
7514                  * save the old cache and restore when running the outer
7515                  * pattern again */
7516                 reginfo->poscache_maxiter = 0;
7517
7518                 /* the new regexp might have a different is_utf8_pat than we do */
7519                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7520
7521                 ST.prev_rex = rex_sv;
7522                 ST.prev_curlyx = cur_curlyx;
7523                 rex_sv = re_sv;
7524                 SET_reg_curpm(rex_sv);
7525                 rex = re;
7526                 rexi = rei;
7527                 cur_curlyx = NULL;
7528                 ST.B = next;
7529                 ST.prev_eval = cur_eval;
7530                 cur_eval = st;
7531                 /* now continue from first node in postoned RE */
7532                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7533                 NOT_REACHED; /* NOTREACHED */
7534         }
7535
7536         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7537             /* note: this is called twice; first after popping B, then A */
7538             DEBUG_STACK_r({
7539                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7540                     depth, cur_eval, ST.prev_eval);
7541             });
7542
7543 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7544             if ( cur_eval && CUR_EVAL.close_paren ) {\
7545                 DEBUG_STACK_r({ \
7546                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7547                         depth,    \
7548                         CUR_EVAL.close_paren - 1,\
7549                         cur_eval, \
7550                         VAL);     \
7551                 });               \
7552                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7553             }
7554
7555             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7556
7557             rex_sv = ST.prev_rex;
7558             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7559             SET_reg_curpm(rex_sv);
7560             rex = ReANY(rex_sv);
7561             rexi = RXi_GET(rex);
7562             {
7563                 /* preserve $^R across LEAVE's. See Bug 121070. */
7564                 SV *save_sv= GvSV(PL_replgv);
7565                 SvREFCNT_inc(save_sv);
7566                 regcpblow(ST.cp); /* LEAVE in disguise */
7567                 sv_setsv(GvSV(PL_replgv), save_sv);
7568                 SvREFCNT_dec(save_sv);
7569             }
7570             cur_eval = ST.prev_eval;
7571             cur_curlyx = ST.prev_curlyx;
7572
7573             /* Invalidate cache. See "invalidate" comment above. */
7574             reginfo->poscache_maxiter = 0;
7575             if ( nochange_depth )
7576                 nochange_depth--;
7577
7578             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7579             sayYES;
7580
7581
7582         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7583             REGCP_UNWIND(ST.lastcp);
7584             sayNO;
7585
7586         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7587             /* note: this is called twice; first after popping B, then A */
7588             DEBUG_STACK_r({
7589                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7590                     depth, cur_eval, ST.prev_eval);
7591             });
7592
7593             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7594
7595             rex_sv = ST.prev_rex;
7596             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7597             SET_reg_curpm(rex_sv);
7598             rex = ReANY(rex_sv);
7599             rexi = RXi_GET(rex);
7600
7601             REGCP_UNWIND(ST.lastcp);
7602             regcppop(rex, &maxopenparen);
7603             cur_eval = ST.prev_eval;
7604             cur_curlyx = ST.prev_curlyx;
7605
7606             /* Invalidate cache. See "invalidate" comment above. */
7607             reginfo->poscache_maxiter = 0;
7608             if ( nochange_depth )
7609                 nochange_depth--;
7610
7611             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7612             sayNO_SILENT;
7613 #undef ST
7614
7615         case OPEN: /*  (  */
7616             n = ARG(scan);  /* which paren pair */
7617             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7618             if (n > maxopenparen)
7619                 maxopenparen = n;
7620             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7621                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7622                 depth,
7623                 PTR2UV(rex),
7624                 PTR2UV(rex->offs),
7625                 (UV)n,
7626                 (IV)rex->offs[n].start_tmp,
7627                 (UV)maxopenparen
7628             ));
7629             lastopen = n;
7630             break;
7631
7632         case SROPEN: /*  (*SCRIPT_RUN:  */
7633             script_run_begin = (U8 *) locinput;
7634             break;
7635
7636
7637         case CLOSE:  /*  )  */
7638             n = ARG(scan);  /* which paren pair */
7639             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7640                              locinput - reginfo->strbeg);
7641             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7642                 goto fake_end;
7643
7644             break;
7645
7646         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7647
7648             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7649             {
7650                 sayNO;
7651             }
7652
7653             break;
7654
7655
7656         case ACCEPT:  /*  (*ACCEPT)  */
7657             if (scan->flags)
7658                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7659             if (ARG2L(scan)){
7660                 regnode *cursor;
7661                 for (cursor=scan;
7662                      cursor && OP(cursor)!=END;
7663                      cursor=regnext(cursor))
7664                 {
7665                     if ( OP(cursor)==CLOSE ){
7666                         n = ARG(cursor);
7667                         if ( n <= lastopen ) {
7668                             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7669                                              locinput - reginfo->strbeg);
7670                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7671                                 break;
7672                         }
7673                     }
7674                 }
7675             }
7676             goto fake_end;
7677             /* NOTREACHED */
7678
7679         case GROUPP:  /*  (?(1))  */
7680             n = ARG(scan);  /* which paren pair */
7681             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7682             break;
7683
7684         case NGROUPP:  /*  (?(<name>))  */
7685             /* reg_check_named_buff_matched returns 0 for no match */
7686             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7687             break;
7688
7689         case INSUBP:   /*  (?(R))  */
7690             n = ARG(scan);
7691             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7692              * of SCAN is already set up as matches a eval.close_paren */
7693             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7694             break;
7695
7696         case DEFINEP:  /*  (?(DEFINE))  */
7697             sw = 0;
7698             break;
7699
7700         case IFTHEN:   /*  (?(cond)A|B)  */
7701             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7702             if (sw)
7703                 next = NEXTOPER(NEXTOPER(scan));
7704             else {
7705                 next = scan + ARG(scan);
7706                 if (OP(next) == IFTHEN) /* Fake one. */
7707                     next = NEXTOPER(NEXTOPER(next));
7708             }
7709             break;
7710
7711         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7712             logical = scan->flags;
7713             break;
7714
7715 /*******************************************************************
7716
7717 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7718 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7719 STAR/PLUS/CURLY/CURLYN are used instead.)
7720
7721 A*B is compiled as <CURLYX><A><WHILEM><B>
7722
7723 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7724 state, which contains the current count, initialised to -1. It also sets
7725 cur_curlyx to point to this state, with any previous value saved in the
7726 state block.
7727
7728 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7729 since the pattern may possibly match zero times (i.e. it's a while {} loop
7730 rather than a do {} while loop).
7731
7732 Each entry to WHILEM represents a successful match of A. The count in the
7733 CURLYX block is incremented, another WHILEM state is pushed, and execution
7734 passes to A or B depending on greediness and the current count.
7735
7736 For example, if matching against the string a1a2a3b (where the aN are
7737 substrings that match /A/), then the match progresses as follows: (the
7738 pushed states are interspersed with the bits of strings matched so far):
7739
7740     <CURLYX cnt=-1>
7741     <CURLYX cnt=0><WHILEM>
7742     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7743     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7744     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7745     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7746
7747 (Contrast this with something like CURLYM, which maintains only a single
7748 backtrack state:
7749
7750     <CURLYM cnt=0> a1
7751     a1 <CURLYM cnt=1> a2
7752     a1 a2 <CURLYM cnt=2> a3
7753     a1 a2 a3 <CURLYM cnt=3> b
7754 )
7755
7756 Each WHILEM state block marks a point to backtrack to upon partial failure
7757 of A or B, and also contains some minor state data related to that
7758 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7759 overall state, such as the count, and pointers to the A and B ops.
7760
7761 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7762 must always point to the *current* CURLYX block, the rules are:
7763
7764 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7765 and set cur_curlyx to point the new block.
7766
7767 When popping the CURLYX block after a successful or unsuccessful match,
7768 restore the previous cur_curlyx.
7769
7770 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7771 to the outer one saved in the CURLYX block.
7772
7773 When popping the WHILEM block after a successful or unsuccessful B match,
7774 restore the previous cur_curlyx.
7775
7776 Here's an example for the pattern (AI* BI)*BO
7777 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7778
7779 cur_
7780 curlyx backtrack stack
7781 ------ ---------------
7782 NULL
7783 CO     <CO prev=NULL> <WO>
7784 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7785 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7786 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7787
7788 At this point the pattern succeeds, and we work back down the stack to
7789 clean up, restoring as we go:
7790
7791 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7792 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7793 CO     <CO prev=NULL> <WO>
7794 NULL
7795
7796 *******************************************************************/
7797
7798 #define ST st->u.curlyx
7799
7800         case CURLYX:    /* start of /A*B/  (for complex A) */
7801         {
7802             /* No need to save/restore up to this paren */
7803             I32 parenfloor = scan->flags;
7804
7805             assert(next); /* keep Coverity happy */
7806             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7807                 next += ARG(next);
7808
7809             /* XXXX Probably it is better to teach regpush to support
7810                parenfloor > maxopenparen ... */
7811             if (parenfloor > (I32)rex->lastparen)
7812                 parenfloor = rex->lastparen; /* Pessimization... */
7813
7814             ST.prev_curlyx= cur_curlyx;
7815             cur_curlyx = st;
7816             ST.cp = PL_savestack_ix;
7817
7818             /* these fields contain the state of the current curly.
7819              * they are accessed by subsequent WHILEMs */
7820             ST.parenfloor = parenfloor;
7821             ST.me = scan;
7822             ST.B = next;
7823             ST.minmod = minmod;
7824             minmod = 0;
7825             ST.count = -1;      /* this will be updated by WHILEM */
7826             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7827
7828             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7829             NOT_REACHED; /* NOTREACHED */
7830         }
7831
7832         case CURLYX_end: /* just finished matching all of A*B */
7833             cur_curlyx = ST.prev_curlyx;
7834             sayYES;
7835             NOT_REACHED; /* NOTREACHED */
7836
7837         case CURLYX_end_fail: /* just failed to match all of A*B */
7838             regcpblow(ST.cp);
7839             cur_curlyx = ST.prev_curlyx;
7840             sayNO;
7841             NOT_REACHED; /* NOTREACHED */
7842
7843
7844 #undef ST
7845 #define ST st->u.whilem
7846
7847         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7848         {
7849             /* see the discussion above about CURLYX/WHILEM */
7850             I32 n;
7851             int min, max;
7852             regnode *A;
7853
7854             assert(cur_curlyx); /* keep Coverity happy */
7855
7856             min = ARG1(cur_curlyx->u.curlyx.me);
7857             max = ARG2(cur_curlyx->u.curlyx.me);
7858             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7859             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7860             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7861             ST.cache_offset = 0;
7862             ST.cache_mask = 0;
7863
7864
7865             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7866                   depth, (long)n, min, max)
7867             );
7868
7869             /* First just match a string of min A's. */
7870
7871             if (n < min) {
7872                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7873                 cur_curlyx->u.curlyx.lastloc = locinput;
7874                 REGCP_SET(ST.lastcp);
7875
7876                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7877                 NOT_REACHED; /* NOTREACHED */
7878             }
7879
7880             /* If degenerate A matches "", assume A done. */
7881
7882             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7883                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7884                    depth)
7885                 );
7886                 goto do_whilem_B_max;
7887             }
7888
7889             /* super-linear cache processing.
7890              *
7891              * The idea here is that for certain types of CURLYX/WHILEM -
7892              * principally those whose upper bound is infinity (and
7893              * excluding regexes that have things like \1 and other very
7894              * non-regular expresssiony things), then if a pattern like
7895              * /....A*.../ fails and we backtrack to the WHILEM, then we
7896              * make a note that this particular WHILEM op was at string
7897              * position 47 (say) when the rest of pattern failed. Then, if
7898              * we ever find ourselves back at that WHILEM, and at string
7899              * position 47 again, we can just fail immediately rather than
7900              * running the rest of the pattern again.
7901              *
7902              * This is very handy when patterns start to go
7903              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7904              * with a combinatorial explosion of backtracking.
7905              *
7906              * The cache is implemented as a bit array, with one bit per
7907              * string byte position per WHILEM op (up to 16) - so its
7908              * between 0.25 and 2x the string size.
7909              *
7910              * To avoid allocating a poscache buffer every time, we do an
7911              * initially countdown; only after we have  executed a WHILEM
7912              * op (string-length x #WHILEMs) times do we allocate the
7913              * cache.
7914              *
7915              * The top 4 bits of scan->flags byte say how many different
7916              * relevant CURLLYX/WHILEM op pairs there are, while the
7917              * bottom 4-bits is the identifying index number of this
7918              * WHILEM.
7919              */
7920
7921             if (scan->flags) {
7922
7923                 if (!reginfo->poscache_maxiter) {
7924                     /* start the countdown: Postpone detection until we
7925                      * know the match is not *that* much linear. */
7926                     reginfo->poscache_maxiter
7927                         =    (reginfo->strend - reginfo->strbeg + 1)
7928                            * (scan->flags>>4);
7929                     /* possible overflow for long strings and many CURLYX's */
7930                     if (reginfo->poscache_maxiter < 0)
7931                         reginfo->poscache_maxiter = I32_MAX;
7932                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7933                 }
7934
7935                 if (reginfo->poscache_iter-- == 0) {
7936                     /* initialise cache */
7937                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7938                     regmatch_info_aux *const aux = reginfo->info_aux;
7939                     if (aux->poscache) {
7940                         if ((SSize_t)reginfo->poscache_size < size) {
7941                             Renew(aux->poscache, size, char);
7942                             reginfo->poscache_size = size;
7943                         }
7944                         Zero(aux->poscache, size, char);
7945                     }
7946                     else {
7947                         reginfo->poscache_size = size;
7948                         Newxz(aux->poscache, size, char);
7949                     }
7950                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7951       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7952                               PL_colors[4], PL_colors[5])
7953                     );
7954                 }
7955
7956                 if (reginfo->poscache_iter < 0) {
7957                     /* have we already failed at this position? */
7958                     SSize_t offset, mask;
7959
7960                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7961                     offset  = (scan->flags & 0xf) - 1
7962                                 +   (locinput - reginfo->strbeg)
7963                                   * (scan->flags>>4);
7964                     mask    = 1 << (offset % 8);
7965                     offset /= 8;
7966                     if (reginfo->info_aux->poscache[offset] & mask) {
7967                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
7968                             depth)
7969                         );
7970                         cur_curlyx->u.curlyx.count--;
7971                         sayNO; /* cache records failure */
7972                     }
7973                     ST.cache_offset = offset;
7974                     ST.cache_mask   = mask;
7975                 }
7976             }
7977
7978             /* Prefer B over A for minimal matching. */
7979
7980             if (cur_curlyx->u.curlyx.minmod) {
7981                 ST.save_curlyx = cur_curlyx;
7982                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7983                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7984                                     locinput);
7985                 NOT_REACHED; /* NOTREACHED */
7986             }
7987
7988             /* Prefer A over B for maximal matching. */
7989
7990             if (n < max) { /* More greed allowed? */
7991                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7992                             maxopenparen);
7993                 cur_curlyx->u.curlyx.lastloc = locinput;
7994                 REGCP_SET(ST.lastcp);
7995                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7996                 NOT_REACHED; /* NOTREACHED */
7997             }
7998             goto do_whilem_B_max;
7999         }
8000         NOT_REACHED; /* NOTREACHED */
8001
8002         case WHILEM_B_min: /* just matched B in a minimal match */
8003         case WHILEM_B_max: /* just matched B in a maximal match */
8004             cur_curlyx = ST.save_curlyx;
8005             sayYES;
8006             NOT_REACHED; /* NOTREACHED */
8007
8008         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8009             cur_curlyx = ST.save_curlyx;
8010             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8011             cur_curlyx->u.curlyx.count--;
8012             CACHEsayNO;
8013             NOT_REACHED; /* NOTREACHED */
8014
8015         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8016             /* FALLTHROUGH */
8017         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8018             REGCP_UNWIND(ST.lastcp);
8019             regcppop(rex, &maxopenparen);
8020             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8021             cur_curlyx->u.curlyx.count--;
8022             CACHEsayNO;
8023             NOT_REACHED; /* NOTREACHED */
8024
8025         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8026             REGCP_UNWIND(ST.lastcp);
8027             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8028             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8029                 depth)
8030             );
8031           do_whilem_B_max:
8032             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8033                 && ckWARN(WARN_REGEXP)
8034                 && !reginfo->warned)
8035             {
8036                 reginfo->warned = TRUE;
8037                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8038                      "Complex regular subexpression recursion limit (%d) "
8039                      "exceeded",
8040                      REG_INFTY - 1);
8041             }
8042
8043             /* now try B */
8044             ST.save_curlyx = cur_curlyx;
8045             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8046             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8047                                 locinput);
8048             NOT_REACHED; /* NOTREACHED */
8049
8050         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8051             cur_curlyx = ST.save_curlyx;
8052
8053             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8054                 /* Maximum greed exceeded */
8055                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8056                     && ckWARN(WARN_REGEXP)
8057                     && !reginfo->warned)
8058                 {
8059                     reginfo->warned     = TRUE;
8060                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8061                         "Complex regular subexpression recursion "
8062                         "limit (%d) exceeded",
8063                         REG_INFTY - 1);
8064                 }
8065                 cur_curlyx->u.curlyx.count--;
8066                 CACHEsayNO;
8067             }
8068
8069             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8070             );
8071             /* Try grabbing another A and see if it helps. */
8072             cur_curlyx->u.curlyx.lastloc = locinput;
8073             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8074                             maxopenparen);
8075             REGCP_SET(ST.lastcp);
8076             PUSH_STATE_GOTO(WHILEM_A_min,
8077                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8078                 locinput);
8079             NOT_REACHED; /* NOTREACHED */
8080
8081 #undef  ST
8082 #define ST st->u.branch
8083
8084         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8085             next = scan + ARG(scan);
8086             if (next == scan)
8087                 next = NULL;
8088             scan = NEXTOPER(scan);
8089             /* FALLTHROUGH */
8090
8091         case BRANCH:        /*  /(...|A|...)/ */
8092             scan = NEXTOPER(scan); /* scan now points to inner node */
8093             ST.lastparen = rex->lastparen;
8094             ST.lastcloseparen = rex->lastcloseparen;
8095             ST.next_branch = next;
8096             REGCP_SET(ST.cp);
8097
8098             /* Now go into the branch */
8099             if (has_cutgroup) {
8100                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8101             } else {
8102                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8103             }
8104             NOT_REACHED; /* NOTREACHED */
8105
8106         case CUTGROUP:  /*  /(*THEN)/  */
8107             sv_yes_mark = st->u.mark.mark_name = scan->flags
8108                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8109                 : NULL;
8110             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8111             NOT_REACHED; /* NOTREACHED */
8112
8113         case CUTGROUP_next_fail:
8114             do_cutgroup = 1;
8115             no_final = 1;
8116             if (st->u.mark.mark_name)
8117                 sv_commit = st->u.mark.mark_name;
8118             sayNO;
8119             NOT_REACHED; /* NOTREACHED */
8120
8121         case BRANCH_next:
8122             sayYES;
8123             NOT_REACHED; /* NOTREACHED */
8124
8125         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8126             if (do_cutgroup) {
8127                 do_cutgroup = 0;
8128                 no_final = 0;
8129             }
8130             REGCP_UNWIND(ST.cp);
8131             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8132             scan = ST.next_branch;
8133             /* no more branches? */
8134             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8135                 DEBUG_EXECUTE_r({
8136                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8137                         depth,
8138                         PL_colors[4],
8139                         PL_colors[5] );
8140                 });
8141                 sayNO_SILENT;
8142             }
8143             continue; /* execute next BRANCH[J] op */
8144             /* NOTREACHED */
8145
8146         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8147             minmod = 1;
8148             break;
8149
8150 #undef  ST
8151 #define ST st->u.curlym
8152
8153         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8154
8155             /* This is an optimisation of CURLYX that enables us to push
8156              * only a single backtracking state, no matter how many matches
8157              * there are in {m,n}. It relies on the pattern being constant
8158              * length, with no parens to influence future backrefs
8159              */
8160
8161             ST.me = scan;
8162             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8163
8164             ST.lastparen      = rex->lastparen;
8165             ST.lastcloseparen = rex->lastcloseparen;
8166
8167             /* if paren positive, emulate an OPEN/CLOSE around A */
8168             if (ST.me->flags) {
8169                 U32 paren = ST.me->flags;
8170                 if (paren > maxopenparen)
8171                     maxopenparen = paren;
8172                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8173             }
8174             ST.A = scan;
8175             ST.B = next;
8176             ST.alen = 0;
8177             ST.count = 0;
8178             ST.minmod = minmod;
8179             minmod = 0;
8180             ST.c1 = CHRTEST_UNINIT;
8181             REGCP_SET(ST.cp);
8182
8183             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8184                 goto curlym_do_B;
8185
8186           curlym_do_A: /* execute the A in /A{m,n}B/  */
8187             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8188             NOT_REACHED; /* NOTREACHED */
8189
8190         case CURLYM_A: /* we've just matched an A */
8191             ST.count++;
8192             /* after first match, determine A's length: u.curlym.alen */
8193             if (ST.count == 1) {
8194                 if (reginfo->is_utf8_target) {
8195                     char *s = st->locinput;
8196                     while (s < locinput) {
8197                         ST.alen++;
8198                         s += UTF8SKIP(s);
8199                     }
8200                 }
8201                 else {
8202                     ST.alen = locinput - st->locinput;
8203                 }
8204                 if (ST.alen == 0)
8205                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8206             }
8207             DEBUG_EXECUTE_r(
8208                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8209                           depth, (IV) ST.count, (IV)ST.alen)
8210             );
8211
8212             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8213                 goto fake_end;
8214
8215             {
8216                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8217                 if ( max == REG_INFTY || ST.count < max )
8218                     goto curlym_do_A; /* try to match another A */
8219             }
8220             goto curlym_do_B; /* try to match B */
8221
8222         case CURLYM_A_fail: /* just failed to match an A */
8223             REGCP_UNWIND(ST.cp);
8224
8225
8226             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8227                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8228                 sayNO;
8229
8230           curlym_do_B: /* execute the B in /A{m,n}B/  */
8231             if (ST.c1 == CHRTEST_UNINIT) {
8232                 /* calculate c1 and c2 for possible match of 1st char
8233                  * following curly */
8234                 ST.c1 = ST.c2 = CHRTEST_VOID;
8235                 assert(ST.B);
8236                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8237                     regnode *text_node = ST.B;
8238                     if (! HAS_TEXT(text_node))
8239                         FIND_NEXT_IMPT(text_node);
8240                     /* this used to be
8241
8242                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8243
8244                         But the former is redundant in light of the latter.
8245
8246                         if this changes back then the macro for
8247                         IS_TEXT and friends need to change.
8248                      */
8249                     if (PL_regkind[OP(text_node)] == EXACT) {
8250                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8251                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8252                            reginfo))
8253                         {
8254                             sayNO;
8255                         }
8256                     }
8257                 }
8258             }
8259
8260             DEBUG_EXECUTE_r(
8261                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8262                     depth, (IV)ST.count)
8263                 );
8264             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8265                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8266                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8267                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8268                     {
8269                         /* simulate B failing */
8270                         DEBUG_OPTIMISE_r(
8271                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8272                                 depth,
8273                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8274                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8275                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8276                         );
8277                         state_num = CURLYM_B_fail;
8278                         goto reenter_switch;
8279                     }
8280                 }
8281                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8282                     /* simulate B failing */
8283                     DEBUG_OPTIMISE_r(
8284                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8285                             depth,
8286                             (int) nextchr, ST.c1, ST.c2)
8287                     );
8288                     state_num = CURLYM_B_fail;
8289                     goto reenter_switch;
8290                 }
8291             }
8292
8293             if (ST.me->flags) {
8294                 /* emulate CLOSE: mark current A as captured */
8295                 U32 paren = (U32)ST.me->flags;
8296                 if (ST.count) {
8297                     CLOSE_CAPTURE(paren,
8298                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
8299                         locinput - reginfo->strbeg);
8300                 }
8301                 else
8302                     rex->offs[paren].end = -1;
8303
8304                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8305                 {
8306                     if (ST.count)
8307                         goto fake_end;
8308                     else
8309                         sayNO;
8310                 }
8311             }
8312
8313             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8314             NOT_REACHED; /* NOTREACHED */
8315
8316         case CURLYM_B_fail: /* just failed to match a B */
8317             REGCP_UNWIND(ST.cp);
8318             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8319             if (ST.minmod) {
8320                 I32 max = ARG2(ST.me);
8321                 if (max != REG_INFTY && ST.count == max)
8322                     sayNO;
8323                 goto curlym_do_A; /* try to match a further A */
8324             }
8325             /* backtrack one A */
8326             if (ST.count == ARG1(ST.me) /* min */)
8327                 sayNO;
8328             ST.count--;
8329             SET_locinput(HOPc(locinput, -ST.alen));
8330             goto curlym_do_B; /* try to match B */
8331
8332 #undef ST
8333 #define ST st->u.curly
8334
8335 #define CURLY_SETPAREN(paren, success) \
8336     if (paren) { \
8337         if (success) { \
8338             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8339                                  locinput - reginfo->strbeg); \
8340         } \
8341         else { \
8342             rex->offs[paren].end = -1; \
8343             rex->lastparen      = ST.lastparen; \
8344             rex->lastcloseparen = ST.lastcloseparen; \
8345         } \
8346     }
8347
8348         case STAR:              /*  /A*B/ where A is width 1 char */
8349             ST.paren = 0;
8350             ST.min = 0;
8351             ST.max = REG_INFTY;
8352             scan = NEXTOPER(scan);
8353             goto repeat;
8354
8355         case PLUS:              /*  /A+B/ where A is width 1 char */
8356             ST.paren = 0;
8357             ST.min = 1;
8358             ST.max = REG_INFTY;
8359             scan = NEXTOPER(scan);
8360             goto repeat;
8361
8362         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8363             ST.paren = scan->flags;     /* Which paren to set */
8364             ST.lastparen      = rex->lastparen;
8365             ST.lastcloseparen = rex->lastcloseparen;
8366             if (ST.paren > maxopenparen)
8367                 maxopenparen = ST.paren;
8368             ST.min = ARG1(scan);  /* min to match */
8369             ST.max = ARG2(scan);  /* max to match */
8370             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8371
8372             /* handle the single-char capture called as a GOSUB etc */
8373             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8374             {
8375                 char *li = locinput;
8376                 if (!regrepeat(rex, &li, scan, reginfo, 1))
8377                     sayNO;
8378                 SET_locinput(li);
8379                 goto fake_end;
8380             }
8381
8382             goto repeat;
8383
8384         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8385             ST.paren = 0;
8386             ST.min = ARG1(scan);  /* min to match */
8387             ST.max = ARG2(scan);  /* max to match */
8388             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8389           repeat:
8390             /*
8391             * Lookahead to avoid useless match attempts
8392             * when we know what character comes next.
8393             *
8394             * Used to only do .*x and .*?x, but now it allows
8395             * for )'s, ('s and (?{ ... })'s to be in the way
8396             * of the quantifier and the EXACT-like node.  -- japhy
8397             */
8398
8399             assert(ST.min <= ST.max);
8400             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8401                 ST.c1 = ST.c2 = CHRTEST_VOID;
8402             }
8403             else {
8404                 regnode *text_node = next;
8405
8406                 if (! HAS_TEXT(text_node))
8407                     FIND_NEXT_IMPT(text_node);
8408
8409                 if (! HAS_TEXT(text_node))
8410                     ST.c1 = ST.c2 = CHRTEST_VOID;
8411                 else {
8412                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8413                         ST.c1 = ST.c2 = CHRTEST_VOID;
8414                     }
8415                     else {
8416
8417                     /*  Currently we only get here when
8418
8419                         PL_rekind[OP(text_node)] == EXACT
8420
8421                         if this changes back then the macro for IS_TEXT and
8422                         friends need to change. */
8423                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8424                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8425                            reginfo))
8426                         {
8427                             sayNO;
8428                         }
8429                     }
8430                 }
8431             }
8432
8433             ST.A = scan;
8434             ST.B = next;
8435             if (minmod) {
8436                 char *li = locinput;
8437                 minmod = 0;
8438                 if (ST.min &&
8439                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8440                             < ST.min)
8441                     sayNO;
8442                 SET_locinput(li);
8443                 ST.count = ST.min;
8444                 REGCP_SET(ST.cp);
8445                 if (ST.c1 == CHRTEST_VOID)
8446                     goto curly_try_B_min;
8447
8448                 ST.oldloc = locinput;
8449
8450                 /* set ST.maxpos to the furthest point along the
8451                  * string that could possibly match */
8452                 if  (ST.max == REG_INFTY) {
8453                     ST.maxpos = reginfo->strend - 1;
8454                     if (utf8_target)
8455                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8456                             ST.maxpos--;
8457                 }
8458                 else if (utf8_target) {
8459                     int m = ST.max - ST.min;
8460                     for (ST.maxpos = locinput;
8461                          m >0 && ST.maxpos < reginfo->strend; m--)
8462                         ST.maxpos += UTF8SKIP(ST.maxpos);
8463                 }
8464                 else {
8465                     ST.maxpos = locinput + ST.max - ST.min;
8466                     if (ST.maxpos >= reginfo->strend)
8467                         ST.maxpos = reginfo->strend - 1;
8468                 }
8469                 goto curly_try_B_min_known;
8470
8471             }
8472             else {
8473                 /* avoid taking address of locinput, so it can remain
8474                  * a register var */
8475                 char *li = locinput;
8476                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8477                 if (ST.count < ST.min)
8478                     sayNO;
8479                 SET_locinput(li);
8480                 if ((ST.count > ST.min)
8481                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8482                 {
8483                     /* A{m,n} must come at the end of the string, there's
8484                      * no point in backing off ... */
8485                     ST.min = ST.count;
8486                     /* ...except that $ and \Z can match before *and* after
8487                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8488                        We may back off by one in this case. */
8489                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8490                         ST.min--;
8491                 }
8492                 REGCP_SET(ST.cp);
8493                 goto curly_try_B_max;
8494             }
8495             NOT_REACHED; /* NOTREACHED */
8496
8497         case CURLY_B_min_fail:
8498             /* failed to find B in a non-greedy match.
8499              * Handles both cases where c1,c2 valid or not */
8500
8501             REGCP_UNWIND(ST.cp);
8502             if (ST.paren) {
8503                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8504             }
8505
8506             if (ST.c1 == CHRTEST_VOID) {
8507                 /* failed -- move forward one */
8508                 char *li = locinput;
8509                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8510                     sayNO;
8511                 }
8512                 locinput = li;
8513                 ST.count++;
8514                 if (!(   ST.count <= ST.max
8515                         /* count overflow ? */
8516                      || (ST.max == REG_INFTY && ST.count > 0))
8517                 )
8518                     sayNO;
8519             }
8520             else {
8521                 int n;
8522                 /* Couldn't or didn't -- move forward. */
8523                 ST.oldloc = locinput;
8524                 if (utf8_target)
8525                     locinput += UTF8SKIP(locinput);
8526                 else
8527                     locinput++;
8528                 ST.count++;
8529
8530               curly_try_B_min_known:
8531                 /* find the next place where 'B' could work, then call B */
8532                 if (utf8_target) {
8533                     n = (ST.oldloc == locinput) ? 0 : 1;
8534                     if (ST.c1 == ST.c2) {
8535                         /* set n to utf8_distance(oldloc, locinput) */
8536                         while (locinput <= ST.maxpos
8537                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8538                         {
8539                             locinput += UTF8SKIP(locinput);
8540                             n++;
8541                         }
8542                     }
8543                     else {
8544                         /* set n to utf8_distance(oldloc, locinput) */
8545                         while (locinput <= ST.maxpos
8546                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8547                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8548                         {
8549                             locinput += UTF8SKIP(locinput);
8550                             n++;
8551                         }
8552                     }
8553                 }
8554                 else {  /* Not utf8_target */
8555                     if (ST.c1 == ST.c2) {
8556                         locinput = (char *) memchr(locinput,
8557                                                    ST.c1,
8558                                                    ST.maxpos + 1 - locinput);
8559                         if (! locinput) {
8560                             locinput = ST.maxpos + 1;
8561                         }
8562                     }
8563                     else {
8564                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8565
8566                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8567                             while (   locinput <= ST.maxpos
8568                                    && UCHARAT(locinput) != ST.c1
8569                                    && UCHARAT(locinput) != ST.c2)
8570                             {
8571                                 locinput++;
8572                             }
8573                         }
8574                         else {
8575                             /* If c1 and c2 only differ by a single bit, we can
8576                              * avoid a conditional each time through the loop,
8577                              * at the expense of a little preliminary setup and
8578                              * an extra mask each iteration.  By masking out
8579                              * that bit, we match exactly two characters, c1
8580                              * and c2, and so we don't have to test for both.
8581                              * On both ASCII and EBCDIC platforms, most of the
8582                              * ASCII-range and Latin1-range folded equivalents
8583                              * differ only in a single bit, so this is actually
8584                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8585                              * 0x61). */
8586                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8587                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8588                             while (   locinput <= ST.maxpos
8589                                    && (UCHARAT(locinput) & c1_c2_mask)
8590                                                                 != c1_masked)
8591                             {
8592                                 locinput++;
8593                             }
8594                         }
8595                     }
8596                     n = locinput - ST.oldloc;
8597                 }
8598                 if (locinput > ST.maxpos)
8599                     sayNO;
8600                 if (n) {
8601                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8602                      * at b; check that everything between oldloc and
8603                      * locinput matches */
8604                     char *li = ST.oldloc;
8605                     ST.count += n;
8606                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8607                         sayNO;
8608                     assert(n == REG_INFTY || locinput == li);
8609                 }
8610             }
8611
8612           curly_try_B_min:
8613             CURLY_SETPAREN(ST.paren, ST.count);
8614             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8615             NOT_REACHED; /* NOTREACHED */
8616
8617
8618           curly_try_B_max:
8619             /* a successful greedy match: now try to match B */
8620             {
8621                 bool could_match = locinput < reginfo->strend;
8622
8623                 /* If it could work, try it. */
8624                 if (ST.c1 != CHRTEST_VOID && could_match) {
8625                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8626                     {
8627                         could_match = memEQ(locinput,
8628                                             ST.c1_utf8,
8629                                             UTF8SKIP(locinput))
8630                                     || memEQ(locinput,
8631                                              ST.c2_utf8,
8632                                              UTF8SKIP(locinput));
8633                     }
8634                     else {
8635                         could_match = UCHARAT(locinput) == ST.c1
8636                                       || UCHARAT(locinput) == ST.c2;
8637                     }
8638                 }
8639                 if (ST.c1 == CHRTEST_VOID || could_match) {
8640                     CURLY_SETPAREN(ST.paren, ST.count);
8641                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8642                     NOT_REACHED; /* NOTREACHED */
8643                 }
8644             }
8645             /* FALLTHROUGH */
8646
8647         case CURLY_B_max_fail:
8648             /* failed to find B in a greedy match */
8649
8650             REGCP_UNWIND(ST.cp);
8651             if (ST.paren) {
8652                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8653             }
8654             /*  back up. */
8655             if (--ST.count < ST.min)
8656                 sayNO;
8657             locinput = HOPc(locinput, -1);
8658             goto curly_try_B_max;
8659
8660 #undef ST
8661
8662         case END: /*  last op of main pattern  */
8663           fake_end:
8664             if (cur_eval) {
8665                 /* we've just finished A in /(??{A})B/; now continue with B */
8666                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8667                 st->u.eval.prev_rex = rex_sv;           /* inner */
8668
8669                 /* Save *all* the positions. */
8670                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8671                 rex_sv = CUR_EVAL.prev_rex;
8672                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8673                 SET_reg_curpm(rex_sv);
8674                 rex = ReANY(rex_sv);
8675                 rexi = RXi_GET(rex);
8676
8677                 st->u.eval.prev_curlyx = cur_curlyx;
8678                 cur_curlyx = CUR_EVAL.prev_curlyx;
8679
8680                 REGCP_SET(st->u.eval.lastcp);
8681
8682                 /* Restore parens of the outer rex without popping the
8683                  * savestack */
8684                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8685
8686                 st->u.eval.prev_eval = cur_eval;
8687                 cur_eval = CUR_EVAL.prev_eval;
8688                 DEBUG_EXECUTE_r(
8689                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8690                                       depth, cur_eval););
8691                 if ( nochange_depth )
8692                     nochange_depth--;
8693
8694                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8695
8696                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8697                                     locinput); /* match B */
8698             }
8699
8700             if (locinput < reginfo->till) {
8701                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8702                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8703                                       PL_colors[4],
8704                                       (long)(locinput - startpos),
8705                                       (long)(reginfo->till - startpos),
8706                                       PL_colors[5]));
8707
8708                 sayNO_SILENT;           /* Cannot match: too short. */
8709             }
8710             sayYES;                     /* Success! */
8711
8712         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8713             DEBUG_EXECUTE_r(
8714             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8715                 depth, PL_colors[4], PL_colors[5]));
8716             sayYES;                     /* Success! */
8717
8718 #undef  ST
8719 #define ST st->u.ifmatch
8720
8721         {
8722             char *newstart;
8723
8724         case SUSPEND:   /* (?>A) */
8725             ST.wanted = 1;
8726             newstart = locinput;
8727             goto do_ifmatch;
8728
8729         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8730             ST.wanted = 0;
8731             goto ifmatch_trivial_fail_test;
8732
8733         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8734             ST.wanted = 1;
8735           ifmatch_trivial_fail_test:
8736             if (scan->flags) {
8737                 char * const s = HOPBACKc(locinput, scan->flags);
8738                 if (!s) {
8739                     /* trivial fail */
8740                     if (logical) {
8741                         logical = 0;
8742                         sw = 1 - cBOOL(ST.wanted);
8743                     }
8744                     else if (ST.wanted)
8745                         sayNO;
8746                     next = scan + ARG(scan);
8747                     if (next == scan)
8748                         next = NULL;
8749                     break;
8750                 }
8751                 newstart = s;
8752             }
8753             else
8754                 newstart = locinput;
8755
8756           do_ifmatch:
8757             ST.me = scan;
8758             ST.logical = logical;
8759             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8760
8761             /* execute body of (?...A) */
8762             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8763             NOT_REACHED; /* NOTREACHED */
8764         }
8765
8766         case IFMATCH_A_fail: /* body of (?...A) failed */
8767             ST.wanted = !ST.wanted;
8768             /* FALLTHROUGH */
8769
8770         case IFMATCH_A: /* body of (?...A) succeeded */
8771             if (ST.logical) {
8772                 sw = cBOOL(ST.wanted);
8773             }
8774             else if (!ST.wanted)
8775                 sayNO;
8776
8777             if (OP(ST.me) != SUSPEND) {
8778                 /* restore old position except for (?>...) */
8779                 locinput = st->locinput;
8780             }
8781             scan = ST.me + ARG(ST.me);
8782             if (scan == ST.me)
8783                 scan = NULL;
8784             continue; /* execute B */
8785
8786 #undef ST
8787
8788         case LONGJMP: /*  alternative with many branches compiles to
8789                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8790             next = scan + ARG(scan);
8791             if (next == scan)
8792                 next = NULL;
8793             break;
8794
8795         case COMMIT:  /*  (*COMMIT)  */
8796             reginfo->cutpoint = reginfo->strend;
8797             /* FALLTHROUGH */
8798
8799         case PRUNE:   /*  (*PRUNE)   */
8800             if (scan->flags)
8801                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8802             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8803             NOT_REACHED; /* NOTREACHED */
8804
8805         case COMMIT_next_fail:
8806             no_final = 1;
8807             /* FALLTHROUGH */
8808             sayNO;
8809             NOT_REACHED; /* NOTREACHED */
8810
8811         case OPFAIL:   /* (*FAIL)  */
8812             if (scan->flags)
8813                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8814             if (logical) {
8815                 /* deal with (?(?!)X|Y) properly,
8816                  * make sure we trigger the no branch
8817                  * of the trailing IFTHEN structure*/
8818                 sw= 0;
8819                 break;
8820             } else {
8821                 sayNO;
8822             }
8823             NOT_REACHED; /* NOTREACHED */
8824
8825 #define ST st->u.mark
8826         case MARKPOINT: /*  (*MARK:foo)  */
8827             ST.prev_mark = mark_state;
8828             ST.mark_name = sv_commit = sv_yes_mark
8829                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8830             mark_state = st;
8831             ST.mark_loc = locinput;
8832             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8833             NOT_REACHED; /* NOTREACHED */
8834
8835         case MARKPOINT_next:
8836             mark_state = ST.prev_mark;
8837             sayYES;
8838             NOT_REACHED; /* NOTREACHED */
8839
8840         case MARKPOINT_next_fail:
8841             if (popmark && sv_eq(ST.mark_name,popmark))
8842             {
8843                 if (ST.mark_loc > startpoint)
8844                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8845                 popmark = NULL; /* we found our mark */
8846                 sv_commit = ST.mark_name;
8847
8848                 DEBUG_EXECUTE_r({
8849                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8850                             depth,
8851                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8852                 });
8853             }
8854             mark_state = ST.prev_mark;
8855             sv_yes_mark = mark_state ?
8856                 mark_state->u.mark.mark_name : NULL;
8857             sayNO;
8858             NOT_REACHED; /* NOTREACHED */
8859
8860         case SKIP:  /*  (*SKIP)  */
8861             if (!scan->flags) {
8862                 /* (*SKIP) : if we fail we cut here*/
8863                 ST.mark_name = NULL;
8864                 ST.mark_loc = locinput;
8865                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8866             } else {
8867                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8868                    otherwise do nothing.  Meaning we need to scan
8869                  */
8870                 regmatch_state *cur = mark_state;
8871                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8872
8873                 while (cur) {
8874                     if ( sv_eq( cur->u.mark.mark_name,
8875                                 find ) )
8876                     {
8877                         ST.mark_name = find;
8878                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8879                     }
8880                     cur = cur->u.mark.prev_mark;
8881                 }
8882             }
8883             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8884             break;
8885
8886         case SKIP_next_fail:
8887             if (ST.mark_name) {
8888                 /* (*CUT:NAME) - Set up to search for the name as we
8889                    collapse the stack*/
8890                 popmark = ST.mark_name;
8891             } else {
8892                 /* (*CUT) - No name, we cut here.*/
8893                 if (ST.mark_loc > startpoint)
8894                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8895                 /* but we set sv_commit to latest mark_name if there
8896                    is one so they can test to see how things lead to this
8897                    cut */
8898                 if (mark_state)
8899                     sv_commit=mark_state->u.mark.mark_name;
8900             }
8901             no_final = 1;
8902             sayNO;
8903             NOT_REACHED; /* NOTREACHED */
8904 #undef ST
8905
8906         case LNBREAK: /* \R */
8907             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8908                 locinput += n;
8909             } else
8910                 sayNO;
8911             break;
8912
8913         default:
8914             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8915                           PTR2UV(scan), OP(scan));
8916             Perl_croak(aTHX_ "regexp memory corruption");
8917
8918         /* this is a point to jump to in order to increment
8919          * locinput by one character */
8920           increment_locinput:
8921             assert(!NEXTCHR_IS_EOS);
8922             if (utf8_target) {
8923                 locinput += PL_utf8skip[nextchr];
8924                 /* locinput is allowed to go 1 char off the end (signifying
8925                  * EOS), but not 2+ */
8926                 if (locinput > reginfo->strend)
8927                     sayNO;
8928             }
8929             else
8930                 locinput++;
8931             break;
8932
8933         } /* end switch */
8934
8935         /* switch break jumps here */
8936         scan = next; /* prepare to execute the next op and ... */
8937         continue;    /* ... jump back to the top, reusing st */
8938         /* NOTREACHED */
8939
8940       push_yes_state:
8941         /* push a state that backtracks on success */
8942         st->u.yes.prev_yes_state = yes_state;
8943         yes_state = st;
8944         /* FALLTHROUGH */
8945       push_state:
8946         /* push a new regex state, then continue at scan  */
8947         {
8948             regmatch_state *newst;
8949
8950             DEBUG_STACK_r({
8951                 regmatch_state *cur = st;
8952                 regmatch_state *curyes = yes_state;
8953                 U32 i;
8954                 regmatch_slab *slab = PL_regmatch_slab;
8955                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8956                     if (cur < SLAB_FIRST(slab)) {
8957                         slab = slab->prev;
8958                         cur = SLAB_LAST(slab);
8959                     }
8960                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8961                         depth,
8962                         i ? "    " : "push",
8963                         depth - i, PL_reg_name[cur->resume_state],
8964                         (curyes == cur) ? "yes" : ""
8965                     );
8966                     if (curyes == cur)
8967                         curyes = cur->u.yes.prev_yes_state;
8968                 }
8969             } else
8970                 DEBUG_STATE_pp("push")
8971             );
8972             depth++;
8973             st->locinput = locinput;
8974             newst = st+1;
8975             if (newst >  SLAB_LAST(PL_regmatch_slab))
8976                 newst = S_push_slab(aTHX);
8977             PL_regmatch_state = newst;
8978
8979             locinput = pushinput;
8980             st = newst;
8981             continue;
8982             /* NOTREACHED */
8983         }
8984     }
8985 #ifdef SOLARIS_BAD_OPTIMIZER
8986 #  undef PL_charclass
8987 #endif
8988
8989     /*
8990     * We get here only if there's trouble -- normally "case END" is
8991     * the terminating point.
8992     */
8993     Perl_croak(aTHX_ "corrupted regexp pointers");
8994     NOT_REACHED; /* NOTREACHED */
8995
8996   yes:
8997     if (yes_state) {
8998         /* we have successfully completed a subexpression, but we must now
8999          * pop to the state marked by yes_state and continue from there */
9000         assert(st != yes_state);
9001 #ifdef DEBUGGING
9002         while (st != yes_state) {
9003             st--;
9004             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9005                 PL_regmatch_slab = PL_regmatch_slab->prev;
9006                 st = SLAB_LAST(PL_regmatch_slab);
9007             }
9008             DEBUG_STATE_r({
9009                 if (no_final) {
9010                     DEBUG_STATE_pp("pop (no final)");
9011                 } else {
9012                     DEBUG_STATE_pp("pop (yes)");
9013                 }
9014             });
9015             depth--;
9016         }
9017 #else
9018         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9019             || yes_state > SLAB_LAST(PL_regmatch_slab))
9020         {
9021             /* not in this slab, pop slab */
9022             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9023             PL_regmatch_slab = PL_regmatch_slab->prev;
9024             st = SLAB_LAST(PL_regmatch_slab);
9025         }
9026         depth -= (st - yes_state);
9027 #endif
9028         st = yes_state;
9029         yes_state = st->u.yes.prev_yes_state;
9030         PL_regmatch_state = st;
9031
9032         if (no_final)
9033             locinput= st->locinput;
9034         state_num = st->resume_state + no_final;
9035         goto reenter_switch;
9036     }
9037
9038     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9039                           PL_colors[4], PL_colors[5]));
9040
9041     if (reginfo->info_aux_eval) {
9042         /* each successfully executed (?{...}) block does the equivalent of
9043          *   local $^R = do {...}
9044          * When popping the save stack, all these locals would be undone;
9045          * bypass this by setting the outermost saved $^R to the latest
9046          * value */
9047         /* I dont know if this is needed or works properly now.
9048          * see code related to PL_replgv elsewhere in this file.
9049          * Yves
9050          */
9051         if (oreplsv != GvSV(PL_replgv))
9052             sv_setsv(oreplsv, GvSV(PL_replgv));
9053     }
9054     result = 1;
9055     goto final_exit;
9056
9057   no:
9058     DEBUG_EXECUTE_r(
9059         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9060             depth,
9061             PL_colors[4], PL_colors[5])
9062         );
9063
9064   no_silent:
9065     if (no_final) {
9066         if (yes_state) {
9067             goto yes;
9068         } else {
9069             goto final_exit;
9070         }
9071     }
9072     if (depth) {
9073         /* there's a previous state to backtrack to */
9074         st--;
9075         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9076             PL_regmatch_slab = PL_regmatch_slab->prev;
9077             st = SLAB_LAST(PL_regmatch_slab);
9078         }
9079         PL_regmatch_state = st;
9080         locinput= st->locinput;
9081
9082         DEBUG_STATE_pp("pop");
9083         depth--;
9084         if (yes_state == st)
9085             yes_state = st->u.yes.prev_yes_state;
9086
9087         state_num = st->resume_state + 1; /* failure = success + 1 */
9088         PERL_ASYNC_CHECK();
9089         goto reenter_switch;
9090     }
9091     result = 0;
9092
9093   final_exit:
9094     if (rex->intflags & PREGf_VERBARG_SEEN) {
9095         SV *sv_err = get_sv("REGERROR", 1);
9096         SV *sv_mrk = get_sv("REGMARK", 1);
9097         if (result) {
9098             sv_commit = &PL_sv_no;
9099             if (!sv_yes_mark)
9100                 sv_yes_mark = &PL_sv_yes;
9101         } else {
9102             if (!sv_commit)
9103                 sv_commit = &PL_sv_yes;
9104             sv_yes_mark = &PL_sv_no;
9105         }
9106         assert(sv_err);
9107         assert(sv_mrk);
9108         sv_setsv(sv_err, sv_commit);
9109         sv_setsv(sv_mrk, sv_yes_mark);
9110     }
9111
9112
9113     if (last_pushed_cv) {
9114         dSP;
9115         /* see "Some notes about MULTICALL" above */
9116         POP_MULTICALL;
9117         PERL_UNUSED_VAR(SP);
9118     }
9119     else
9120         LEAVE_SCOPE(orig_savestack_ix);
9121
9122     assert(!result ||  locinput - reginfo->strbeg >= 0);
9123     return result ?  locinput - reginfo->strbeg : -1;
9124 }
9125
9126 /*
9127  - regrepeat - repeatedly match something simple, report how many
9128  *
9129  * What 'simple' means is a node which can be the operand of a quantifier like
9130  * '+', or {1,3}
9131  *
9132  * startposp - pointer a pointer to the start position.  This is updated
9133  *             to point to the byte following the highest successful
9134  *             match.
9135  * p         - the regnode to be repeatedly matched against.
9136  * reginfo   - struct holding match state, such as strend
9137  * max       - maximum number of things to match.
9138  * depth     - (for debugging) backtracking depth.
9139  */
9140 STATIC I32
9141 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9142             regmatch_info *const reginfo, I32 max _pDEPTH)
9143 {
9144     char *scan;     /* Pointer to current position in target string */
9145     I32 c;
9146     char *loceol = reginfo->strend;   /* local version */
9147     I32 hardcount = 0;  /* How many matches so far */
9148     bool utf8_target = reginfo->is_utf8_target;
9149     unsigned int to_complement = 0;  /* Invert the result? */
9150     UV utf8_flags;
9151     _char_class_number classnum;
9152
9153     PERL_ARGS_ASSERT_REGREPEAT;
9154
9155     scan = *startposp;
9156     if (max == REG_INFTY)
9157         max = I32_MAX;
9158     else if (! utf8_target && loceol - scan > max)
9159         loceol = scan + max;
9160
9161     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9162      * to the maximum of how far we should go in it (leaving it set to the real
9163      * end, if the maximum permissible would take us beyond that).  This allows
9164      * us to make the loop exit condition that we haven't gone past <loceol> to
9165      * also mean that we haven't exceeded the max permissible count, saving a
9166      * test each time through the loop.  But it assumes that the OP matches a
9167      * single byte, which is true for most of the OPs below when applied to a
9168      * non-UTF-8 target.  Those relatively few OPs that don't have this
9169      * characteristic will have to compensate.
9170      *
9171      * There is no adjustment for UTF-8 targets, as the number of bytes per
9172      * character varies.  OPs will have to test both that the count is less
9173      * than the max permissible (using <hardcount> to keep track), and that we
9174      * are still within the bounds of the string (using <loceol>.  A few OPs
9175      * match a single byte no matter what the encoding.  They can omit the max
9176      * test if, for the UTF-8 case, they do the adjustment that was skipped
9177      * above.
9178      *
9179      * Thus, the code above sets things up for the common case; and exceptional
9180      * cases need extra work; the common case is to make sure <scan> doesn't
9181      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9182      * count doesn't exceed the maximum permissible */
9183
9184     switch (OP(p)) {
9185     case REG_ANY:
9186         if (utf8_target) {
9187             while (scan < loceol && hardcount < max && *scan != '\n') {
9188                 scan += UTF8SKIP(scan);
9189                 hardcount++;
9190             }
9191         } else {
9192             scan = (char *) memchr(scan, '\n', loceol - scan);
9193             if (! scan) {
9194                 scan = loceol;
9195             }
9196         }
9197         break;
9198     case SANY:
9199         if (utf8_target) {
9200             while (scan < loceol && hardcount < max) {
9201                 scan += UTF8SKIP(scan);
9202                 hardcount++;
9203             }
9204         }
9205         else
9206             scan = loceol;
9207         break;
9208     case EXACTL:
9209         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9210         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9211             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9212         }
9213         goto do_exact;
9214
9215     case EXACT_ONLY8:
9216         if (! utf8_target) {
9217             break;
9218         }
9219         /* FALLTHROUGH */
9220     case EXACT:
9221       do_exact:
9222         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9223
9224         c = (U8)*STRING(p);
9225
9226         /* Can use a simple find if the pattern char to match on is invariant
9227          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9228          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9229          * true iff it doesn't matter if the argument is in UTF-8 or not */
9230         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9231             if (utf8_target && loceol - scan > max) {
9232                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9233                  * since here, to match at all, 1 char == 1 byte */
9234                 loceol = scan + max;
9235             }
9236             scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9237         }
9238         else if (reginfo->is_utf8_pat) {
9239             if (utf8_target) {
9240                 STRLEN scan_char_len;
9241
9242                 /* When both target and pattern are UTF-8, we have to do
9243                  * string EQ */
9244                 while (hardcount < max
9245                        && scan < loceol
9246                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9247                        && memEQ(scan, STRING(p), scan_char_len))
9248                 {
9249                     scan += scan_char_len;
9250                     hardcount++;
9251                 }
9252             }
9253             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9254
9255                 /* Target isn't utf8; convert the character in the UTF-8
9256                  * pattern to non-UTF8, and do a simple find */
9257                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9258                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9259             } /* else pattern char is above Latin1, can't possibly match the
9260                  non-UTF-8 target */
9261         }
9262         else {
9263
9264             /* Here, the string must be utf8; pattern isn't, and <c> is
9265              * different in utf8 than not, so can't compare them directly.
9266              * Outside the loop, find the two utf8 bytes that represent c, and
9267              * then look for those in sequence in the utf8 string */
9268             U8 high = UTF8_TWO_BYTE_HI(c);
9269             U8 low = UTF8_TWO_BYTE_LO(c);
9270
9271             while (hardcount < max
9272                     && scan + 1 < loceol
9273                     && UCHARAT(scan) == high
9274                     && UCHARAT(scan + 1) == low)
9275             {
9276                 scan += 2;
9277                 hardcount++;
9278             }
9279         }
9280         break;
9281
9282     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9283         assert(! reginfo->is_utf8_pat);
9284         /* FALLTHROUGH */
9285     case EXACTFAA:
9286         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9287         goto do_exactf;
9288
9289     case EXACTFL:
9290         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9291         utf8_flags = FOLDEQ_LOCALE;
9292         goto do_exactf;
9293
9294     case EXACTF:   /* This node only generated for non-utf8 patterns */
9295         assert(! reginfo->is_utf8_pat);
9296         utf8_flags = 0;
9297         goto do_exactf;
9298
9299     case EXACTFLU8:
9300         if (! utf8_target) {
9301             break;
9302         }
9303         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9304                                     | FOLDEQ_S2_FOLDS_SANE;
9305         goto do_exactf;
9306
9307     case EXACTFU_ONLY8:
9308         if (! utf8_target) {
9309             break;
9310         }
9311         assert(reginfo->is_utf8_pat);
9312         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9313         goto do_exactf;
9314
9315     case EXACTFU_SS:
9316     case EXACTFU:
9317         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9318
9319       do_exactf: {
9320         int c1, c2;
9321         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9322
9323         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9324
9325         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9326                                         reginfo))
9327         {
9328             if (c1 == CHRTEST_VOID) {
9329                 /* Use full Unicode fold matching */
9330                 char *tmpeol = reginfo->strend;
9331                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9332                 while (hardcount < max
9333                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9334                                              STRING(p), NULL, pat_len,
9335                                              reginfo->is_utf8_pat, utf8_flags))
9336                 {
9337                     scan = tmpeol;
9338                     tmpeol = reginfo->strend;
9339                     hardcount++;
9340                 }
9341             }
9342             else if (utf8_target) {
9343                 if (c1 == c2) {
9344                     while (scan < loceol
9345                            && hardcount < max
9346                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9347                     {
9348                         scan += UTF8SKIP(scan);
9349                         hardcount++;
9350                     }
9351                 }
9352                 else {
9353                     while (scan < loceol
9354                            && hardcount < max
9355                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9356                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9357                     {
9358                         scan += UTF8SKIP(scan);
9359                         hardcount++;
9360                     }
9361                 }
9362             }
9363             else if (c1 == c2) {
9364                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9365             }
9366             else {
9367                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9368                  * a conditional each time through the loop if the characters
9369                  * differ only in a single bit, as is the usual situation */
9370                 U8 c1_c2_bits_differing = c1 ^ c2;
9371
9372                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9373                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9374
9375                     scan = (char *) find_span_end_mask((U8 *) scan,
9376                                                        (U8 *) loceol,
9377                                                        c1 & c1_c2_mask,
9378                                                        c1_c2_mask);
9379                 }
9380                 else {
9381                     while (    scan < loceol
9382                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9383                     {
9384                         scan++;
9385                     }
9386                 }
9387             }
9388         }
9389         break;
9390     }
9391     case ANYOFPOSIXL:
9392     case ANYOFL:
9393         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9394
9395         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9396             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9397         }
9398         /* FALLTHROUGH */
9399     case ANYOFD:
9400     case ANYOF:
9401         if (utf8_target) {
9402             while (hardcount < max
9403                    && scan < loceol
9404                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9405             {
9406                 scan += UTF8SKIP(scan);
9407                 hardcount++;
9408             }
9409         }
9410         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9411             while (scan < loceol
9412                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9413                 scan++;
9414         }
9415         else {
9416             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9417                 scan++;
9418         }
9419         break;
9420
9421     case ANYOFM:
9422         if (utf8_target && loceol - scan > max) {
9423
9424             /* We didn't adjust <loceol> at the beginning of this routine
9425              * because is UTF-8, but it is actually ok to do so, since here, to
9426              * match, 1 char == 1 byte. */
9427             loceol = scan + max;
9428         }
9429
9430         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9431         break;
9432
9433     case NANYOFM:
9434         if (utf8_target) {
9435             while (     hardcount < max
9436                    &&   scan < loceol
9437                    &&  (*scan & FLAGS(p)) != ARG(p))
9438             {
9439                 scan += UTF8SKIP(scan);
9440                 hardcount++;
9441             }
9442         }
9443         else {
9444             scan = (char *) find_next_masked((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9445         }
9446         break;
9447
9448     case ASCII:
9449         if (utf8_target && loceol - scan > max) {
9450             loceol = scan + max;
9451         }
9452
9453         scan = find_next_non_ascii(scan, loceol, utf8_target);
9454         break;
9455
9456     case NASCII:
9457         if (utf8_target) {
9458             while (     hardcount < max
9459                    &&   scan < loceol
9460                    && ! isASCII_utf8_safe(scan, loceol))
9461             {
9462                 scan += UTF8SKIP(scan);
9463                 hardcount++;
9464             }
9465         }
9466         else {
9467             scan = find_next_ascii(scan, loceol, utf8_target);
9468         }
9469         break;
9470
9471     /* The argument (FLAGS) to all the POSIX node types is the class number */
9472
9473     case NPOSIXL:
9474         to_complement = 1;
9475         /* FALLTHROUGH */
9476
9477     case POSIXL:
9478         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9479         if (! utf8_target) {
9480             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9481                                                                    *scan)))
9482             {
9483                 scan++;
9484             }
9485         } else {
9486             while (hardcount < max && scan < loceol
9487                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9488                                                                   (U8 *) scan,
9489                                                                   (U8 *) loceol)))
9490             {
9491                 scan += UTF8SKIP(scan);
9492                 hardcount++;
9493             }
9494         }
9495         break;
9496
9497     case POSIXD:
9498         if (utf8_target) {
9499             goto utf8_posix;
9500         }
9501         /* FALLTHROUGH */
9502
9503     case POSIXA:
9504         if (utf8_target && loceol - scan > max) {
9505
9506             /* We didn't adjust <loceol> at the beginning of this routine
9507              * because is UTF-8, but it is actually ok to do so, since here, to
9508              * match, 1 char == 1 byte. */
9509             loceol = scan + max;
9510         }
9511         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9512             scan++;
9513         }
9514         break;
9515
9516     case NPOSIXD:
9517         if (utf8_target) {
9518             to_complement = 1;
9519             goto utf8_posix;
9520         }
9521         /* FALLTHROUGH */
9522
9523     case NPOSIXA:
9524         if (! utf8_target) {
9525             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9526                 scan++;
9527             }
9528         }
9529         else {
9530
9531             /* The complement of something that matches only ASCII matches all
9532              * non-ASCII, plus everything in ASCII that isn't in the class. */
9533             while (hardcount < max && scan < loceol
9534                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9535                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9536             {
9537                 scan += UTF8SKIP(scan);
9538                 hardcount++;
9539             }
9540         }
9541         break;
9542
9543     case NPOSIXU:
9544         to_complement = 1;
9545         /* FALLTHROUGH */
9546
9547     case POSIXU:
9548         if (! utf8_target) {
9549             while (scan < loceol && to_complement
9550                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9551             {
9552                 scan++;
9553             }
9554         }
9555         else {
9556           utf8_posix:
9557             classnum = (_char_class_number) FLAGS(p);
9558             switch (classnum) {
9559                 default:
9560                     while (   hardcount < max && scan < loceol
9561                            && to_complement ^ cBOOL(_invlist_contains_cp(
9562                                               PL_XPosix_ptrs[classnum],
9563                                               utf8_to_uvchr_buf((U8 *) scan,
9564                                                                 (U8 *) loceol,
9565                                                                 NULL))))
9566                     {
9567                         scan += UTF8SKIP(scan);
9568                         hardcount++;
9569                     }
9570                     break;
9571
9572                     /* For the classes below, the knowledge of how to handle
9573                      * every code point is compiled in to Perl via a macro.
9574                      * This code is written for making the loops as tight as
9575                      * possible.  It could be refactored to save space instead.
9576                      * */
9577
9578                 case _CC_ENUM_SPACE:
9579                     while (hardcount < max
9580                            && scan < loceol
9581                            && (to_complement
9582                                ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9583                     {
9584                         scan += UTF8SKIP(scan);
9585                         hardcount++;
9586                     }
9587                     break;
9588                 case _CC_ENUM_BLANK:
9589                     while (hardcount < max
9590                            && scan < loceol
9591                            && (to_complement
9592                                 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9593                     {
9594                         scan += UTF8SKIP(scan);
9595                         hardcount++;
9596                     }
9597                     break;
9598                 case _CC_ENUM_XDIGIT:
9599                     while (hardcount < max
9600                            && scan < loceol
9601                            && (to_complement
9602                                ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9603                     {
9604                         scan += UTF8SKIP(scan);
9605                         hardcount++;
9606                     }
9607                     break;
9608                 case _CC_ENUM_VERTSPACE:
9609                     while (hardcount < max
9610                            && scan < loceol
9611                            && (to_complement
9612                                ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9613                     {
9614                         scan += UTF8SKIP(scan);
9615                         hardcount++;
9616                     }
9617                     break;
9618                 case _CC_ENUM_CNTRL:
9619                     while (hardcount < max
9620                            && scan < loceol
9621                            && (to_complement
9622                                ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9623                     {
9624                         scan += UTF8SKIP(scan);
9625                         hardcount++;
9626                     }
9627                     break;
9628             }
9629         }
9630         break;
9631
9632     case LNBREAK:
9633         if (utf8_target) {
9634             while (hardcount < max && scan < loceol &&
9635                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9636                 scan += c;
9637                 hardcount++;
9638             }
9639         } else {
9640             /* LNBREAK can match one or two latin chars, which is ok, but we
9641              * have to use hardcount in this situation, and throw away the
9642              * adjustment to <loceol> done before the switch statement */
9643             loceol = reginfo->strend;
9644             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9645                 scan+=c;
9646                 hardcount++;
9647             }
9648         }
9649         break;
9650
9651     case BOUNDL:
9652     case NBOUNDL:
9653         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9654         /* FALLTHROUGH */
9655     case BOUND:
9656     case BOUNDA:
9657     case BOUNDU:
9658     case EOS:
9659     case GPOS:
9660     case KEEPS:
9661     case NBOUND:
9662     case NBOUNDA:
9663     case NBOUNDU:
9664     case OPFAIL:
9665     case SBOL:
9666     case SEOL:
9667         /* These are all 0 width, so match right here or not at all. */
9668         break;
9669
9670     default:
9671         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9672         NOT_REACHED; /* NOTREACHED */
9673
9674     }
9675
9676     if (hardcount)
9677         c = hardcount;
9678     else
9679         c = scan - *startposp;
9680     *startposp = scan;
9681
9682     DEBUG_r({
9683         GET_RE_DEBUG_FLAGS_DECL;
9684         DEBUG_EXECUTE_r({
9685             SV * const prop = sv_newmortal();
9686             regprop(prog, prop, p, reginfo, NULL);
9687             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9688                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9689         });
9690     });
9691
9692     return(c);
9693 }
9694
9695
9696 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9697 /*
9698 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9699 create a copy so that changes the caller makes won't change the shared one.
9700 If <altsvp> is non-null, will return NULL in it, for back-compat.
9701  */
9702 SV *
9703 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9704 {
9705     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9706
9707     if (altsvp) {
9708         *altsvp = NULL;
9709     }
9710
9711     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9712 }
9713
9714 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9715
9716 /*
9717  - reginclass - determine if a character falls into a character class
9718
9719   n is the ANYOF-type regnode
9720   p is the target string
9721   p_end points to one byte beyond the end of the target string
9722   utf8_target tells whether p is in UTF-8.
9723
9724   Returns true if matched; false otherwise.
9725
9726   Note that this can be a synthetic start class, a combination of various
9727   nodes, so things you think might be mutually exclusive, such as locale,
9728   aren't.  It can match both locale and non-locale
9729
9730  */
9731
9732 STATIC bool
9733 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9734 {
9735     dVAR;
9736     const char flags = ANYOF_FLAGS(n);
9737     bool match = FALSE;
9738     UV c = *p;
9739
9740     PERL_ARGS_ASSERT_REGINCLASS;
9741
9742     /* If c is not already the code point, get it.  Note that
9743      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9744     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9745         STRLEN c_len = 0;
9746         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9747         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9748         if (c_len == (STRLEN)-1) {
9749             _force_out_malformed_utf8_message(p, p_end,
9750                                               utf8n_flags,
9751                                               1 /* 1 means die */ );
9752             NOT_REACHED; /* NOTREACHED */
9753         }
9754         if (     c > 255
9755             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9756             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9757         {
9758             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9759         }
9760     }
9761
9762     /* If this character is potentially in the bitmap, check it */
9763     if (c < NUM_ANYOF_CODE_POINTS) {
9764         if (ANYOF_BITMAP_TEST(n, c))
9765             match = TRUE;
9766         else if ((flags
9767                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9768                   && OP(n) == ANYOFD
9769                   && ! utf8_target
9770                   && ! isASCII(c))
9771         {
9772             match = TRUE;
9773         }
9774         else if (flags & ANYOF_LOCALE_FLAGS) {
9775             if ((flags & ANYOFL_FOLD)
9776                 && c < 256
9777                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9778             {
9779                 match = TRUE;
9780             }
9781             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9782                      && c < 256
9783             ) {
9784
9785                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9786                  * if the class includes the Posix character class given by
9787                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9788                  * complemented Posix class given by int(bit/2).  So we loop
9789                  * through the bits, each time changing whether we complement
9790                  * the result or not.  Suppose for the sake of illustration
9791                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9792                  * is set, it means there is a match for this ANYOF node if the
9793                  * character is in the class given by the expression (0 / 2 = 0
9794                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9795                  * and since 'to_complement' is 0, the result will stay TRUE,
9796                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9797                  * bit 1 is 1.  That means there is a match if the character
9798                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9799                  * but will on bit 1.  On the second iteration 'to_complement'
9800                  * will be 1, so the exclusive or will reverse things, so we
9801                  * are testing for \W.  On the third iteration, 'to_complement'
9802                  * will be 0, and we would be testing for \s; the fourth
9803                  * iteration would test for \S, etc.
9804                  *
9805                  * Note that this code assumes that all the classes are closed
9806                  * under folding.  For example, if a character matches \w, then
9807                  * its fold does too; and vice versa.  This should be true for
9808                  * any well-behaved locale for all the currently defined Posix
9809                  * classes, except for :lower: and :upper:, which are handled
9810                  * by the pseudo-class :cased: which matches if either of the
9811                  * other two does.  To get rid of this assumption, an outer
9812                  * loop could be used below to iterate over both the source
9813                  * character, and its fold (if different) */
9814
9815                 int count = 0;
9816                 int to_complement = 0;
9817
9818                 while (count < ANYOF_MAX) {
9819                     if (ANYOF_POSIXL_TEST(n, count)
9820                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9821                     {
9822                         match = TRUE;
9823                         break;
9824                     }
9825                     count++;
9826                     to_complement ^= 1;
9827                 }
9828             }
9829         }
9830     }
9831
9832
9833     /* If the bitmap didn't (or couldn't) match, and something outside the
9834      * bitmap could match, try that. */
9835     if (!match) {
9836         if (c >= NUM_ANYOF_CODE_POINTS
9837             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9838         {
9839             match = TRUE;       /* Everything above the bitmap matches */
9840         }
9841             /* Here doesn't match everything above the bitmap.  If there is
9842              * some information available beyond the bitmap, we may find a
9843              * match in it.  If so, this is most likely because the code point
9844              * is outside the bitmap range.  But rarely, it could be because of
9845              * some other reason.  If so, various flags are set to indicate
9846              * this possibility.  On ANYOFD nodes, there may be matches that
9847              * happen only when the target string is UTF-8; or for other node
9848              * types, because runtime lookup is needed, regardless of the
9849              * UTF-8ness of the target string.  Finally, under /il, there may
9850              * be some matches only possible if the locale is a UTF-8 one. */
9851         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9852                  && (   c >= NUM_ANYOF_CODE_POINTS
9853                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9854                          && (   UNLIKELY(OP(n) != ANYOFD)
9855                              || (utf8_target && ! isASCII_uni(c)
9856 #                               if NUM_ANYOF_CODE_POINTS > 256
9857                                                                  && c < 256
9858 #                               endif
9859                                 )))
9860                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9861                          && IN_UTF8_CTYPE_LOCALE)))
9862         {
9863             SV* only_utf8_locale = NULL;
9864             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9865                                                        &only_utf8_locale, NULL);
9866             if (sw) {
9867                 U8 utf8_buffer[2];
9868                 U8 * utf8_p;
9869                 if (utf8_target) {
9870                     utf8_p = (U8 *) p;
9871                 } else { /* Convert to utf8 */
9872                     utf8_p = utf8_buffer;
9873                     append_utf8_from_native_byte(*p, &utf8_p);
9874                     utf8_p = utf8_buffer;
9875                 }
9876
9877                 if (swash_fetch(sw, utf8_p, TRUE)) {
9878                     match = TRUE;
9879                 }
9880             }
9881             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9882                 match = _invlist_contains_cp(only_utf8_locale, c);
9883             }
9884         }
9885
9886         if (UNICODE_IS_SUPER(c)
9887             && (flags
9888                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9889             && OP(n) != ANYOFD
9890             && ckWARN_d(WARN_NON_UNICODE))
9891         {
9892             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9893                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9894         }
9895     }
9896
9897 #if ANYOF_INVERT != 1
9898     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9899      * use it, don't */
9900 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9901 #endif
9902
9903     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9904     return (flags & ANYOF_INVERT) ^ match;
9905 }
9906
9907 STATIC U8 *
9908 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9909 {
9910     /* return the position 'off' UTF-8 characters away from 's', forward if
9911      * 'off' >= 0, backwards if negative.  But don't go outside of position
9912      * 'lim', which better be < s  if off < 0 */
9913
9914     PERL_ARGS_ASSERT_REGHOP3;
9915
9916     if (off >= 0) {
9917         while (off-- && s < lim) {
9918             /* XXX could check well-formedness here */
9919             U8 *new_s = s + UTF8SKIP(s);
9920             if (new_s > lim) /* lim may be in the middle of a long character */
9921                 return s;
9922             s = new_s;
9923         }
9924     }
9925     else {
9926         while (off++ && s > lim) {
9927             s--;
9928             if (UTF8_IS_CONTINUED(*s)) {
9929                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9930                     s--;
9931                 if (! UTF8_IS_START(*s)) {
9932                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9933                 }
9934             }
9935             /* XXX could check well-formedness here */
9936         }
9937     }
9938     return s;
9939 }
9940
9941 STATIC U8 *
9942 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9943 {
9944     PERL_ARGS_ASSERT_REGHOP4;
9945
9946     if (off >= 0) {
9947         while (off-- && s < rlim) {
9948             /* XXX could check well-formedness here */
9949             s += UTF8SKIP(s);
9950         }
9951     }
9952     else {
9953         while (off++ && s > llim) {
9954             s--;
9955             if (UTF8_IS_CONTINUED(*s)) {
9956                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9957                     s--;
9958                 if (! UTF8_IS_START(*s)) {
9959                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9960                 }
9961             }
9962             /* XXX could check well-formedness here */
9963         }
9964     }
9965     return s;
9966 }
9967
9968 /* like reghop3, but returns NULL on overrun, rather than returning last
9969  * char pos */
9970
9971 STATIC U8 *
9972 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9973 {
9974     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9975
9976     if (off >= 0) {
9977         while (off-- && s < lim) {
9978             /* XXX could check well-formedness here */
9979             s += UTF8SKIP(s);
9980         }
9981         if (off >= 0)
9982             return NULL;
9983     }
9984     else {
9985         while (off++ && s > lim) {
9986             s--;
9987             if (UTF8_IS_CONTINUED(*s)) {
9988                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9989                     s--;
9990                 if (! UTF8_IS_START(*s)) {
9991                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9992                 }
9993             }
9994             /* XXX could check well-formedness here */
9995         }
9996         if (off <= 0)
9997             return NULL;
9998     }
9999     return s;
10000 }
10001
10002
10003 /* when executing a regex that may have (?{}), extra stuff needs setting
10004    up that will be visible to the called code, even before the current
10005    match has finished. In particular:
10006
10007    * $_ is localised to the SV currently being matched;
10008    * pos($_) is created if necessary, ready to be updated on each call-out
10009      to code;
10010    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10011      isn't set until the current pattern is successfully finished), so that
10012      $1 etc of the match-so-far can be seen;
10013    * save the old values of subbeg etc of the current regex, and  set then
10014      to the current string (again, this is normally only done at the end
10015      of execution)
10016 */
10017
10018 static void
10019 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10020 {
10021     MAGIC *mg;
10022     regexp *const rex = ReANY(reginfo->prog);
10023     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10024
10025     eval_state->rex = rex;
10026
10027     if (reginfo->sv) {
10028         /* Make $_ available to executed code. */
10029         if (reginfo->sv != DEFSV) {
10030             SAVE_DEFSV;
10031             DEFSV_set(reginfo->sv);
10032         }
10033
10034         if (!(mg = mg_find_mglob(reginfo->sv))) {
10035             /* prepare for quick setting of pos */
10036             mg = sv_magicext_mglob(reginfo->sv);
10037             mg->mg_len = -1;
10038         }
10039         eval_state->pos_magic = mg;
10040         eval_state->pos       = mg->mg_len;
10041         eval_state->pos_flags = mg->mg_flags;
10042     }
10043     else
10044         eval_state->pos_magic = NULL;
10045
10046     if (!PL_reg_curpm) {
10047         /* PL_reg_curpm is a fake PMOP that we can attach the current
10048          * regex to and point PL_curpm at, so that $1 et al are visible
10049          * within a /(?{})/. It's just allocated once per interpreter the
10050          * first time its needed */
10051         Newxz(PL_reg_curpm, 1, PMOP);
10052 #ifdef USE_ITHREADS
10053         {
10054             SV* const repointer = &PL_sv_undef;
10055             /* this regexp is also owned by the new PL_reg_curpm, which
10056                will try to free it.  */
10057             av_push(PL_regex_padav, repointer);
10058             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10059             PL_regex_pad = AvARRAY(PL_regex_padav);
10060         }
10061 #endif
10062     }
10063     SET_reg_curpm(reginfo->prog);
10064     eval_state->curpm = PL_curpm;
10065     PL_curpm_under = PL_curpm;
10066     PL_curpm = PL_reg_curpm;
10067     if (RXp_MATCH_COPIED(rex)) {
10068         /*  Here is a serious problem: we cannot rewrite subbeg,
10069             since it may be needed if this match fails.  Thus
10070             $` inside (?{}) could fail... */
10071         eval_state->subbeg     = rex->subbeg;
10072         eval_state->sublen     = rex->sublen;
10073         eval_state->suboffset  = rex->suboffset;
10074         eval_state->subcoffset = rex->subcoffset;
10075 #ifdef PERL_ANY_COW
10076         eval_state->saved_copy = rex->saved_copy;
10077 #endif
10078         RXp_MATCH_COPIED_off(rex);
10079     }
10080     else
10081         eval_state->subbeg = NULL;
10082     rex->subbeg = (char *)reginfo->strbeg;
10083     rex->suboffset = 0;
10084     rex->subcoffset = 0;
10085     rex->sublen = reginfo->strend - reginfo->strbeg;
10086 }
10087
10088
10089 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10090
10091 static void
10092 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10093 {
10094     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10095     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10096     regmatch_slab *s;
10097
10098     Safefree(aux->poscache);
10099
10100     if (eval_state) {
10101
10102         /* undo the effects of S_setup_eval_state() */
10103
10104         if (eval_state->subbeg) {
10105             regexp * const rex = eval_state->rex;
10106             rex->subbeg     = eval_state->subbeg;
10107             rex->sublen     = eval_state->sublen;
10108             rex->suboffset  = eval_state->suboffset;
10109             rex->subcoffset = eval_state->subcoffset;
10110 #ifdef PERL_ANY_COW
10111             rex->saved_copy = eval_state->saved_copy;
10112 #endif
10113             RXp_MATCH_COPIED_on(rex);
10114         }
10115         if (eval_state->pos_magic)
10116         {
10117             eval_state->pos_magic->mg_len = eval_state->pos;
10118             eval_state->pos_magic->mg_flags =
10119                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10120                | (eval_state->pos_flags & MGf_BYTES);
10121         }
10122
10123         PL_curpm = eval_state->curpm;
10124     }
10125
10126     PL_regmatch_state = aux->old_regmatch_state;
10127     PL_regmatch_slab  = aux->old_regmatch_slab;
10128
10129     /* free all slabs above current one - this must be the last action
10130      * of this function, as aux and eval_state are allocated within
10131      * slabs and may be freed here */
10132
10133     s = PL_regmatch_slab->next;
10134     if (s) {
10135         PL_regmatch_slab->next = NULL;
10136         while (s) {
10137             regmatch_slab * const osl = s;
10138             s = s->next;
10139             Safefree(osl);
10140         }
10141     }
10142 }
10143
10144
10145 STATIC void
10146 S_to_utf8_substr(pTHX_ regexp *prog)
10147 {
10148     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10149      * on the converted value */
10150
10151     int i = 1;
10152
10153     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10154
10155     do {
10156         if (prog->substrs->data[i].substr
10157             && !prog->substrs->data[i].utf8_substr) {
10158             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10159             prog->substrs->data[i].utf8_substr = sv;
10160             sv_utf8_upgrade(sv);
10161             if (SvVALID(prog->substrs->data[i].substr)) {
10162                 if (SvTAIL(prog->substrs->data[i].substr)) {
10163                     /* Trim the trailing \n that fbm_compile added last
10164                        time.  */
10165                     SvCUR_set(sv, SvCUR(sv) - 1);
10166                     /* Whilst this makes the SV technically "invalid" (as its
10167                        buffer is no longer followed by "\0") when fbm_compile()
10168                        adds the "\n" back, a "\0" is restored.  */
10169                     fbm_compile(sv, FBMcf_TAIL);
10170                 } else
10171                     fbm_compile(sv, 0);
10172             }
10173             if (prog->substrs->data[i].substr == prog->check_substr)
10174                 prog->check_utf8 = sv;
10175         }
10176     } while (i--);
10177 }
10178
10179 STATIC bool
10180 S_to_byte_substr(pTHX_ regexp *prog)
10181 {
10182     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10183      * on the converted value; returns FALSE if can't be converted. */
10184
10185     int i = 1;
10186
10187     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10188
10189     do {
10190         if (prog->substrs->data[i].utf8_substr
10191             && !prog->substrs->data[i].substr) {
10192             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10193             if (! sv_utf8_downgrade(sv, TRUE)) {
10194                 return FALSE;
10195             }
10196             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10197                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10198                     /* Trim the trailing \n that fbm_compile added last
10199                         time.  */
10200                     SvCUR_set(sv, SvCUR(sv) - 1);
10201                     fbm_compile(sv, FBMcf_TAIL);
10202                 } else
10203                     fbm_compile(sv, 0);
10204             }
10205             prog->substrs->data[i].substr = sv;
10206             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10207                 prog->check_substr = sv;
10208         }
10209     } while (i--);
10210
10211     return TRUE;
10212 }
10213
10214 #ifndef PERL_IN_XSUB_RE
10215
10216 bool
10217 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10218 {
10219     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10220      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10221      * the larger string bounded by 'strbeg' and 'strend'.
10222      *
10223      * 'cp' needs to be assigned (if not a future version of the Unicode
10224      * Standard could make it something that combines with adjacent characters,
10225      * so code using it would then break), and there has to be a GCB break
10226      * before and after the character. */
10227
10228     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10229     const U8 * prev_cp_start;
10230
10231     PERL_ARGS_ASSERT__IS_GRAPHEME;
10232
10233     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10234         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10235     {
10236         /* These are considered graphemes */
10237         return TRUE;
10238     }
10239
10240     /* Otherwise, unassigned code points are forbidden */
10241     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10242                                     _invlist_search(PL_Assigned_invlist, cp))))
10243     {
10244         return FALSE;
10245     }
10246
10247     cp_gcb_val = getGCB_VAL_CP(cp);
10248
10249     /* Find the GCB value of the previous code point in the input */
10250     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10251     if (UNLIKELY(prev_cp_start == s)) {
10252         prev_cp_gcb_val = GCB_EDGE;
10253     }
10254     else {
10255         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10256     }
10257
10258     /* And check that is a grapheme boundary */
10259     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10260                 TRUE /* is UTF-8 encoded */ ))
10261     {
10262         return FALSE;
10263     }
10264
10265     /* Similarly verify there is a break between the current character and the
10266      * following one */
10267     s += UTF8SKIP(s);
10268     if (s >= strend) {
10269         next_cp_gcb_val = GCB_EDGE;
10270     }
10271     else {
10272         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10273     }
10274
10275     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10276 }
10277
10278 /*
10279 =head1 Unicode Support
10280
10281 =for apidoc isSCRIPT_RUN
10282
10283 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10284 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10285 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10286 two degenerate cases given below, this function returns TRUE iff all code
10287 points in it come from any combination of three "scripts" given by the Unicode
10288 "Script Extensions" property: Common, Inherited, and possibly one other.
10289 Additionally all decimal digits must come from the same consecutive sequence of
10290 10.
10291
10292 For example, if all the characters in the sequence are Greek, or Common, or
10293 Inherited, this function will return TRUE, provided any decimal digits in it
10294 are the ASCII digits "0".."9".  For scripts (unlike Greek) that have their own
10295 digits defined this will accept either digits from that set or from 0..9, but
10296 not a combination of the two.  Some scripts, such as Arabic, have more than one
10297 set of digits.  All digits must come from the same set for this function to
10298 return TRUE.
10299
10300 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10301 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10302 C<SCX_INVALID> if the function returns FALSE.
10303
10304 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10305 will be C<SCX_INVALID>.
10306
10307 If the sequence contains a single code point which is unassigned to a character
10308 in the version of Unicode being used, the function will return TRUE, and the
10309 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10310 in the input sequence will result in the function treating the input as not
10311 being a script run.
10312
10313 The returned script will be C<SCX_Inherited> iff all the code points in it are
10314 from the Inherited script.
10315
10316 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10317 it are from the Inherited or Common scripts.
10318
10319 =cut
10320
10321 */
10322
10323 bool
10324 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10325 {
10326     /* Basically, it looks at each character in the sequence to see if the
10327      * above conditions are met; if not it fails.  It uses an inversion map to
10328      * find the enum corresponding to the script of each character.  But this
10329      * is complicated by the fact that a few code points can be in any of
10330      * several scripts.  The data has been constructed so that there are
10331      * additional enum values (all negative) for these situations.  The
10332      * absolute value of those is an index into another table which contains
10333      * pointers to auxiliary tables for each such situation.  Each aux array
10334      * lists all the scripts for the given situation.  There is another,
10335      * parallel, table that gives the number of entries in each aux table.
10336      * These are all defined in charclass_invlists.h */
10337
10338     /* XXX Here are the additional things UTS 39 says could be done:
10339      *
10340      * Forbid sequences of the same nonspacing mark
10341      *
10342      * Check to see that all the characters are in the sets of exemplar
10343      * characters for at least one language in the Unicode Common Locale Data
10344      * Repository [CLDR]. */
10345
10346
10347     /* Things that match /\d/u */
10348     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10349     UV * decimals_array = invlist_array(decimals_invlist);
10350
10351     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10352      * not currently known) */
10353     UV zero_of_run = 0;
10354
10355     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10356     SCX_enum script_of_char = SCX_INVALID;
10357
10358     /* If the script remains not fully determined from iteration to iteration,
10359      * this is the current intersection of the possiblities.  */
10360     SCX_enum * intersection = NULL;
10361     PERL_UINT_FAST8_T intersection_len = 0;
10362
10363     bool retval = TRUE;
10364     SCX_enum * ret_script = NULL;
10365
10366     assert(send >= s);
10367
10368     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10369
10370     /* All code points in 0..255 are either Common or Latin, so must be a
10371      * script run.  We can return immediately unless we need to know which
10372      * script it is. */
10373     if (! utf8_target && LIKELY(send > s)) {
10374         if (ret_script == NULL) {
10375             return TRUE;
10376         }
10377
10378         /* If any character is Latin, the run is Latin */
10379         while (s < send) {
10380             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10381                 *ret_script = SCX_Latin;
10382                 return TRUE;
10383             }
10384         }
10385
10386         /* Here, all are Common */
10387         *ret_script = SCX_Common;
10388         return TRUE;
10389     }
10390
10391     /* Look at each character in the sequence */
10392     while (s < send) {
10393         /* If the current character being examined is a digit, this is the code
10394          * point of the zero for its sequence of 10 */
10395         UV zero_of_char;
10396
10397         UV cp;
10398
10399         /* The code allows all scripts to use the ASCII digits.  This is
10400          * because they are used in commerce even in scripts that have their
10401          * own set.  Hence any ASCII ones found are ok, unless and until a
10402          * digit from another set has already been encountered.  (The other
10403          * digit ranges in Common are not similarly blessed) */
10404         if (UNLIKELY(isDIGIT(*s))) {
10405             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10406                 retval = FALSE;
10407                 break;
10408             }
10409             if (zero_of_run) {
10410                 if (zero_of_run != '0') {
10411                     retval = FALSE;
10412                     break;
10413                 }
10414             }
10415             else {
10416                 zero_of_run = '0';
10417             }
10418             s++;
10419             continue;
10420         }
10421
10422         /* Here, isn't an ASCII digit.  Find the code point of the character */
10423         if (! UTF8_IS_INVARIANT(*s)) {
10424             Size_t len;
10425             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10426             s += len;
10427         }
10428         else {
10429             cp = *(s++);
10430         }
10431
10432         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10433          * digit in that script.  We can skip the rest of this code for this
10434          * character. */
10435         if (UNLIKELY(   zero_of_run
10436                      && cp >= zero_of_run
10437                      && cp - zero_of_run <= 9))
10438         {
10439             continue;
10440         }
10441
10442         /* Find the character's script.  The correct values are hard-coded here
10443          * for small-enough code points. */
10444         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10445                                unlikely to change */
10446             if (       cp > 255
10447                 || (   isALPHA_L1(cp)
10448                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10449             {
10450                 script_of_char = SCX_Latin;
10451             }
10452             else {
10453                 script_of_char = SCX_Common;
10454             }
10455         }
10456         else {
10457             script_of_char = _Perl_SCX_invmap[
10458                                        _invlist_search(PL_SCX_invlist, cp)];
10459         }
10460
10461         /* We arbitrarily accept a single unassigned character, but not in
10462          * combination with anything else, and not a run of them. */
10463         if (   UNLIKELY(script_of_run == SCX_Unknown)
10464             || UNLIKELY(   script_of_run != SCX_INVALID
10465                         && script_of_char == SCX_Unknown))
10466         {
10467             retval = FALSE;
10468             break;
10469         }
10470
10471         /* For the first character, or the run is inherited, the run's script
10472          * is set to the char's */
10473         if (   UNLIKELY(script_of_run == SCX_INVALID)
10474             || UNLIKELY(script_of_run == SCX_Inherited))
10475         {
10476             script_of_run = script_of_char;
10477         }
10478
10479         /* For the character's script to be Unknown, it must be the first
10480          * character in the sequence (for otherwise a test above would have
10481          * prevented us from reaching here), and we have set the run's script
10482          * to it.  Nothing further to be done for this character */
10483         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10484             continue;
10485         }
10486
10487         /* We accept 'inherited' script characters currently even at the
10488          * beginning.  (We know that no characters in Inherited are digits, or
10489          * we'd have to check for that) */
10490         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10491             continue;
10492         }
10493
10494         /* If the run so far is Common, and the new character isn't, change the
10495          * run's script to that of this character */
10496         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10497
10498             /* But Common contains several sets of digits.  Only the '0' set
10499              * can be part of another script. */
10500             if (zero_of_run && zero_of_run != '0') {
10501                 retval = FALSE;
10502                 break;
10503             }
10504
10505             script_of_run = script_of_char;
10506         }
10507
10508         /* Now we can see if the script of the character is the same as that of
10509          * the run */
10510         if (LIKELY(script_of_char == script_of_run)) {
10511             /* By far the most common case */
10512             goto scripts_match;
10513         }
10514
10515         /* Here, the script of the run isn't Common.  But characters in Common
10516          * match any script */
10517         if (script_of_char == SCX_Common) {
10518             goto scripts_match;
10519         }
10520
10521 #ifndef HAS_SCX_AUX_TABLES
10522
10523         /* Too early a Unicode version to have a code point belonging to more
10524          * than one script, so, if the scripts don't exactly match, fail */
10525         PERL_UNUSED_VAR(intersection_len);
10526         retval = FALSE;
10527         break;
10528
10529 #else
10530
10531         /* Here there is no exact match between the character's script and the
10532          * run's.  And we've handled the special cases of scripts Unknown,
10533          * Inherited, and Common.
10534          *
10535          * Negative script numbers signify that the value may be any of several
10536          * scripts, and we need to look at auxiliary information to make our
10537          * deterimination.  But if both are non-negative, we can fail now */
10538         if (LIKELY(script_of_char >= 0)) {
10539             const SCX_enum * search_in;
10540             PERL_UINT_FAST8_T search_in_len;
10541             PERL_UINT_FAST8_T i;
10542
10543             if (LIKELY(script_of_run >= 0)) {
10544                 retval = FALSE;
10545                 break;
10546             }
10547
10548             /* Use the previously constructed set of possible scripts, if any.
10549              * */
10550             if (intersection) {
10551                 search_in = intersection;
10552                 search_in_len = intersection_len;
10553             }
10554             else {
10555                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10556                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10557             }
10558
10559             for (i = 0; i < search_in_len; i++) {
10560                 if (search_in[i] == script_of_char) {
10561                     script_of_run = script_of_char;
10562                     goto scripts_match;
10563                 }
10564             }
10565
10566             retval = FALSE;
10567             break;
10568         }
10569         else if (LIKELY(script_of_run >= 0)) {
10570             /* script of character could be one of several, but run is a single
10571              * script */
10572             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10573             const PERL_UINT_FAST8_T search_in_len
10574                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10575             PERL_UINT_FAST8_T i;
10576
10577             for (i = 0; i < search_in_len; i++) {
10578                 if (search_in[i] == script_of_run) {
10579                     script_of_char = script_of_run;
10580                     goto scripts_match;
10581                 }
10582             }
10583
10584             retval = FALSE;
10585             break;
10586         }
10587         else {
10588             /* Both run and char could be in one of several scripts.  If the
10589              * intersection is empty, then this character isn't in this script
10590              * run.  Otherwise, we need to calculate the intersection to use
10591              * for future iterations of the loop, unless we are already at the
10592              * final character */
10593             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10594             const PERL_UINT_FAST8_T char_len
10595                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10596             const SCX_enum * search_run;
10597             PERL_UINT_FAST8_T run_len;
10598
10599             SCX_enum * new_overlap = NULL;
10600             PERL_UINT_FAST8_T i, j;
10601
10602             if (intersection) {
10603                 search_run = intersection;
10604                 run_len = intersection_len;
10605             }
10606             else {
10607                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10608                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10609             }
10610
10611             intersection_len = 0;
10612
10613             for (i = 0; i < run_len; i++) {
10614                 for (j = 0; j < char_len; j++) {
10615                     if (search_run[i] == search_char[j]) {
10616
10617                         /* Here, the script at i,j matches.  That means this
10618                          * character is in the run.  But continue on to find
10619                          * the complete intersection, for the next loop
10620                          * iteration, and for the digit check after it.
10621                          *
10622                          * On the first found common script, we malloc space
10623                          * for the intersection list for the worst case of the
10624                          * intersection, which is the minimum of the number of
10625                          * scripts remaining in each set. */
10626                         if (intersection_len == 0) {
10627                             Newx(new_overlap,
10628                                  MIN(run_len - i, char_len - j),
10629                                  SCX_enum);
10630                         }
10631                         new_overlap[intersection_len++] = search_run[i];
10632                     }
10633                 }
10634             }
10635
10636             /* Here we've looked through everything.  If they have no scripts
10637              * in common, not a run */
10638             if (intersection_len == 0) {
10639                 retval = FALSE;
10640                 break;
10641             }
10642
10643             /* If there is only a single script in common, set to that.
10644              * Otherwise, use the intersection going forward */
10645             Safefree(intersection);
10646             intersection = NULL;
10647             if (intersection_len == 1) {
10648                 script_of_run = script_of_char = new_overlap[0];
10649                 Safefree(new_overlap);
10650                 new_overlap = NULL;
10651             }
10652             else {
10653                 intersection = new_overlap;
10654             }
10655         }
10656
10657 #endif
10658
10659   scripts_match:
10660
10661         /* Here, the script of the character is compatible with that of the
10662          * run.  That means that in most cases, it continues the script run.
10663          * Either it and the run match exactly, or one or both can be in any of
10664          * several scripts, and the intersection is not empty.  However, if the
10665          * character is a decimal digit, it could still mean failure if it is
10666          * from the wrong sequence of 10.  So, we need to look at if it's a
10667          * digit.  We've already handled the 10 decimal digits, and the next
10668          * lowest one is this one: */
10669         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10670             continue;   /* Not a digit; this character is part of the run */
10671         }
10672
10673         /* If we have a definitive '0' for the script of this character, we
10674          * know that for this to be a digit, it must be in the range of +0..+9
10675          * of that zero. */
10676         if (   script_of_char >= 0
10677             && (zero_of_char = script_zeros[script_of_char]))
10678         {
10679             if (   cp < zero_of_char
10680                 || cp > zero_of_char + 9)
10681             {
10682                 continue;   /* Not a digit; this character is part of the run
10683                              */
10684             }
10685
10686         }
10687         else {  /* Need to look up if this character is a digit or not */
10688             SSize_t index_of_zero_of_char;
10689             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10690             if (     UNLIKELY(index_of_zero_of_char < 0)
10691                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10692             {
10693                 continue;   /* Not a digit; this character is part of the run.
10694                              */
10695             }
10696
10697             zero_of_char = decimals_array[index_of_zero_of_char];
10698         }
10699
10700         /* Here, the character is a decimal digit, and the zero of its sequence
10701          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
10702          * they better be the same. */
10703         if (zero_of_run) {
10704             if (zero_of_run != zero_of_char) {
10705                 retval = FALSE;
10706                 break;
10707             }
10708         }
10709         else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
10710
10711             /* Here, the script run isn't Common, but the current digit is in
10712              * Common, and isn't '0'-'9' (those were handled earlier).   Only
10713              * '0'-'9' are acceptable in non-Common scripts. */
10714             retval = FALSE;
10715             break;
10716         }
10717         else {  /* Otherwise we now have a zero for this run */
10718             zero_of_run = zero_of_char;
10719         }
10720     } /* end of looping through CLOSESR text */
10721
10722     Safefree(intersection);
10723
10724     if (ret_script != NULL) {
10725         if (retval) {
10726             *ret_script = script_of_run;
10727         }
10728         else {
10729             *ret_script = SCX_INVALID;
10730         }
10731     }
10732
10733     return retval;
10734 }
10735
10736 #endif /* ifndef PERL_IN_XSUB_RE */
10737
10738 /*
10739  * ex: set ts=8 sts=4 sw=4 et:
10740  */