regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "invlist_inline.h"
  84 #include "unicode_constants.h"
  85
  86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  88
  89 static const char utf8_locale_required[] =
  90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
  91
  92 #ifdef DEBUGGING
  93 /* At least one required character in the target string is expressible only in
  94  * UTF-8. */
  95 static const char* const non_utf8_target_but_utf8_required
  96                 = "Can't match, because target string needs to be in UTF-8\n";
  97 #endif
  98
  99 /* Returns a boolean as to whether the input unsigned number is a power of 2
 100  * (2**0, 2**1, etc).  In other words if it has just a single bit set.
 101  * If not, subtracting 1 would leave the uppermost bit set, so the & would
 102  * yield non-zero */
 103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
 104
 105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
 106     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
 107     goto target;                                                         \
 108 } STMT_END
 109
 110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 111
 112 #ifndef STATIC
 113 #define STATIC  static
 114 #endif
 115
 116 /* Valid only if 'c', the character being looke-up, is an invariant under
 117  * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
 118  * everything matchable is straight forward in the bitmap */
 119 #define REGINCLASS(prog,p,c,u)  (ANYOF_FLAGS(p)                             \
 120                                 ? reginclass(prog,p,c,c+1,u)                \
 121                                 : ANYOF_BITMAP_TEST(p,*(c)))
 122
 123 /*
 124  * Forwards.
 125  */
 126
 127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 128
 129 #define HOPc(pos,off) \
 130         (char *)(reginfo->is_utf8_target \
 131             ? reghop3((U8*)pos, off, \
 132                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 133             : (U8*)(pos + off))
 134
 135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
 136 #define HOPBACK3(pos, off, lim) \
 137         (reginfo->is_utf8_target                          \
 138             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
 139             : (pos - off >= lim)                                 \
 140                 ? (U8*)pos - off                                 \
 141                 : NULL)
 142
 143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
 144
 145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 147
 148 /* lim must be +ve. Returns NULL on overshoot */
 149 #define HOPMAYBE3(pos,off,lim) \
 150         (reginfo->is_utf8_target                        \
 151             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 152             : ((U8*)pos + off <= lim)                   \
 153                 ? (U8*)pos + off                        \
 154                 : NULL)
 155
 156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 157  * off must be >=0; args should be vars rather than expressions */
 158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 159     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 160     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
 162
 163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 164     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 165     : (U8*)(pos + off))
 166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 167
 168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 169 #define NEXTCHR_IS_EOS (nextchr < 0)
 170
 171 #define SET_nextchr \
 172     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 173
 174 #define SET_locinput(p) \
 175     locinput = (p);  \
 176     SET_nextchr
 177
 178 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 179 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 180
 181 /* for use after a quantifier and before an EXACT-like node -- japhy */
 182 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 183  *
 184  * NOTE that *nothing* that affects backtracking should be in here, specifically
 185  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 186  * node that is in between two EXACT like nodes when ascertaining what the required
 187  * "follow" character is. This should probably be moved to regex compile time
 188  * although it may be done at run time beause of the REF possibility - more
 189  * investigation required. -- demerphq
 190 */
 191 #define JUMPABLE(rn) (                                                             \
 192     OP(rn) == OPEN ||                                                              \
 193     (OP(rn) == CLOSE &&                                                            \
 194      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
 195     OP(rn) == EVAL ||                                                              \
 196     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 197     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 198     OP(rn) == KEEPS ||                                                             \
 199     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 200 )
 201 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 202
 203 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 204
 205 #if 0
 206 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 207    we don't need this definition.  XXX These are now out-of-sync*/
 208 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 209 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFAA || OP(rn)==EXACTFAA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 210 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 211
 212 #else
 213 /* ... so we use this as its faster. */
 214 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 215 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFAA || OP(rn) == EXACTFAA_NO_TRIE)
 216 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 217 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 218
 219 #endif
 220
 221 /*
 222   Search for mandatory following text node; for lookahead, the text must
 223   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 224 */
 225 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 226     while (JUMPABLE(rn)) { \
 227         const OPCODE type = OP(rn); \
 228         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 229             rn = NEXTOPER(NEXTOPER(rn)); \
 230         else if (type == PLUS) \
 231             rn = NEXTOPER(rn); \
 232         else if (type == IFMATCH) \
 233             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 234         else rn += NEXT_OFF(rn); \
 235     } \
 236 } STMT_END
 237
 238 #define SLAB_FIRST(s) (&(s)->states[0])
 239 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 240
 241 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 242 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 243 static regmatch_state * S_push_slab(pTHX);
 244
 245 #define REGCP_PAREN_ELEMS 3
 246 #define REGCP_OTHER_ELEMS 3
 247 #define REGCP_FRAME_ELEMS 1
 248 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 249  * are needed for the regexp context stack bookkeeping. */
 250
 251 STATIC CHECKPOINT
 252 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
 253 {
 254     const int retval = PL_savestack_ix;
 255     const int paren_elems_to_push =
 256                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 257     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 258     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 259     I32 p;
 260     GET_RE_DEBUG_FLAGS_DECL;
 261
 262     PERL_ARGS_ASSERT_REGCPPUSH;
 263
 264     if (paren_elems_to_push < 0)
 265         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 266                    (int)paren_elems_to_push, (int)maxopenparen,
 267                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 268
 269     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 270         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
 271                    " out of range (%lu-%ld)",
 272                    total_elems,
 273                    (unsigned long)maxopenparen,
 274                    (long)parenfloor);
 275
 276     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 277
 278     DEBUG_BUFFERS_r(
 279         if ((int)maxopenparen > (int)parenfloor)
 280             Perl_re_exec_indentf( aTHX_
 281                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
 282                 depth,
 283                 PTR2UV(rex),
 284                 PTR2UV(rex->offs)
 285             );
 286     );
 287     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 288 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 289         SSPUSHIV(rex->offs[p].end);
 290         SSPUSHIV(rex->offs[p].start);
 291         SSPUSHINT(rex->offs[p].start_tmp);
 292         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
 293             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
 294             depth,
 295             (UV)p,
 296             (IV)rex->offs[p].start,
 297             (IV)rex->offs[p].start_tmp,
 298             (IV)rex->offs[p].end
 299         ));
 300     }
 301 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 302     SSPUSHINT(maxopenparen);
 303     SSPUSHINT(rex->lastparen);
 304     SSPUSHINT(rex->lastcloseparen);
 305     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 306
 307     return retval;
 308 }
 309
 310 /* These are needed since we do not localize EVAL nodes: */
 311 #define REGCP_SET(cp)                                           \
 312     DEBUG_STATE_r(                                              \
 313         Perl_re_exec_indentf( aTHX_                             \
 314             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
 315             depth, (IV)PL_savestack_ix                          \
 316         )                                                       \
 317     );                                                          \
 318     cp = PL_savestack_ix
 319
 320 #define REGCP_UNWIND(cp)                                        \
 321     DEBUG_STATE_r(                                              \
 322         if (cp != PL_savestack_ix)                              \
 323             Perl_re_exec_indentf( aTHX_                         \
 324                 "Clearing an EVAL scope, savestack=%"           \
 325                 IVdf "..%" IVdf "\n",                           \
 326                 depth, (IV)(cp), (IV)PL_savestack_ix            \
 327             )                                                   \
 328     );                                                          \
 329     regcpblow(cp)
 330
 331 #define UNWIND_PAREN(lp, lcp)               \
 332     for (n = rex->lastparen; n > lp; n--)   \
 333         rex->offs[n].end = -1;              \
 334     rex->lastparen = n;                     \
 335     rex->lastcloseparen = lcp;
 336
 337
 338 STATIC void
 339 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
 340 {
 341     UV i;
 342     U32 paren;
 343     GET_RE_DEBUG_FLAGS_DECL;
 344
 345     PERL_ARGS_ASSERT_REGCPPOP;
 346
 347     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 348     i = SSPOPUV;
 349     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 350     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 351     rex->lastcloseparen = SSPOPINT;
 352     rex->lastparen = SSPOPINT;
 353     *maxopenparen_p = SSPOPINT;
 354
 355     i -= REGCP_OTHER_ELEMS;
 356     /* Now restore the parentheses context. */
 357     DEBUG_BUFFERS_r(
 358         if (i || rex->lastparen + 1 <= rex->nparens)
 359             Perl_re_exec_indentf( aTHX_
 360                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
 361                 depth,
 362                 PTR2UV(rex),
 363                 PTR2UV(rex->offs)
 364             );
 365     );
 366     paren = *maxopenparen_p;
 367     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 368         SSize_t tmps;
 369         rex->offs[paren].start_tmp = SSPOPINT;
 370         rex->offs[paren].start = SSPOPIV;
 371         tmps = SSPOPIV;
 372         if (paren <= rex->lastparen)
 373             rex->offs[paren].end = tmps;
 374         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 375             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
 376             depth,
 377             (UV)paren,
 378             (IV)rex->offs[paren].start,
 379             (IV)rex->offs[paren].start_tmp,
 380             (IV)rex->offs[paren].end,
 381             (paren > rex->lastparen ? "(skipped)" : ""));
 382         );
 383         paren--;
 384     }
 385 #if 1
 386     /* It would seem that the similar code in regtry()
 387      * already takes care of this, and in fact it is in
 388      * a better location to since this code can #if 0-ed out
 389      * but the code in regtry() is needed or otherwise tests
 390      * requiring null fields (pat.t#187 and split.t#{13,14}
 391      * (as of patchlevel 7877)  will fail.  Then again,
 392      * this code seems to be necessary or otherwise
 393      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 394      * --jhi updated by dapm */
 395     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 396         if (i > *maxopenparen_p)
 397             rex->offs[i].start = -1;
 398         rex->offs[i].end = -1;
 399         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
 400             "    \\%" UVuf ": %s   ..-1 undeffing\n",
 401             depth,
 402             (UV)i,
 403             (i > *maxopenparen_p) ? "-1" : "  "
 404         ));
 405     }
 406 #endif
 407 }
 408
 409 /* restore the parens and associated vars at savestack position ix,
 410  * but without popping the stack */
 411
 412 STATIC void
 413 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 414 {
 415     I32 tmpix = PL_savestack_ix;
 416     PERL_ARGS_ASSERT_REGCP_RESTORE;
 417
 418     PL_savestack_ix = ix;
 419     regcppop(rex, maxopenparen_p);
 420     PL_savestack_ix = tmpix;
 421 }
 422
 423 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 424
 425 #ifndef PERL_IN_XSUB_RE
 426
 427 bool
 428 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 429 {
 430     /* Returns a boolean as to whether or not 'character' is a member of the
 431      * Posix character class given by 'classnum' that should be equivalent to a
 432      * value in the typedef '_char_class_number'.
 433      *
 434      * Ideally this could be replaced by a just an array of function pointers
 435      * to the C library functions that implement the macros this calls.
 436      * However, to compile, the precise function signatures are required, and
 437      * these may vary from platform to to platform.  To avoid having to figure
 438      * out what those all are on each platform, I (khw) am using this method,
 439      * which adds an extra layer of function call overhead (unless the C
 440      * optimizer strips it away).  But we don't particularly care about
 441      * performance with locales anyway. */
 442
 443     switch ((_char_class_number) classnum) {
 444         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 445         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 446         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 447         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 448         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
 449                                         || isUPPER_LC(character);
 450         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 451         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 452         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 453         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 454         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 455         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 456         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 457         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 458         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 459         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 460         default:    /* VERTSPACE should never occur in locales */
 461             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 462     }
 463
 464     NOT_REACHED; /* NOTREACHED */
 465     return FALSE;
 466 }
 467
 468 #endif
 469
 470 STATIC bool
 471 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 472 {
 473     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 474      * 'character' is a member of the Posix character class given by 'classnum'
 475      * that should be equivalent to a value in the typedef
 476      * '_char_class_number'.
 477      *
 478      * This just calls isFOO_lc on the code point for the character if it is in
 479      * the range 0-255.  Outside that range, all characters use Unicode
 480      * rules, ignoring any locale.  So use the Unicode function if this class
 481      * requires a swash, and use the Unicode macro otherwise. */
 482
 483     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 484
 485     if (UTF8_IS_INVARIANT(*character)) {
 486         return isFOO_lc(classnum, *character);
 487     }
 488     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 489         return isFOO_lc(classnum,
 490                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
 491     }
 492
 493     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 494
 495     switch ((_char_class_number) classnum) {
 496         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 497         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 498         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 499         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 500         default:
 501             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
 502                                         utf8_to_uvchr_buf(character, e, NULL));
 503     }
 504
 505     return FALSE; /* Things like CNTRL are always below 256 */
 506 }
 507
 508 STATIC char *
 509 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
 510 {
 511     /* Returns the position of the first ASCII byte in the sequence between 's'
 512      * and 'send-1' inclusive; returns 'send' if none found */
 513
 514     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 515
 516 #ifndef EBCDIC
 517
 518     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 519
 520                             /* This term is wordsize if subword; 0 if not */
 521                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 522
 523                             /* 'offset' */
 524                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 525     {
 526
 527         /* Process per-byte until reach word boundary.  XXX This loop could be
 528          * eliminated if we knew that this platform had fast unaligned reads */
 529         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 530             if (isASCII(*s)) {
 531                 return s;
 532             }
 533             s++;    /* khw didn't bother creating a separate loop for
 534                        utf8_target */
 535         }
 536
 537         /* Here, we know we have at least one full word to process.  Process
 538          * per-word as long as we have at least a full word left */
 539         do {
 540             PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
 541             if (complemented & PERL_VARIANTS_WORD_MASK)  {
 542
 543 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 544      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 545
 546                 s += _variant_byte_number(complemented);
 547                 return s;
 548
 549 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 550            checks. */
 551
 552                 break;
 553 #  endif
 554             }
 555
 556             s += PERL_WORDSIZE;
 557
 558         } while (s + PERL_WORDSIZE <= send);
 559     }
 560
 561 #endif
 562
 563     /* Process per-character */
 564     if (utf8_target) {
 565         while (s < send) {
 566             if (isASCII(*s)) {
 567                 return s;
 568             }
 569             s += UTF8SKIP(s);
 570         }
 571     }
 572     else {
 573         while (s < send) {
 574             if (isASCII(*s)) {
 575                 return s;
 576             }
 577             s++;
 578         }
 579     }
 580
 581     return s;
 582 }
 583
 584 STATIC char *
 585 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
 586 {
 587     /* Returns the position of the first non-ASCII byte in the sequence between
 588      * 's' and 'send-1' inclusive; returns 'send' if none found */
 589
 590 #ifdef EBCDIC
 591
 592     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 593
 594     if (utf8_target) {
 595         while (s < send) {
 596             if ( ! isASCII(*s)) {
 597                 return s;
 598             }
 599             s += UTF8SKIP(s);
 600         }
 601     }
 602     else {
 603         while (s < send) {
 604             if ( ! isASCII(*s)) {
 605                 return s;
 606             }
 607             s++;
 608         }
 609     }
 610
 611     return s;
 612
 613 #else
 614
 615     const U8 * next_non_ascii = NULL;
 616
 617     PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
 618     PERL_UNUSED_ARG(utf8_target);
 619
 620     /* On ASCII platforms invariants and ASCII are identical, so if the string
 621      * is entirely invariants, there is no non-ASCII character */
 622     return (is_utf8_invariant_string_loc((U8 *) s,
 623                                          (STRLEN) (send - s),
 624                                          &next_non_ascii))
 625             ? (char *) send
 626             : (char *) next_non_ascii;
 627
 628 #endif
 629
 630 }
 631
 632 STATIC U8 *
 633 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 634 {
 635     /* Returns the position of the first byte in the sequence between 's' and
 636      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
 637      * */
 638
 639     PERL_ARGS_ASSERT_FIND_SPAN_END;
 640
 641     assert(send >= s);
 642
 643     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 644                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 645                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 646     {
 647         PERL_UINTMAX_T span_word;
 648
 649         /* Process per-byte until reach word boundary.  XXX This loop could be
 650          * eliminated if we knew that this platform had fast unaligned reads */
 651         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 652             if (*s != span_byte) {
 653                 return s;
 654             }
 655             s++;
 656         }
 657
 658         /* Create a word filled with the bytes we are spanning */
 659         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 660
 661         /* Process per-word as long as we have at least a full word left */
 662         do {
 663
 664             /* Keep going if the whole word is composed of 'span_byte's */
 665             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
 666                 s += PERL_WORDSIZE;
 667                 continue;
 668             }
 669
 670             /* Here, at least one byte in the word isn't 'span_byte'. */
 671
 672 #ifdef EBCDIC
 673
 674             break;
 675
 676 #else
 677
 678             /* This xor leaves 1 bits only in those non-matching bytes */
 679             span_word ^= * (PERL_UINTMAX_T *) s;
 680
 681             /* Make sure the upper bit of each non-matching byte is set.  This
 682              * makes each such byte look like an ASCII platform variant byte */
 683             span_word |= span_word << 1;
 684             span_word |= span_word << 2;
 685             span_word |= span_word << 4;
 686
 687             /* That reduces the problem to what this function solves */
 688             return s + _variant_byte_number(span_word);
 689
 690 #endif
 691
 692         } while (s + PERL_WORDSIZE <= send);
 693     }
 694
 695     /* Process the straggler bytes beyond the final word boundary */
 696     while (s < send) {
 697         if (*s != span_byte) {
 698             return s;
 699         }
 700         s++;
 701     }
 702
 703     return s;
 704 }
 705
 706 STATIC U8 *
 707 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
 708 {
 709     /* Returns the position of the first byte in the sequence between 's'
 710      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
 711      * returns 'send' if none found.  It uses word-level operations instead of
 712      * byte to speed up the process */
 713
 714     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
 715
 716     assert(send >= s);
 717     assert((byte & mask) == byte);
 718
 719 #ifndef EBCDIC
 720
 721     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 722                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 723                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 724     {
 725         PERL_UINTMAX_T word_complemented, mask_word;
 726
 727         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 728             if (((*s) & mask) == byte) {
 729                 return s;
 730             }
 731             s++;
 732         }
 733
 734         word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte);
 735         mask_word =            PERL_COUNT_MULTIPLIER * mask;
 736
 737         do {
 738             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 739
 740             /* If 'masked' contains 'byte' within it, anding with the
 741              * complement will leave those 8 bits 0 */
 742             masked &= word_complemented;
 743
 744             /* This causes the most significant bit to be set to 1 for any
 745              * bytes in the word that aren't completely 0 */
 746             masked |= masked << 1;
 747             masked |= masked << 2;
 748             masked |= masked << 4;
 749
 750             /* The msbits are the same as what marks a byte as variant, so we
 751              * can use this mask.  If all msbits are 1, the word doesn't
 752              * contain 'byte' */
 753             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
 754                 s += PERL_WORDSIZE;
 755                 continue;
 756             }
 757
 758             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
 759              * and any that are, are 0.  Complement and re-AND to swap that */
 760             masked = ~ masked;
 761             masked &= PERL_VARIANTS_WORD_MASK;
 762
 763             /* This reduces the problem to that solved by this function */
 764             s += _variant_byte_number(masked);
 765             return s;
 766
 767         } while (s + PERL_WORDSIZE <= send);
 768     }
 769
 770 #endif
 771
 772     while (s < send) {
 773         if (((*s) & mask) == byte) {
 774             return s;
 775         }
 776         s++;
 777     }
 778
 779     return s;
 780 }
 781
 782 STATIC U8 *
 783 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
 784 {
 785     /* Returns the position of the first byte in the sequence between 's' and
 786      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
 787      * 'span_byte' should have been ANDed with 'mask' in the call of this
 788      * function.  Returns 'send' if none found.  Works like find_span_end(),
 789      * except for the AND */
 790
 791     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
 792
 793     assert(send >= s);
 794     assert((span_byte & mask) == span_byte);
 795
 796     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 797                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
 798                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
 799     {
 800         PERL_UINTMAX_T span_word, mask_word;
 801
 802         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
 803             if (((*s) & mask) != span_byte) {
 804                 return s;
 805             }
 806             s++;
 807         }
 808
 809         span_word = PERL_COUNT_MULTIPLIER * span_byte;
 810         mask_word = PERL_COUNT_MULTIPLIER * mask;
 811
 812         do {
 813             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
 814
 815             if (masked == span_word) {
 816                 s += PERL_WORDSIZE;
 817                 continue;
 818             }
 819
 820 #ifdef EBCDIC
 821
 822             break;
 823
 824 #else
 825
 826             masked ^= span_word;
 827             masked |= masked << 1;
 828             masked |= masked << 2;
 829             masked |= masked << 4;
 830             return s + _variant_byte_number(masked);
 831
 832 #endif
 833
 834         } while (s + PERL_WORDSIZE <= send);
 835     }
 836
 837     while (s < send) {
 838         if (((*s) & mask) != span_byte) {
 839             return s;
 840         }
 841         s++;
 842     }
 843
 844     return s;
 845 }
 846
 847 /*
 848  * pregexec and friends
 849  */
 850
 851 #ifndef PERL_IN_XSUB_RE
 852 /*
 853  - pregexec - match a regexp against a string
 854  */
 855 I32
 856 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 857          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 858 /* stringarg: the point in the string at which to begin matching */
 859 /* strend:    pointer to null at end of string */
 860 /* strbeg:    real beginning of string */
 861 /* minend:    end of match must be >= minend bytes after stringarg. */
 862 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 863  *            itself is accessed via the pointers above */
 864 /* nosave:    For optimizations. */
 865 {
 866     PERL_ARGS_ASSERT_PREGEXEC;
 867
 868     return
 869         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 870                       nosave ? 0 : REXEC_COPY_STR);
 871 }
 872 #endif
 873
 874
 875
 876 /* re_intuit_start():
 877  *
 878  * Based on some optimiser hints, try to find the earliest position in the
 879  * string where the regex could match.
 880  *
 881  *   rx:     the regex to match against
 882  *   sv:     the SV being matched: only used for utf8 flag; the string
 883  *           itself is accessed via the pointers below. Note that on
 884  *           something like an overloaded SV, SvPOK(sv) may be false
 885  *           and the string pointers may point to something unrelated to
 886  *           the SV itself.
 887  *   strbeg: real beginning of string
 888  *   strpos: the point in the string at which to begin matching
 889  *   strend: pointer to the byte following the last char of the string
 890  *   flags   currently unused; set to 0
 891  *   data:   currently unused; set to NULL
 892  *
 893  * The basic idea of re_intuit_start() is to use some known information
 894  * about the pattern, namely:
 895  *
 896  *   a) the longest known anchored substring (i.e. one that's at a
 897  *      constant offset from the beginning of the pattern; but not
 898  *      necessarily at a fixed offset from the beginning of the
 899  *      string);
 900  *   b) the longest floating substring (i.e. one that's not at a constant
 901  *      offset from the beginning of the pattern);
 902  *   c) Whether the pattern is anchored to the string; either
 903  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 904  *      or anchored to pos(): /\G/;
 905  *   d) A start class: a real or synthetic character class which
 906  *      represents which characters are legal at the start of the pattern;
 907  *
 908  * to either quickly reject the match, or to find the earliest position
 909  * within the string at which the pattern might match, thus avoiding
 910  * running the full NFA engine at those earlier locations, only to
 911  * eventually fail and retry further along.
 912  *
 913  * Returns NULL if the pattern can't match, or returns the address within
 914  * the string which is the earliest place the match could occur.
 915  *
 916  * The longest of the anchored and floating substrings is called 'check'
 917  * and is checked first. The other is called 'other' and is checked
 918  * second. The 'other' substring may not be present.  For example,
 919  *
 920  *    /(abc|xyz)ABC\d{0,3}DEFG/
 921  *
 922  * will have
 923  *
 924  *   check substr (float)    = "DEFG", offset 6..9 chars
 925  *   other substr (anchored) = "ABC",  offset 3..3 chars
 926  *   stclass = [ax]
 927  *
 928  * Be aware that during the course of this function, sometimes 'anchored'
 929  * refers to a substring being anchored relative to the start of the
 930  * pattern, and sometimes to the pattern itself being anchored relative to
 931  * the string. For example:
 932  *
 933  *   /\dabc/:   "abc" is anchored to the pattern;
 934  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 935  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 936  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 937  *                    but the pattern is anchored to the string.
 938  */
 939
 940 char *
 941 Perl_re_intuit_start(pTHX_
 942                     REGEXP * const rx,
 943                     SV *sv,
 944                     const char * const strbeg,
 945                     char *strpos,
 946                     char *strend,
 947                     const U32 flags,
 948                     re_scream_pos_data *data)
 949 {
 950     struct regexp *const prog = ReANY(rx);
 951     SSize_t start_shift = prog->check_offset_min;
 952     /* Should be nonnegative! */
 953     SSize_t end_shift   = 0;
 954     /* current lowest pos in string where the regex can start matching */
 955     char *rx_origin = strpos;
 956     SV *check;
 957     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 958     U8   other_ix = 1 - prog->substrs->check_ix;
 959     bool ml_anch = 0;
 960     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 961     char *check_at = NULL;              /* check substr found at this pos */
 962     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 963     RXi_GET_DECL(prog,progi);
 964     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 965     regmatch_info *const reginfo = &reginfo_buf;
 966     GET_RE_DEBUG_FLAGS_DECL;
 967
 968     PERL_ARGS_ASSERT_RE_INTUIT_START;
 969     PERL_UNUSED_ARG(flags);
 970     PERL_UNUSED_ARG(data);
 971
 972     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
 973                 "Intuit: trying to determine minimum start position...\n"));
 974
 975     /* for now, assume that all substr offsets are positive. If at some point
 976      * in the future someone wants to do clever things with lookbehind and
 977      * -ve offsets, they'll need to fix up any code in this function
 978      * which uses these offsets. See the thread beginning
 979      * <20140113145929.GF27210@iabyn.com>
 980      */
 981     assert(prog->substrs->data[0].min_offset >= 0);
 982     assert(prog->substrs->data[0].max_offset >= 0);
 983     assert(prog->substrs->data[1].min_offset >= 0);
 984     assert(prog->substrs->data[1].max_offset >= 0);
 985     assert(prog->substrs->data[2].min_offset >= 0);
 986     assert(prog->substrs->data[2].max_offset >= 0);
 987
 988     /* for now, assume that if both present, that the floating substring
 989      * doesn't start before the anchored substring.
 990      * If you break this assumption (e.g. doing better optimisations
 991      * with lookahead/behind), then you'll need to audit the code in this
 992      * function carefully first
 993      */
 994     assert(
 995             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 996               && (prog->float_utf8    || prog->float_substr))
 997            || (prog->float_min_offset >= prog->anchored_offset));
 998
 999     /* byte rather than char calculation for efficiency. It fails
1000      * to quickly reject some cases that can't match, but will reject
1001      * them later after doing full char arithmetic */
1002     if (prog->minlen > strend - strpos) {
1003         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1004                               "  String too short...\n"));
1005         goto fail;
1006     }
1007
1008     RXp_MATCH_UTF8_set(prog, utf8_target);
1009     reginfo->is_utf8_target = cBOOL(utf8_target);
1010     reginfo->info_aux = NULL;
1011     reginfo->strbeg = strbeg;
1012     reginfo->strend = strend;
1013     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1014     reginfo->intuit = 1;
1015     /* not actually used within intuit, but zero for safety anyway */
1016     reginfo->poscache_maxiter = 0;
1017
1018     if (utf8_target) {
1019         if ((!prog->anchored_utf8 && prog->anchored_substr)
1020                 || (!prog->float_utf8 && prog->float_substr))
1021             to_utf8_substr(prog);
1022         check = prog->check_utf8;
1023     } else {
1024         if (!prog->check_substr && prog->check_utf8) {
1025             if (! to_byte_substr(prog)) {
1026                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1027             }
1028         }
1029         check = prog->check_substr;
1030     }
1031
1032     /* dump the various substring data */
1033     DEBUG_OPTIMISE_MORE_r({
1034         int i;
1035         for (i=0; i<=2; i++) {
1036             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1037                                   : prog->substrs->data[i].substr);
1038             if (!sv)
1039                 continue;
1040
1041             Perl_re_printf( aTHX_
1042                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1043                 " useful=%" IVdf " utf8=%d [%s]\n",
1044                 i,
1045                 (IV)prog->substrs->data[i].min_offset,
1046                 (IV)prog->substrs->data[i].max_offset,
1047                 (IV)prog->substrs->data[i].end_shift,
1048                 BmUSEFUL(sv),
1049                 utf8_target ? 1 : 0,
1050                 SvPEEK(sv));
1051         }
1052     });
1053
1054     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1055
1056         /* ml_anch: check after \n?
1057          *
1058          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1059          * with /.*.../, these flags will have been added by the
1060          * compiler:
1061          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
1062          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
1063          */
1064         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
1065                    && !(prog->intflags & PREGf_IMPLICIT);
1066
1067         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1068             /* we are only allowed to match at BOS or \G */
1069
1070             /* trivially reject if there's a BOS anchor and we're not at BOS.
1071              *
1072              * Note that we don't try to do a similar quick reject for
1073              * \G, since generally the caller will have calculated strpos
1074              * based on pos() and gofs, so the string is already correctly
1075              * anchored by definition; and handling the exceptions would
1076              * be too fiddly (e.g. REXEC_IGNOREPOS).
1077              */
1078             if (   strpos != strbeg
1079                 && (prog->intflags & PREGf_ANCH_SBOL))
1080             {
1081                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1082                                 "  Not at start...\n"));
1083                 goto fail;
1084             }
1085
1086             /* in the presence of an anchor, the anchored (relative to the
1087              * start of the regex) substr must also be anchored relative
1088              * to strpos. So quickly reject if substr isn't found there.
1089              * This works for \G too, because the caller will already have
1090              * subtracted gofs from pos, and gofs is the offset from the
1091              * \G to the start of the regex. For example, in /.abc\Gdef/,
1092              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1093              * caller will have set strpos=pos()-4; we look for the substr
1094              * at position pos()-4+1, which lines up with the "a" */
1095
1096             if (prog->check_offset_min == prog->check_offset_max) {
1097                 /* Substring at constant offset from beg-of-str... */
1098                 SSize_t slen = SvCUR(check);
1099                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1100
1101                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1102                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1103                     (IV)prog->check_offset_min));
1104
1105                 if (SvTAIL(check)) {
1106                     /* In this case, the regex is anchored at the end too.
1107                      * Unless it's a multiline match, the lengths must match
1108                      * exactly, give or take a \n.  NB: slen >= 1 since
1109                      * the last char of check is \n */
1110                     if (!multiline
1111                         && (   strend - s > slen
1112                             || strend - s < slen - 1
1113                             || (strend - s == slen && strend[-1] != '\n')))
1114                     {
1115                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1116                                             "  String too long...\n"));
1117                         goto fail_finish;
1118                     }
1119                     /* Now should match s[0..slen-2] */
1120                     slen--;
1121                 }
1122                 if (slen && (strend - s < slen
1123                     || *SvPVX_const(check) != *s
1124                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1125                 {
1126                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1127                                     "  String not equal...\n"));
1128                     goto fail_finish;
1129                 }
1130
1131                 check_at = s;
1132                 goto success_at_start;
1133             }
1134         }
1135     }
1136
1137     end_shift = prog->check_end_shift;
1138
1139 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
1140     if (end_shift < 0)
1141         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1142                    (IV)end_shift, RX_PRECOMP(rx));
1143 #endif
1144
1145   restart:
1146
1147     /* This is the (re)entry point of the main loop in this function.
1148      * The goal of this loop is to:
1149      * 1) find the "check" substring in the region rx_origin..strend
1150      *    (adjusted by start_shift / end_shift). If not found, reject
1151      *    immediately.
1152      * 2) If it exists, look for the "other" substr too if defined; for
1153      *    example, if the check substr maps to the anchored substr, then
1154      *    check the floating substr, and vice-versa. If not found, go
1155      *    back to (1) with rx_origin suitably incremented.
1156      * 3) If we find an rx_origin position that doesn't contradict
1157      *    either of the substrings, then check the possible additional
1158      *    constraints on rx_origin of /^.../m or a known start class.
1159      *    If these fail, then depending on which constraints fail, jump
1160      *    back to here, or to various other re-entry points further along
1161      *    that skip some of the first steps.
1162      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1163      *    substring. If the start position was determined to be at the
1164      *    beginning of the string  - so, not rejected, but not optimised,
1165      *    since we have to run regmatch from position 0 - decrement the
1166      *    BmUSEFUL() count. Otherwise increment it.
1167      */
1168
1169
1170     /* first, look for the 'check' substring */
1171
1172     {
1173         U8* start_point;
1174         U8* end_point;
1175
1176         DEBUG_OPTIMISE_MORE_r({
1177             Perl_re_printf( aTHX_
1178                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1179                 " Start shift: %" IVdf " End shift %" IVdf
1180                 " Real end Shift: %" IVdf "\n",
1181                 (IV)(rx_origin - strbeg),
1182                 (IV)prog->check_offset_min,
1183                 (IV)start_shift,
1184                 (IV)end_shift,
1185                 (IV)prog->check_end_shift);
1186         });
1187
1188         end_point = HOPBACK3(strend, end_shift, rx_origin);
1189         if (!end_point)
1190             goto fail_finish;
1191         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1192         if (!start_point)
1193             goto fail_finish;
1194
1195
1196         /* If the regex is absolutely anchored to either the start of the
1197          * string (SBOL) or to pos() (ANCH_GPOS), then
1198          * check_offset_max represents an upper bound on the string where
1199          * the substr could start. For the ANCH_GPOS case, we assume that
1200          * the caller of intuit will have already set strpos to
1201          * pos()-gofs, so in this case strpos + offset_max will still be
1202          * an upper bound on the substr.
1203          */
1204         if (!ml_anch
1205             && prog->intflags & PREGf_ANCH
1206             && prog->check_offset_max != SSize_t_MAX)
1207         {
1208             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1209             const char * const anchor =
1210                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1211             SSize_t targ_len = (char*)end_point - anchor;
1212
1213             if (check_len > targ_len) {
1214                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1215                               "Target string too short to match required substring...\n"));
1216                 goto fail_finish;
1217             }
1218
1219             /* do a bytes rather than chars comparison. It's conservative;
1220              * so it skips doing the HOP if the result can't possibly end
1221              * up earlier than the old value of end_point.
1222              */
1223             assert(anchor + check_len <= (char *)end_point);
1224             if (prog->check_offset_max + check_len < targ_len) {
1225                 end_point = HOP3lim((U8*)anchor,
1226                                 prog->check_offset_max,
1227                                 end_point - check_len
1228                             )
1229                             + check_len;
1230                 if (end_point < start_point)
1231                     goto fail_finish;
1232             }
1233         }
1234
1235         check_at = fbm_instr( start_point, end_point,
1236                       check, multiline ? FBMrf_MULTILINE : 0);
1237
1238         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1239             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1240             (IV)((char*)start_point - strbeg),
1241             (IV)((char*)end_point   - strbeg),
1242             (IV)(check_at ? check_at - strbeg : -1)
1243         ));
1244
1245         /* Update the count-of-usability, remove useless subpatterns,
1246             unshift s.  */
1247
1248         DEBUG_EXECUTE_r({
1249             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1250                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1251             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1252                               (check_at ? "Found" : "Did not find"),
1253                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1254                     ? "anchored" : "floating"),
1255                 quoted,
1256                 RE_SV_TAIL(check),
1257                 (check_at ? " at offset " : "...\n") );
1258         });
1259
1260         if (!check_at)
1261             goto fail_finish;
1262         /* set rx_origin to the minimum position where the regex could start
1263          * matching, given the constraint of the just-matched check substring.
1264          * But don't set it lower than previously.
1265          */
1266
1267         if (check_at - rx_origin > prog->check_offset_max)
1268             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1269         /* Finish the diagnostic message */
1270         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1271             "%ld (rx_origin now %" IVdf ")...\n",
1272             (long)(check_at - strbeg),
1273             (IV)(rx_origin - strbeg)
1274         ));
1275     }
1276
1277
1278     /* now look for the 'other' substring if defined */
1279
1280     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1281                     : prog->substrs->data[other_ix].substr)
1282     {
1283         /* Take into account the "other" substring. */
1284         char *last, *last1;
1285         char *s;
1286         SV* must;
1287         struct reg_substr_datum *other;
1288
1289       do_other_substr:
1290         other = &prog->substrs->data[other_ix];
1291
1292         /* if "other" is anchored:
1293          * we've previously found a floating substr starting at check_at.
1294          * This means that the regex origin must lie somewhere
1295          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1296          * and max:                 HOP3(check_at, -check_offset_min)
1297          * (except that min will be >= strpos)
1298          * So the fixed  substr must lie somewhere between
1299          *  HOP3(min, anchored_offset)
1300          *  HOP3(max, anchored_offset) + SvCUR(substr)
1301          */
1302
1303         /* if "other" is floating
1304          * Calculate last1, the absolute latest point where the
1305          * floating substr could start in the string, ignoring any
1306          * constraints from the earlier fixed match. It is calculated
1307          * as follows:
1308          *
1309          * strend - prog->minlen (in chars) is the absolute latest
1310          * position within the string where the origin of the regex
1311          * could appear. The latest start point for the floating
1312          * substr is float_min_offset(*) on from the start of the
1313          * regex.  last1 simply combines thee two offsets.
1314          *
1315          * (*) You might think the latest start point should be
1316          * float_max_offset from the regex origin, and technically
1317          * you'd be correct. However, consider
1318          *    /a\d{2,4}bcd\w/
1319          * Here, float min, max are 3,5 and minlen is 7.
1320          * This can match either
1321          *    /a\d\dbcd\w/
1322          *    /a\d\d\dbcd\w/
1323          *    /a\d\d\d\dbcd\w/
1324          * In the first case, the regex matches minlen chars; in the
1325          * second, minlen+1, in the third, minlen+2.
1326          * In the first case, the floating offset is 3 (which equals
1327          * float_min), in the second, 4, and in the third, 5 (which
1328          * equals float_max). In all cases, the floating string bcd
1329          * can never start more than 4 chars from the end of the
1330          * string, which equals minlen - float_min. As the substring
1331          * starts to match more than float_min from the start of the
1332          * regex, it makes the regex match more than minlen chars,
1333          * and the two cancel each other out. So we can always use
1334          * float_min - minlen, rather than float_max - minlen for the
1335          * latest position in the string.
1336          *
1337          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1338          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1339          */
1340
1341         assert(prog->minlen >= other->min_offset);
1342         last1 = HOP3c(strend,
1343                         other->min_offset - prog->minlen, strbeg);
1344
1345         if (other_ix) {/* i.e. if (other-is-float) */
1346             /* last is the latest point where the floating substr could
1347              * start, *given* any constraints from the earlier fixed
1348              * match. This constraint is that the floating string starts
1349              * <= float_max_offset chars from the regex origin (rx_origin).
1350              * If this value is less than last1, use it instead.
1351              */
1352             assert(rx_origin <= last1);
1353             last =
1354                 /* this condition handles the offset==infinity case, and
1355                  * is a short-cut otherwise. Although it's comparing a
1356                  * byte offset to a char length, it does so in a safe way,
1357                  * since 1 char always occupies 1 or more bytes,
1358                  * so if a string range is  (last1 - rx_origin) bytes,
1359                  * it will be less than or equal to  (last1 - rx_origin)
1360                  * chars; meaning it errs towards doing the accurate HOP3
1361                  * rather than just using last1 as a short-cut */
1362                 (last1 - rx_origin) < other->max_offset
1363                     ? last1
1364                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1365         }
1366         else {
1367             assert(strpos + start_shift <= check_at);
1368             last = HOP4c(check_at, other->min_offset - start_shift,
1369                         strbeg, strend);
1370         }
1371
1372         s = HOP3c(rx_origin, other->min_offset, strend);
1373         if (s < other_last)     /* These positions already checked */
1374             s = other_last;
1375
1376         must = utf8_target ? other->utf8_substr : other->substr;
1377         assert(SvPOK(must));
1378         {
1379             char *from = s;
1380             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1381
1382             if (to > strend)
1383                 to = strend;
1384             if (from > to) {
1385                 s = NULL;
1386                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1387                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1388                     (IV)(from - strbeg),
1389                     (IV)(to   - strbeg)
1390                 ));
1391             }
1392             else {
1393                 s = fbm_instr(
1394                     (unsigned char*)from,
1395                     (unsigned char*)to,
1396                     must,
1397                     multiline ? FBMrf_MULTILINE : 0
1398                 );
1399                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1400                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1401                     (IV)(from - strbeg),
1402                     (IV)(to   - strbeg),
1403                     (IV)(s ? s - strbeg : -1)
1404                 ));
1405             }
1406         }
1407
1408         DEBUG_EXECUTE_r({
1409             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1410                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1411             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1412                 s ? "Found" : "Contradicts",
1413                 other_ix ? "floating" : "anchored",
1414                 quoted, RE_SV_TAIL(must));
1415         });
1416
1417
1418         if (!s) {
1419             /* last1 is latest possible substr location. If we didn't
1420              * find it before there, we never will */
1421             if (last >= last1) {
1422                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1423                                         "; giving up...\n"));
1424                 goto fail_finish;
1425             }
1426
1427             /* try to find the check substr again at a later
1428              * position. Maybe next time we'll find the "other" substr
1429              * in range too */
1430             other_last = HOP3c(last, 1, strend) /* highest failure */;
1431             rx_origin =
1432                 other_ix /* i.e. if other-is-float */
1433                     ? HOP3c(rx_origin, 1, strend)
1434                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1435             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1436                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1437                 (other_ix ? "floating" : "anchored"),
1438                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1439                 (IV)(rx_origin - strbeg)
1440             ));
1441             goto restart;
1442         }
1443         else {
1444             if (other_ix) { /* if (other-is-float) */
1445                 /* other_last is set to s, not s+1, since its possible for
1446                  * a floating substr to fail first time, then succeed
1447                  * second time at the same floating position; e.g.:
1448                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1449                  * The first time round, anchored and float match at
1450                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1451                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1452                  */
1453                 other_last = s;
1454             }
1455             else {
1456                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1457                 other_last = HOP3c(s, 1, strend);
1458             }
1459             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1460                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1461                   (long)(s - strbeg),
1462                 (IV)(rx_origin - strbeg)
1463               ));
1464
1465         }
1466     }
1467     else {
1468         DEBUG_OPTIMISE_MORE_r(
1469             Perl_re_printf( aTHX_
1470                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1471                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1472                 " strend:%" IVdf "\n",
1473                 (IV)prog->check_offset_min,
1474                 (IV)prog->check_offset_max,
1475                 (IV)(check_at-strbeg),
1476                 (IV)(rx_origin-strbeg),
1477                 (IV)(rx_origin-check_at),
1478                 (IV)(strend-strbeg)
1479             )
1480         );
1481     }
1482
1483   postprocess_substr_matches:
1484
1485     /* handle the extra constraint of /^.../m if present */
1486
1487     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1488         char *s;
1489
1490         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1491                         "  looking for /^/m anchor"));
1492
1493         /* we have failed the constraint of a \n before rx_origin.
1494          * Find the next \n, if any, even if it's beyond the current
1495          * anchored and/or floating substrings. Whether we should be
1496          * scanning ahead for the next \n or the next substr is debatable.
1497          * On the one hand you'd expect rare substrings to appear less
1498          * often than \n's. On the other hand, searching for \n means
1499          * we're effectively flipping between check_substr and "\n" on each
1500          * iteration as the current "rarest" string candidate, which
1501          * means for example that we'll quickly reject the whole string if
1502          * hasn't got a \n, rather than trying every substr position
1503          * first
1504          */
1505
1506         s = HOP3c(strend, - prog->minlen, strpos);
1507         if (s <= rx_origin ||
1508             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1509         {
1510             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1511                             "  Did not find /%s^%s/m...\n",
1512                             PL_colors[0], PL_colors[1]));
1513             goto fail_finish;
1514         }
1515
1516         /* earliest possible origin is 1 char after the \n.
1517          * (since *rx_origin == '\n', it's safe to ++ here rather than
1518          * HOP(rx_origin, 1)) */
1519         rx_origin++;
1520
1521         if (prog->substrs->check_ix == 0  /* check is anchored */
1522             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1523         {
1524             /* Position contradicts check-string; either because
1525              * check was anchored (and thus has no wiggle room),
1526              * or check was float and rx_origin is above the float range */
1527             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1528                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1529                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1530             goto restart;
1531         }
1532
1533         /* if we get here, the check substr must have been float,
1534          * is in range, and we may or may not have had an anchored
1535          * "other" substr which still contradicts */
1536         assert(prog->substrs->check_ix); /* check is float */
1537
1538         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1539             /* whoops, the anchored "other" substr exists, so we still
1540              * contradict. On the other hand, the float "check" substr
1541              * didn't contradict, so just retry the anchored "other"
1542              * substr */
1543             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1544                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1545                 PL_colors[0], PL_colors[1],
1546                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1547                 (IV)(rx_origin - strbeg)
1548             ));
1549             goto do_other_substr;
1550         }
1551
1552         /* success: we don't contradict the found floating substring
1553          * (and there's no anchored substr). */
1554         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1555             "  Found /%s^%s/m with rx_origin %ld...\n",
1556             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1557     }
1558     else {
1559         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1560             "  (multiline anchor test skipped)\n"));
1561     }
1562
1563   success_at_start:
1564
1565
1566     /* if we have a starting character class, then test that extra constraint.
1567      * (trie stclasses are too expensive to use here, we are better off to
1568      * leave it to regmatch itself) */
1569
1570     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1571         const U8* const str = (U8*)STRING(progi->regstclass);
1572
1573         /* XXX this value could be pre-computed */
1574         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1575                     ?  (reginfo->is_utf8_pat
1576                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1577                         : STR_LEN(progi->regstclass))
1578                     : 1);
1579         char * endpos;
1580         char *s;
1581         /* latest pos that a matching float substr constrains rx start to */
1582         char *rx_max_float = NULL;
1583
1584         /* if the current rx_origin is anchored, either by satisfying an
1585          * anchored substring constraint, or a /^.../m constraint, then we
1586          * can reject the current origin if the start class isn't found
1587          * at the current position. If we have a float-only match, then
1588          * rx_origin is constrained to a range; so look for the start class
1589          * in that range. if neither, then look for the start class in the
1590          * whole rest of the string */
1591
1592         /* XXX DAPM it's not clear what the minlen test is for, and why
1593          * it's not used in the floating case. Nothing in the test suite
1594          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1595          * Here are some old comments, which may or may not be correct:
1596          *
1597          *   minlen == 0 is possible if regstclass is \b or \B,
1598          *   and the fixed substr is ''$.
1599          *   Since minlen is already taken into account, rx_origin+1 is
1600          *   before strend; accidentally, minlen >= 1 guaranties no false
1601          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1602          *   0) below assumes that regstclass does not come from lookahead...
1603          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1604          *   This leaves EXACTF-ish only, which are dealt with in
1605          *   find_byclass().
1606          */
1607
1608         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1609             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1610         else if (prog->float_substr || prog->float_utf8) {
1611             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1612             endpos = HOP3clim(rx_max_float, cl_l, strend);
1613         }
1614         else
1615             endpos= strend;
1616
1617         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1618             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1619             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1620               (IV)start_shift, (IV)(check_at - strbeg),
1621               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1622
1623         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1624                             reginfo);
1625         if (!s) {
1626             if (endpos == strend) {
1627                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1628                                 "  Could not match STCLASS...\n") );
1629                 goto fail;
1630             }
1631             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1632                                "  This position contradicts STCLASS...\n") );
1633             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1634                         && !(prog->intflags & PREGf_IMPLICIT))
1635                 goto fail;
1636
1637             /* Contradict one of substrings */
1638             if (prog->anchored_substr || prog->anchored_utf8) {
1639                 if (prog->substrs->check_ix == 1) { /* check is float */
1640                     /* Have both, check_string is floating */
1641                     assert(rx_origin + start_shift <= check_at);
1642                     if (rx_origin + start_shift != check_at) {
1643                         /* not at latest position float substr could match:
1644                          * Recheck anchored substring, but not floating.
1645                          * The condition above is in bytes rather than
1646                          * chars for efficiency. It's conservative, in
1647                          * that it errs on the side of doing 'goto
1648                          * do_other_substr'. In this case, at worst,
1649                          * an extra anchored search may get done, but in
1650                          * practice the extra fbm_instr() is likely to
1651                          * get skipped anyway. */
1652                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1653                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1654                             (long)(other_last - strbeg),
1655                             (IV)(rx_origin - strbeg)
1656                         ));
1657                         goto do_other_substr;
1658                     }
1659                 }
1660             }
1661             else {
1662                 /* float-only */
1663
1664                 if (ml_anch) {
1665                     /* In the presence of ml_anch, we might be able to
1666                      * find another \n without breaking the current float
1667                      * constraint. */
1668
1669                     /* strictly speaking this should be HOP3c(..., 1, ...),
1670                      * but since we goto a block of code that's going to
1671                      * search for the next \n if any, its safe here */
1672                     rx_origin++;
1673                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1674                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1675                               PL_colors[0], PL_colors[1],
1676                               (long)(rx_origin - strbeg)) );
1677                     goto postprocess_substr_matches;
1678                 }
1679
1680                 /* strictly speaking this can never be true; but might
1681                  * be if we ever allow intuit without substrings */
1682                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1683                     goto fail;
1684
1685                 rx_origin = rx_max_float;
1686             }
1687
1688             /* at this point, any matching substrings have been
1689              * contradicted. Start again... */
1690
1691             rx_origin = HOP3c(rx_origin, 1, strend);
1692
1693             /* uses bytes rather than char calculations for efficiency.
1694              * It's conservative: it errs on the side of doing 'goto restart',
1695              * where there is code that does a proper char-based test */
1696             if (rx_origin + start_shift + end_shift > strend) {
1697                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1698                                        "  Could not match STCLASS...\n") );
1699                 goto fail;
1700             }
1701             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1702                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1703                 (prog->substrs->check_ix ? "floating" : "anchored"),
1704                 (long)(rx_origin + start_shift - strbeg),
1705                 (IV)(rx_origin - strbeg)
1706             ));
1707             goto restart;
1708         }
1709
1710         /* Success !!! */
1711
1712         if (rx_origin != s) {
1713             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1714                         "  By STCLASS: moving %ld --> %ld\n",
1715                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1716                    );
1717         }
1718         else {
1719             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1720                                   "  Does not contradict STCLASS...\n");
1721                    );
1722         }
1723     }
1724
1725     /* Decide whether using the substrings helped */
1726
1727     if (rx_origin != strpos) {
1728         /* Fixed substring is found far enough so that the match
1729            cannot start at strpos. */
1730
1731         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1732         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1733     }
1734     else {
1735         /* The found rx_origin position does not prohibit matching at
1736          * strpos, so calling intuit didn't gain us anything. Decrement
1737          * the BmUSEFUL() count on the check substring, and if we reach
1738          * zero, free it.  */
1739         if (!(prog->intflags & PREGf_NAUGHTY)
1740             && (utf8_target ? (
1741                 prog->check_utf8                /* Could be deleted already */
1742                 && --BmUSEFUL(prog->check_utf8) < 0
1743                 && (prog->check_utf8 == prog->float_utf8)
1744             ) : (
1745                 prog->check_substr              /* Could be deleted already */
1746                 && --BmUSEFUL(prog->check_substr) < 0
1747                 && (prog->check_substr == prog->float_substr)
1748             )))
1749         {
1750             /* If flags & SOMETHING - do not do it many times on the same match */
1751             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1752             /* XXX Does the destruction order has to change with utf8_target? */
1753             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1754             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1755             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1756             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1757             check = NULL;                       /* abort */
1758             /* XXXX This is a remnant of the old implementation.  It
1759                     looks wasteful, since now INTUIT can use many
1760                     other heuristics. */
1761             prog->extflags &= ~RXf_USE_INTUIT;
1762         }
1763     }
1764
1765     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1766             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1767              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1768
1769     return rx_origin;
1770
1771   fail_finish:                          /* Substring not found */
1772     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1773         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1774   fail:
1775     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1776                           PL_colors[4], PL_colors[5]));
1777     return NULL;
1778 }
1779
1780
1781 #define DECL_TRIE_TYPE(scan) \
1782     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1783                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1784                  trie_utf8l, trie_flu8 }                                            \
1785                     trie_type = ((scan->flags == EXACT)                             \
1786                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1787                                  : (scan->flags == EXACTL)                          \
1788                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1789                                     : (scan->flags == EXACTFAA)                     \
1790                                       ? (utf8_target                                \
1791                                          ? trie_utf8_exactfa_fold                   \
1792                                          : trie_latin_utf8_exactfa_fold)            \
1793                                       : (scan->flags == EXACTFLU8                   \
1794                                          ? trie_flu8                                \
1795                                          : (utf8_target                             \
1796                                            ? trie_utf8_fold                         \
1797                                            :   trie_latin_utf8_fold)))
1798
1799 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1800 STMT_START {                                                                        \
1801     STRLEN skiplen;                                                                 \
1802     U8 flags = FOLD_FLAGS_FULL;                                                     \
1803     switch (trie_type) {                                                            \
1804     case trie_flu8:                                                                 \
1805         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1806         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1807             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1808         }                                                                           \
1809         goto do_trie_utf8_fold;                                                     \
1810     case trie_utf8_exactfa_fold:                                                    \
1811         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1812         /* FALLTHROUGH */                                                           \
1813     case trie_utf8_fold:                                                            \
1814       do_trie_utf8_fold:                                                            \
1815         if ( foldlen>0 ) {                                                          \
1816             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1817             foldlen -= len;                                                         \
1818             uscan += len;                                                           \
1819             len=0;                                                                  \
1820         } else {                                                                    \
1821             len = UTF8SKIP(uc);                                                     \
1822             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen,  \
1823                                                                             flags); \
1824             skiplen = UVCHR_SKIP( uvc );                                            \
1825             foldlen -= skiplen;                                                     \
1826             uscan = foldbuf + skiplen;                                              \
1827         }                                                                           \
1828         break;                                                                      \
1829     case trie_latin_utf8_exactfa_fold:                                              \
1830         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1831         /* FALLTHROUGH */                                                           \
1832     case trie_latin_utf8_fold:                                                      \
1833         if ( foldlen>0 ) {                                                          \
1834             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1835             foldlen -= len;                                                         \
1836             uscan += len;                                                           \
1837             len=0;                                                                  \
1838         } else {                                                                    \
1839             len = 1;                                                                \
1840             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1841             skiplen = UVCHR_SKIP( uvc );                                            \
1842             foldlen -= skiplen;                                                     \
1843             uscan = foldbuf + skiplen;                                              \
1844         }                                                                           \
1845         break;                                                                      \
1846     case trie_utf8l:                                                                \
1847         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1848         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1849             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1850         }                                                                           \
1851         /* FALLTHROUGH */                                                           \
1852     case trie_utf8:                                                                 \
1853         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1854         break;                                                                      \
1855     case trie_plain:                                                                \
1856         uvc = (UV)*uc;                                                              \
1857         len = 1;                                                                    \
1858     }                                                                               \
1859     if (uvc < 256) {                                                                \
1860         charid = trie->charmap[ uvc ];                                              \
1861     }                                                                               \
1862     else {                                                                          \
1863         charid = 0;                                                                 \
1864         if (widecharmap) {                                                          \
1865             SV** const svpp = hv_fetch(widecharmap,                                 \
1866                         (char*)&uvc, sizeof(UV), 0);                                \
1867             if (svpp)                                                               \
1868                 charid = (U16)SvIV(*svpp);                                          \
1869         }                                                                           \
1870     }                                                                               \
1871 } STMT_END
1872
1873 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1874     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1875                 startpos, doutf8, depth)
1876
1877 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1878     STMT_START {                                            \
1879         while (s < strend) {                                \
1880             CODE                                            \
1881             s += ((UTF8) ? UTF8SKIP(s) : 1);                \
1882         }                                                   \
1883     } STMT_END
1884
1885 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1886     STMT_START {                                            \
1887         while (s < strend) {                                \
1888             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1889         }                                                   \
1890     } STMT_END
1891
1892 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1893     if (COND) {                                                \
1894         FBC_CHECK_AND_TRY                                      \
1895         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1896         previous_occurrence_end = s;                           \
1897     }                                                          \
1898     else {                                                     \
1899         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1900     }
1901
1902 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1903     if (utf8_target) {                                         \
1904         REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1905     }                                                          \
1906     else {                                                     \
1907         REXEC_FBC_CLASS_SCAN(0, COND);                         \
1908     }
1909
1910 /* We keep track of where the next character should start after an occurrence
1911  * of the one we're looking for.  Knowing that, we can see right away if the
1912  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1913  * don't accept the 2nd and succeeding adjacent occurrences */
1914 #define FBC_CHECK_AND_TRY                                      \
1915         if (   (   doevery                                     \
1916                 || s != previous_occurrence_end)               \
1917             && (reginfo->intuit || regtry(reginfo, &s)))       \
1918         {                                                      \
1919             goto got_it;                                       \
1920         }
1921
1922
1923 /* This differs from the above macros in that it calls a function which returns
1924  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1925  * there is no such occurrence. */
1926 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1927     while (s < strend) {                                    \
1928         s = (f);                                            \
1929         if (s >= strend) {                                  \
1930             break;                                          \
1931         }                                                   \
1932                                                             \
1933         FBC_CHECK_AND_TRY                                   \
1934         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1935         previous_occurrence_end = s;                        \
1936     }
1937
1938 /* The three macros below are slightly different versions of the same logic.
1939  *
1940  * The first is for /a and /aa when the target string is UTF-8.  This can only
1941  * match ascii, but it must advance based on UTF-8.   The other two handle the
1942  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1943  * for the boundary (or non-boundary) between a word and non-word character.
1944  * The utf8 and non-utf8 cases have the same logic, but the details must be
1945  * different.  Find the "wordness" of the character just prior to this one, and
1946  * compare it with the wordness of this one.  If they differ, we have a
1947  * boundary.  At the beginning of the string, pretend that the previous
1948  * character was a new-line.
1949  *
1950  * All these macros uncleanly have side-effects with each other and outside
1951  * variables.  So far it's been too much trouble to clean-up
1952  *
1953  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1954  *               a word character or not.
1955  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1956  *               word/non-word
1957  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1958  *
1959  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1960  * are looking for a boundary or for a non-boundary.  If we are looking for a
1961  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1962  * see if this tentative match actually works, and if so, to quit the loop
1963  * here.  And vice-versa if we are looking for a non-boundary.
1964  *
1965  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1966  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1967  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1968  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1969  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1970  * complement.  But in that branch we complement tmp, meaning that at the
1971  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1972  * which means at the top of the loop in the next iteration, it is
1973  * TEST_NON_UTF8(s-1) */
1974 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1975     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1976     tmp = TEST_NON_UTF8(tmp);                                                  \
1977     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1978         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1979             tmp = !tmp;                                                        \
1980             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1981         }                                                                      \
1982         else {                                                                 \
1983             IF_FAIL;                                                           \
1984         }                                                                      \
1985     );                                                                         \
1986
1987 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1988  * TEST_UTF8 is a macro that for the same input code points returns identically
1989  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1990 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1991     if (s == reginfo->strbeg) {                                                \
1992         tmp = '\n';                                                            \
1993     }                                                                          \
1994     else { /* Back-up to the start of the previous character */                \
1995         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1996         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1997                                                        0, UTF8_ALLOW_DEFAULT); \
1998     }                                                                          \
1999     tmp = TEST_UV(tmp);                                                        \
2000     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
2001         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
2002             tmp = !tmp;                                                        \
2003             IF_SUCCESS;                                                        \
2004         }                                                                      \
2005         else {                                                                 \
2006             IF_FAIL;                                                           \
2007         }                                                                      \
2008     );
2009
2010 /* Like the above two macros.  UTF8_CODE is the complete code for handling
2011  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2012  * macros below */
2013 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
2014     if (utf8_target) {                                                         \
2015         UTF8_CODE                                                              \
2016     }                                                                          \
2017     else {  /* Not utf8 */                                                     \
2018         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
2019         tmp = TEST_NON_UTF8(tmp);                                              \
2020         REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
2021             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
2022                 IF_SUCCESS;                                                    \
2023                 tmp = !tmp;                                                    \
2024             }                                                                  \
2025             else {                                                             \
2026                 IF_FAIL;                                                       \
2027             }                                                                  \
2028         );                                                                     \
2029     }                                                                          \
2030     /* Here, things have been set up by the previous code so that tmp is the   \
2031      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
2032      * utf8ness of the target).  We also have to check if this matches against \
2033      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
2034      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
2035      * string */                                                               \
2036     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2037         IF_SUCCESS;                                                            \
2038     }                                                                          \
2039     else {                                                                     \
2040         IF_FAIL;                                                               \
2041     }
2042
2043 /* This is the macro to use when we want to see if something that looks like it
2044  * could match, actually does, and if so exits the loop */
2045 #define REXEC_FBC_TRYIT                            \
2046     if ((reginfo->intuit || regtry(reginfo, &s)))  \
2047         goto got_it
2048
2049 /* The only difference between the BOUND and NBOUND cases is that
2050  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2051  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2052  * with the other one being empty (PLACEHOLDER is defined as empty).
2053  *
2054  * The TEST_FOO parameters are for operating on different forms of input, but
2055  * all should be ones that return identically for the same underlying code
2056  * points */
2057 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
2058     FBC_BOUND_COMMON(                                                          \
2059           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
2060           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2061
2062 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2063     FBC_BOUND_COMMON(                                                          \
2064             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2065             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2066
2067 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2068     FBC_BOUND_COMMON(                                                          \
2069           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2070           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2071
2072 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2073     FBC_BOUND_COMMON(                                                          \
2074             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2075             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2076
2077 #ifdef DEBUGGING
2078 static IV
2079 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2080   IV cp_out = Perl__invlist_search(invlist, cp_in);
2081   assert(cp_out >= 0);
2082   return cp_out;
2083 }
2084 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2085         invmap[S_get_break_val_cp_checked(invlist, cp)]
2086 #else
2087 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2088         invmap[_invlist_search(invlist, cp)]
2089 #endif
2090
2091 /* Takes a pointer to an inversion list, a pointer to its corresponding
2092  * inversion map, and a code point, and returns the code point's value
2093  * according to the two arrays.  It assumes that all code points have a value.
2094  * This is used as the base macro for macros for particular properties */
2095 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2096         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2097
2098 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2099  * of a code point, returning the value for the first code point in the string.
2100  * And it takes the particular macro name that finds the desired value given a
2101  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2102 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2103              (__ASSERT_(pos < strend)                                          \
2104                  /* Note assumes is valid UTF-8 */                             \
2105              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2106
2107 /* Returns the GCB value for the input code point */
2108 #define getGCB_VAL_CP(cp)                                                      \
2109           _generic_GET_BREAK_VAL_CP(                                           \
2110                                     PL_GCB_invlist,                            \
2111                                     _Perl_GCB_invmap,                          \
2112                                     (cp))
2113
2114 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2115  * bounded by pos and strend */
2116 #define getGCB_VAL_UTF8(pos, strend)                                           \
2117     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2118
2119 /* Returns the LB value for the input code point */
2120 #define getLB_VAL_CP(cp)                                                       \
2121           _generic_GET_BREAK_VAL_CP(                                           \
2122                                     PL_LB_invlist,                             \
2123                                     _Perl_LB_invmap,                           \
2124                                     (cp))
2125
2126 /* Returns the LB value for the first code point in the UTF-8 encoded string
2127  * bounded by pos and strend */
2128 #define getLB_VAL_UTF8(pos, strend)                                            \
2129     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2130
2131
2132 /* Returns the SB value for the input code point */
2133 #define getSB_VAL_CP(cp)                                                       \
2134           _generic_GET_BREAK_VAL_CP(                                           \
2135                                     PL_SB_invlist,                             \
2136                                     _Perl_SB_invmap,                     \
2137                                     (cp))
2138
2139 /* Returns the SB value for the first code point in the UTF-8 encoded string
2140  * bounded by pos and strend */
2141 #define getSB_VAL_UTF8(pos, strend)                                            \
2142     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2143
2144 /* Returns the WB value for the input code point */
2145 #define getWB_VAL_CP(cp)                                                       \
2146           _generic_GET_BREAK_VAL_CP(                                           \
2147                                     PL_WB_invlist,                             \
2148                                     _Perl_WB_invmap,                         \
2149                                     (cp))
2150
2151 /* Returns the WB value for the first code point in the UTF-8 encoded string
2152  * bounded by pos and strend */
2153 #define getWB_VAL_UTF8(pos, strend)                                            \
2154     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2155
2156 /* We know what class REx starts with.  Try to find this position... */
2157 /* if reginfo->intuit, its a dryrun */
2158 /* annoyingly all the vars in this routine have different names from their counterparts
2159    in regmatch. /grrr */
2160 STATIC char *
2161 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2162     const char *strend, regmatch_info *reginfo)
2163 {
2164     dVAR;
2165
2166     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2167     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2168
2169     char *pat_string;   /* The pattern's exactish string */
2170     char *pat_end;          /* ptr to end char of pat_string */
2171     re_fold_t folder;   /* Function for computing non-utf8 folds */
2172     const U8 *fold_array;   /* array for folding ords < 256 */
2173     STRLEN ln;
2174     STRLEN lnc;
2175     U8 c1;
2176     U8 c2;
2177     char *e;
2178
2179     /* In some cases we accept only the first occurence of 'x' in a sequence of
2180      * them.  This variable points to just beyond the end of the previous
2181      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2182      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2183      * hop back.) */
2184     char * previous_occurrence_end = 0;
2185
2186     I32 tmp;            /* Scratch variable */
2187     const bool utf8_target = reginfo->is_utf8_target;
2188     UV utf8_fold_flags = 0;
2189     const bool is_utf8_pat = reginfo->is_utf8_pat;
2190     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2191                                    with a result inverts that result, as 0^1 =
2192                                    1 and 1^1 = 0 */
2193     _char_class_number classnum;
2194
2195     RXi_GET_DECL(prog,progi);
2196
2197     PERL_ARGS_ASSERT_FIND_BYCLASS;
2198
2199     /* We know what class it must start with. */
2200     switch (OP(c)) {
2201     case ANYOFL:
2202         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2203
2204         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2205             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2206         }
2207
2208         /* FALLTHROUGH */
2209     case ANYOFD:
2210     case ANYOF:
2211         if (utf8_target) {
2212             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2213                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2214         }
2215         else if (ANYOF_FLAGS(c)) {
2216             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2217         }
2218         else {
2219             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2220         }
2221         break;
2222
2223     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2224         /* UTF-8ness doesn't matter, so use 0 */
2225         REXEC_FBC_FIND_NEXT_SCAN(0,
2226          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2227                                    (U8) ARG(c), FLAGS(c)));
2228         break;
2229
2230     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2231         assert(! is_utf8_pat);
2232         /* FALLTHROUGH */
2233     case EXACTFAA:
2234         if (is_utf8_pat || utf8_target) {
2235             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2236             goto do_exactf_utf8;
2237         }
2238         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
2239         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
2240         goto do_exactf_non_utf8;        /* isn't dealt with by these */
2241
2242     case EXACTF:   /* This node only generated for non-utf8 patterns */
2243         assert(! is_utf8_pat);
2244         if (utf8_target) {
2245             utf8_fold_flags = 0;
2246             goto do_exactf_utf8;
2247         }
2248         fold_array = PL_fold;
2249         folder = foldEQ;
2250         goto do_exactf_non_utf8;
2251
2252     case EXACTFL:
2253         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2254         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2255             utf8_fold_flags = FOLDEQ_LOCALE;
2256             goto do_exactf_utf8;
2257         }
2258         fold_array = PL_fold_locale;
2259         folder = foldEQ_locale;
2260         goto do_exactf_non_utf8;
2261
2262     case EXACTFU_SS:
2263         if (is_utf8_pat) {
2264             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2265         }
2266         goto do_exactf_utf8;
2267
2268     case EXACTFLU8:
2269             if (! utf8_target) {    /* All code points in this node require
2270                                        UTF-8 to express.  */
2271                 break;
2272             }
2273             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2274                                              | FOLDEQ_S2_FOLDS_SANE;
2275             goto do_exactf_utf8;
2276
2277     case EXACTFU:
2278         if (is_utf8_pat || utf8_target) {
2279             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2280             goto do_exactf_utf8;
2281         }
2282
2283         /* Any 'ss' in the pattern should have been replaced by regcomp,
2284          * so we don't have to worry here about this single special case
2285          * in the Latin1 range */
2286         fold_array = PL_fold_latin1;
2287         folder = foldEQ_latin1;
2288
2289         /* FALLTHROUGH */
2290
2291       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2292                            are no glitches with fold-length differences
2293                            between the target string and pattern */
2294
2295         /* The idea in the non-utf8 EXACTF* cases is to first find the
2296          * first character of the EXACTF* node and then, if necessary,
2297          * case-insensitively compare the full text of the node.  c1 is the
2298          * first character.  c2 is its fold.  This logic will not work for
2299          * Unicode semantics and the german sharp ss, which hence should
2300          * not be compiled into a node that gets here. */
2301         pat_string = STRING(c);
2302         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2303
2304         /* We know that we have to match at least 'ln' bytes (which is the
2305          * same as characters, since not utf8).  If we have to match 3
2306          * characters, and there are only 2 availabe, we know without
2307          * trying that it will fail; so don't start a match past the
2308          * required minimum number from the far end */
2309         e = HOP3c(strend, -((SSize_t)ln), s);
2310         if (e < s)
2311             break;
2312
2313         c1 = *pat_string;
2314         c2 = fold_array[c1];
2315         if (c1 == c2) { /* If char and fold are the same */
2316             while (s <= e) {
2317                 s = (char *) memchr(s, c1, e + 1 - s);
2318                 if (s == NULL) {
2319                     break;
2320                 }
2321
2322                 /* Check that the rest of the node matches */
2323                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2324                     && (reginfo->intuit || regtry(reginfo, &s)) )
2325                 {
2326                     goto got_it;
2327                 }
2328                 s++;
2329             }
2330         }
2331         else {
2332             U8 bits_differing = c1 ^ c2;
2333
2334             /* If the folds differ in one bit position only, we can mask to
2335              * match either of them, and can use this faster find method.  Both
2336              * ASCII and EBCDIC tend to have their case folds differ in only
2337              * one position, so this is very likely */
2338             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2339                 bits_differing = ~ bits_differing;
2340                 while (s <= e) {
2341                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2342                                         (c1 & bits_differing), bits_differing);
2343                     if (s > e) {
2344                         break;
2345                     }
2346
2347                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2348                         && (reginfo->intuit || regtry(reginfo, &s)) )
2349                     {
2350                         goto got_it;
2351                     }
2352                     s++;
2353                 }
2354             }
2355             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2356                        should actually happen only in EXACTFL nodes */
2357                 while (s <= e) {
2358                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2359                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2360                         && (reginfo->intuit || regtry(reginfo, &s)) )
2361                     {
2362                         goto got_it;
2363                     }
2364                     s++;
2365                 }
2366             }
2367         }
2368         break;
2369
2370       do_exactf_utf8:
2371       {
2372         unsigned expansion;
2373
2374         /* If one of the operands is in utf8, we can't use the simpler folding
2375          * above, due to the fact that many different characters can have the
2376          * same fold, or portion of a fold, or different- length fold */
2377         pat_string = STRING(c);
2378         ln  = STR_LEN(c);       /* length to match in octets/bytes */
2379         pat_end = pat_string + ln;
2380         lnc = is_utf8_pat       /* length to match in characters */
2381                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2382                 : ln;
2383
2384         /* We have 'lnc' characters to match in the pattern, but because of
2385          * multi-character folding, each character in the target can match
2386          * up to 3 characters (Unicode guarantees it will never exceed
2387          * this) if it is utf8-encoded; and up to 2 if not (based on the
2388          * fact that the Latin 1 folds are already determined, and the
2389          * only multi-char fold in that range is the sharp-s folding to
2390          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2391          * string character.  Adjust lnc accordingly, rounding up, so that
2392          * if we need to match at least 4+1/3 chars, that really is 5. */
2393         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2394         lnc = (lnc + expansion - 1) / expansion;
2395
2396         /* As in the non-UTF8 case, if we have to match 3 characters, and
2397          * only 2 are left, it's guaranteed to fail, so don't start a
2398          * match that would require us to go beyond the end of the string
2399          */
2400         e = HOP3c(strend, -((SSize_t)lnc), s);
2401
2402         /* XXX Note that we could recalculate e to stop the loop earlier,
2403          * as the worst case expansion above will rarely be met, and as we
2404          * go along we would usually find that e moves further to the left.
2405          * This would happen only after we reached the point in the loop
2406          * where if there were no expansion we should fail.  Unclear if
2407          * worth the expense */
2408
2409         while (s <= e) {
2410             char *my_strend= (char *)strend;
2411             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2412                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2413                 && (reginfo->intuit || regtry(reginfo, &s)) )
2414             {
2415                 goto got_it;
2416             }
2417             s += (utf8_target) ? UTF8SKIP(s) : 1;
2418         }
2419         break;
2420     }
2421
2422     case BOUNDL:
2423         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2424         if (FLAGS(c) != TRADITIONAL_BOUND) {
2425             if (! IN_UTF8_CTYPE_LOCALE) {
2426                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2427                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2428             }
2429             goto do_boundu;
2430         }
2431
2432         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2433         break;
2434
2435     case NBOUNDL:
2436         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2437         if (FLAGS(c) != TRADITIONAL_BOUND) {
2438             if (! IN_UTF8_CTYPE_LOCALE) {
2439                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2440                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2441             }
2442             goto do_nboundu;
2443         }
2444
2445         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2446         break;
2447
2448     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2449                    meaning */
2450         assert(FLAGS(c) == TRADITIONAL_BOUND);
2451
2452         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2453         break;
2454
2455     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2456                    meaning */
2457         assert(FLAGS(c) == TRADITIONAL_BOUND);
2458
2459         FBC_BOUND_A(isWORDCHAR_A);
2460         break;
2461
2462     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2463                    meaning */
2464         assert(FLAGS(c) == TRADITIONAL_BOUND);
2465
2466         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2467         break;
2468
2469     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2470                    meaning */
2471         assert(FLAGS(c) == TRADITIONAL_BOUND);
2472
2473         FBC_NBOUND_A(isWORDCHAR_A);
2474         break;
2475
2476     case NBOUNDU:
2477         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2478             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2479             break;
2480         }
2481
2482       do_nboundu:
2483
2484         to_complement = 1;
2485         /* FALLTHROUGH */
2486
2487     case BOUNDU:
2488       do_boundu:
2489         switch((bound_type) FLAGS(c)) {
2490             case TRADITIONAL_BOUND:
2491                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2492                 break;
2493             case GCB_BOUND:
2494                 if (s == reginfo->strbeg) {
2495                     if (reginfo->intuit || regtry(reginfo, &s))
2496                     {
2497                         goto got_it;
2498                     }
2499
2500                     /* Didn't match.  Try at the next position (if there is one) */
2501                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2502                     if (UNLIKELY(s >= reginfo->strend)) {
2503                         break;
2504                     }
2505                 }
2506
2507                 if (utf8_target) {
2508                     GCB_enum before = getGCB_VAL_UTF8(
2509                                                reghop3((U8*)s, -1,
2510                                                        (U8*)(reginfo->strbeg)),
2511                                                (U8*) reginfo->strend);
2512                     while (s < strend) {
2513                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2514                                                         (U8*) reginfo->strend);
2515                         if (   (to_complement ^ isGCB(before,
2516                                                       after,
2517                                                       (U8*) reginfo->strbeg,
2518                                                       (U8*) s,
2519                                                       utf8_target))
2520                             && (reginfo->intuit || regtry(reginfo, &s)))
2521                         {
2522                             goto got_it;
2523                         }
2524                         before = after;
2525                         s += UTF8SKIP(s);
2526                     }
2527                 }
2528                 else {  /* Not utf8.  Everything is a GCB except between CR and
2529                            LF */
2530                     while (s < strend) {
2531                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2532                                               || UCHARAT(s) != '\n'))
2533                             && (reginfo->intuit || regtry(reginfo, &s)))
2534                         {
2535                             goto got_it;
2536                         }
2537                         s++;
2538                     }
2539                 }
2540
2541                 /* And, since this is a bound, it can match after the final
2542                  * character in the string */
2543                 if ((reginfo->intuit || regtry(reginfo, &s))) {
2544                     goto got_it;
2545                 }
2546                 break;
2547
2548             case LB_BOUND:
2549                 if (s == reginfo->strbeg) {
2550                     if (reginfo->intuit || regtry(reginfo, &s)) {
2551                         goto got_it;
2552                     }
2553                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2554                     if (UNLIKELY(s >= reginfo->strend)) {
2555                         break;
2556                     }
2557                 }
2558
2559                 if (utf8_target) {
2560                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2561                                                                -1,
2562                                                                (U8*)(reginfo->strbeg)),
2563                                                        (U8*) reginfo->strend);
2564                     while (s < strend) {
2565                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2566                         if (to_complement ^ isLB(before,
2567                                                  after,
2568                                                  (U8*) reginfo->strbeg,
2569                                                  (U8*) s,
2570                                                  (U8*) reginfo->strend,
2571                                                  utf8_target)
2572                             && (reginfo->intuit || regtry(reginfo, &s)))
2573                         {
2574                             goto got_it;
2575                         }
2576                         before = after;
2577                         s += UTF8SKIP(s);
2578                     }
2579                 }
2580                 else {  /* Not utf8. */
2581                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2582                     while (s < strend) {
2583                         LB_enum after = getLB_VAL_CP((U8) *s);
2584                         if (to_complement ^ isLB(before,
2585                                                  after,
2586                                                  (U8*) reginfo->strbeg,
2587                                                  (U8*) s,
2588                                                  (U8*) reginfo->strend,
2589                                                  utf8_target)
2590                             && (reginfo->intuit || regtry(reginfo, &s)))
2591                         {
2592                             goto got_it;
2593                         }
2594                         before = after;
2595                         s++;
2596                     }
2597                 }
2598
2599                 if (reginfo->intuit || regtry(reginfo, &s)) {
2600                     goto got_it;
2601                 }
2602
2603                 break;
2604
2605             case SB_BOUND:
2606                 if (s == reginfo->strbeg) {
2607                     if (reginfo->intuit || regtry(reginfo, &s)) {
2608                         goto got_it;
2609                     }
2610                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2611                     if (UNLIKELY(s >= reginfo->strend)) {
2612                         break;
2613                     }
2614                 }
2615
2616                 if (utf8_target) {
2617                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2618                                                         -1,
2619                                                         (U8*)(reginfo->strbeg)),
2620                                                       (U8*) reginfo->strend);
2621                     while (s < strend) {
2622                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2623                                                          (U8*) reginfo->strend);
2624                         if ((to_complement ^ isSB(before,
2625                                                   after,
2626                                                   (U8*) reginfo->strbeg,
2627                                                   (U8*) s,
2628                                                   (U8*) reginfo->strend,
2629                                                   utf8_target))
2630                             && (reginfo->intuit || regtry(reginfo, &s)))
2631                         {
2632                             goto got_it;
2633                         }
2634                         before = after;
2635                         s += UTF8SKIP(s);
2636                     }
2637                 }
2638                 else {  /* Not utf8. */
2639                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2640                     while (s < strend) {
2641                         SB_enum after = getSB_VAL_CP((U8) *s);
2642                         if ((to_complement ^ isSB(before,
2643                                                   after,
2644                                                   (U8*) reginfo->strbeg,
2645                                                   (U8*) s,
2646                                                   (U8*) reginfo->strend,
2647                                                   utf8_target))
2648                             && (reginfo->intuit || regtry(reginfo, &s)))
2649                         {
2650                             goto got_it;
2651                         }
2652                         before = after;
2653                         s++;
2654                     }
2655                 }
2656
2657                 /* Here are at the final position in the target string.  The SB
2658                  * value is always true here, so matches, depending on other
2659                  * constraints */
2660                 if (reginfo->intuit || regtry(reginfo, &s)) {
2661                     goto got_it;
2662                 }
2663
2664                 break;
2665
2666             case WB_BOUND:
2667                 if (s == reginfo->strbeg) {
2668                     if (reginfo->intuit || regtry(reginfo, &s)) {
2669                         goto got_it;
2670                     }
2671                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2672                     if (UNLIKELY(s >= reginfo->strend)) {
2673                         break;
2674                     }
2675                 }
2676
2677                 if (utf8_target) {
2678                     /* We are at a boundary between char_sub_0 and char_sub_1.
2679                      * We also keep track of the value for char_sub_-1 as we
2680                      * loop through the line.   Context may be needed to make a
2681                      * determination, and if so, this can save having to
2682                      * recalculate it */
2683                     WB_enum previous = WB_UNKNOWN;
2684                     WB_enum before = getWB_VAL_UTF8(
2685                                               reghop3((U8*)s,
2686                                                       -1,
2687                                                       (U8*)(reginfo->strbeg)),
2688                                               (U8*) reginfo->strend);
2689                     while (s < strend) {
2690                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2691                                                         (U8*) reginfo->strend);
2692                         if ((to_complement ^ isWB(previous,
2693                                                   before,
2694                                                   after,
2695                                                   (U8*) reginfo->strbeg,
2696                                                   (U8*) s,
2697                                                   (U8*) reginfo->strend,
2698                                                   utf8_target))
2699                             && (reginfo->intuit || regtry(reginfo, &s)))
2700                         {
2701                             goto got_it;
2702                         }
2703                         previous = before;
2704                         before = after;
2705                         s += UTF8SKIP(s);
2706                     }
2707                 }
2708                 else {  /* Not utf8. */
2709                     WB_enum previous = WB_UNKNOWN;
2710                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2711                     while (s < strend) {
2712                         WB_enum after = getWB_VAL_CP((U8) *s);
2713                         if ((to_complement ^ isWB(previous,
2714                                                   before,
2715                                                   after,
2716                                                   (U8*) reginfo->strbeg,
2717                                                   (U8*) s,
2718                                                   (U8*) reginfo->strend,
2719                                                   utf8_target))
2720                             && (reginfo->intuit || regtry(reginfo, &s)))
2721                         {
2722                             goto got_it;
2723                         }
2724                         previous = before;
2725                         before = after;
2726                         s++;
2727                     }
2728                 }
2729
2730                 if (reginfo->intuit || regtry(reginfo, &s)) {
2731                     goto got_it;
2732                 }
2733         }
2734         break;
2735
2736     case LNBREAK:
2737         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2738                         is_LNBREAK_latin1_safe(s, strend)
2739         );
2740         break;
2741
2742     case ASCII:
2743         REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2744         break;
2745
2746     case NASCII:
2747         if (utf8_target) {
2748             REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2749                                                             utf8_target));
2750         }
2751         else {
2752             REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2753                                                             utf8_target));
2754         }
2755
2756         break;
2757
2758     /* The argument to all the POSIX node types is the class number to pass to
2759      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2760
2761     case NPOSIXL:
2762         to_complement = 1;
2763         /* FALLTHROUGH */
2764
2765     case POSIXL:
2766         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2767         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2768                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2769         break;
2770
2771     case NPOSIXD:
2772         to_complement = 1;
2773         /* FALLTHROUGH */
2774
2775     case POSIXD:
2776         if (utf8_target) {
2777             goto posix_utf8;
2778         }
2779         goto posixa;
2780
2781     case NPOSIXA:
2782         if (utf8_target) {
2783             /* The complement of something that matches only ASCII matches all
2784              * non-ASCII, plus everything in ASCII that isn't in the class. */
2785             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2786                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2787             break;
2788         }
2789
2790         to_complement = 1;
2791         goto posixa;
2792
2793     case POSIXA:
2794         /* Don't need to worry about utf8, as it can match only a single
2795          * byte invariant character.  But we do anyway for performance reasons,
2796          * as otherwise we would have to examine all the continuation
2797          * characters */
2798         if (utf8_target) {
2799             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2800             break;
2801         }
2802
2803       posixa:
2804         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2805                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2806         break;
2807
2808     case NPOSIXU:
2809         to_complement = 1;
2810         /* FALLTHROUGH */
2811
2812     case POSIXU:
2813         if (! utf8_target) {
2814             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2815                                  to_complement ^ cBOOL(_generic_isCC(*s,
2816                                                                     FLAGS(c))));
2817         }
2818         else {
2819
2820           posix_utf8:
2821             classnum = (_char_class_number) FLAGS(c);
2822             switch (classnum) {
2823                 default:
2824                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2825                         to_complement ^ cBOOL(_invlist_contains_cp(
2826                                               PL_XPosix_ptrs[classnum],
2827                                               utf8_to_uvchr_buf((U8 *) s,
2828                                                                 (U8 *) strend,
2829                                                                 NULL))));
2830                     break;
2831                 case _CC_ENUM_SPACE:
2832                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2833                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2834                     break;
2835
2836                 case _CC_ENUM_BLANK:
2837                     REXEC_FBC_CLASS_SCAN(1,
2838                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2839                     break;
2840
2841                 case _CC_ENUM_XDIGIT:
2842                     REXEC_FBC_CLASS_SCAN(1,
2843                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2844                     break;
2845
2846                 case _CC_ENUM_VERTSPACE:
2847                     REXEC_FBC_CLASS_SCAN(1,
2848                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2849                     break;
2850
2851                 case _CC_ENUM_CNTRL:
2852                     REXEC_FBC_CLASS_SCAN(1,
2853                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2854                     break;
2855             }
2856         }
2857         break;
2858
2859     case AHOCORASICKC:
2860     case AHOCORASICK:
2861         {
2862             DECL_TRIE_TYPE(c);
2863             /* what trie are we using right now */
2864             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2865             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2866             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2867
2868             const char *last_start = strend - trie->minlen;
2869 #ifdef DEBUGGING
2870             const char *real_start = s;
2871 #endif
2872             STRLEN maxlen = trie->maxlen;
2873             SV *sv_points;
2874             U8 **points; /* map of where we were in the input string
2875                             when reading a given char. For ASCII this
2876                             is unnecessary overhead as the relationship
2877                             is always 1:1, but for Unicode, especially
2878                             case folded Unicode this is not true. */
2879             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2880             U8 *bitmap=NULL;
2881
2882
2883             GET_RE_DEBUG_FLAGS_DECL;
2884
2885             /* We can't just allocate points here. We need to wrap it in
2886              * an SV so it gets freed properly if there is a croak while
2887              * running the match */
2888             ENTER;
2889             SAVETMPS;
2890             sv_points=newSV(maxlen * sizeof(U8 *));
2891             SvCUR_set(sv_points,
2892                 maxlen * sizeof(U8 *));
2893             SvPOK_on(sv_points);
2894             sv_2mortal(sv_points);
2895             points=(U8**)SvPV_nolen(sv_points );
2896             if ( trie_type != trie_utf8_fold
2897                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2898             {
2899                 if (trie->bitmap)
2900                     bitmap=(U8*)trie->bitmap;
2901                 else
2902                     bitmap=(U8*)ANYOF_BITMAP(c);
2903             }
2904             /* this is the Aho-Corasick algorithm modified a touch
2905                to include special handling for long "unknown char" sequences.
2906                The basic idea being that we use AC as long as we are dealing
2907                with a possible matching char, when we encounter an unknown char
2908                (and we have not encountered an accepting state) we scan forward
2909                until we find a legal starting char.
2910                AC matching is basically that of trie matching, except that when
2911                we encounter a failing transition, we fall back to the current
2912                states "fail state", and try the current char again, a process
2913                we repeat until we reach the root state, state 1, or a legal
2914                transition. If we fail on the root state then we can either
2915                terminate if we have reached an accepting state previously, or
2916                restart the entire process from the beginning if we have not.
2917
2918              */
2919             while (s <= last_start) {
2920                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2921                 U8 *uc = (U8*)s;
2922                 U16 charid = 0;
2923                 U32 base = 1;
2924                 U32 state = 1;
2925                 UV uvc = 0;
2926                 STRLEN len = 0;
2927                 STRLEN foldlen = 0;
2928                 U8 *uscan = (U8*)NULL;
2929                 U8 *leftmost = NULL;
2930 #ifdef DEBUGGING
2931                 U32 accepted_word= 0;
2932 #endif
2933                 U32 pointpos = 0;
2934
2935                 while ( state && uc <= (U8*)strend ) {
2936                     int failed=0;
2937                     U32 word = aho->states[ state ].wordnum;
2938
2939                     if( state==1 ) {
2940                         if ( bitmap ) {
2941                             DEBUG_TRIE_EXECUTE_r(
2942                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2943                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2944                                         (char *)uc, utf8_target, 0 );
2945                                     Perl_re_printf( aTHX_
2946                                         " Scanning for legal start char...\n");
2947                                 }
2948                             );
2949                             if (utf8_target) {
2950                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2951                                     uc += UTF8SKIP(uc);
2952                                 }
2953                             } else {
2954                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2955                                     uc++;
2956                                 }
2957                             }
2958                             s= (char *)uc;
2959                         }
2960                         if (uc >(U8*)last_start) break;
2961                     }
2962
2963                     if ( word ) {
2964                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2965                         if (!leftmost || lpos < leftmost) {
2966                             DEBUG_r(accepted_word=word);
2967                             leftmost= lpos;
2968                         }
2969                         if (base==0) break;
2970
2971                     }
2972                     points[pointpos++ % maxlen]= uc;
2973                     if (foldlen || uc < (U8*)strend) {
2974                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2975                                          widecharmap, uc,
2976                                          uscan, len, uvc, charid, foldlen,
2977                                          foldbuf, uniflags);
2978                         DEBUG_TRIE_EXECUTE_r({
2979                             dump_exec_pos( (char *)uc, c, strend,
2980                                         real_start, s, utf8_target, 0);
2981                             Perl_re_printf( aTHX_
2982                                 " Charid:%3u CP:%4" UVxf " ",
2983                                  charid, uvc);
2984                         });
2985                     }
2986                     else {
2987                         len = 0;
2988                         charid = 0;
2989                     }
2990
2991
2992                     do {
2993 #ifdef DEBUGGING
2994                         word = aho->states[ state ].wordnum;
2995 #endif
2996                         base = aho->states[ state ].trans.base;
2997
2998                         DEBUG_TRIE_EXECUTE_r({
2999                             if (failed)
3000                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3001                                     s,   utf8_target, 0 );
3002                             Perl_re_printf( aTHX_
3003                                 "%sState: %4" UVxf ", word=%" UVxf,
3004                                 failed ? " Fail transition to " : "",
3005                                 (UV)state, (UV)word);
3006                         });
3007                         if ( base ) {
3008                             U32 tmp;
3009                             I32 offset;
3010                             if (charid &&
3011                                  ( ((offset = base + charid
3012                                     - 1 - trie->uniquecharcount)) >= 0)
3013                                  && ((U32)offset < trie->lasttrans)
3014                                  && trie->trans[offset].check == state
3015                                  && (tmp=trie->trans[offset].next))
3016                             {
3017                                 DEBUG_TRIE_EXECUTE_r(
3018                                     Perl_re_printf( aTHX_ " - legal\n"));
3019                                 state = tmp;
3020                                 break;
3021                             }
3022                             else {
3023                                 DEBUG_TRIE_EXECUTE_r(
3024                                     Perl_re_printf( aTHX_ " - fail\n"));
3025                                 failed = 1;
3026                                 state = aho->fail[state];
3027                             }
3028                         }
3029                         else {
3030                             /* we must be accepting here */
3031                             DEBUG_TRIE_EXECUTE_r(
3032                                     Perl_re_printf( aTHX_ " - accepting\n"));
3033                             failed = 1;
3034                             break;
3035                         }
3036                     } while(state);
3037                     uc += len;
3038                     if (failed) {
3039                         if (leftmost)
3040                             break;
3041                         if (!state) state = 1;
3042                     }
3043                 }
3044                 if ( aho->states[ state ].wordnum ) {
3045                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3046                     if (!leftmost || lpos < leftmost) {
3047                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3048                         leftmost = lpos;
3049                     }
3050                 }
3051                 if (leftmost) {
3052                     s = (char*)leftmost;
3053                     DEBUG_TRIE_EXECUTE_r({
3054                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3055                             (UV)accepted_word, (IV)(s - real_start)
3056                         );
3057                     });
3058                     if (reginfo->intuit || regtry(reginfo, &s)) {
3059                         FREETMPS;
3060                         LEAVE;
3061                         goto got_it;
3062                     }
3063                     s = HOPc(s,1);
3064                     DEBUG_TRIE_EXECUTE_r({
3065                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3066                     });
3067                 } else {
3068                     DEBUG_TRIE_EXECUTE_r(
3069                         Perl_re_printf( aTHX_ "No match.\n"));
3070                     break;
3071                 }
3072             }
3073             FREETMPS;
3074             LEAVE;
3075         }
3076         break;
3077     default:
3078         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3079     }
3080     return 0;
3081   got_it:
3082     return s;
3083 }
3084
3085 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3086  * flags have same meanings as with regexec_flags() */
3087
3088 static void
3089 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3090                             char *strbeg,
3091                             char *strend,
3092                             SV *sv,
3093                             U32 flags,
3094                             bool utf8_target)
3095 {
3096     struct regexp *const prog = ReANY(rx);
3097
3098     if (flags & REXEC_COPY_STR) {
3099 #ifdef PERL_ANY_COW
3100         if (SvCANCOW(sv)) {
3101             DEBUG_C(Perl_re_printf( aTHX_
3102                               "Copy on write: regexp capture, type %d\n",
3103                                     (int) SvTYPE(sv)));
3104             /* Create a new COW SV to share the match string and store
3105              * in saved_copy, unless the current COW SV in saved_copy
3106              * is valid and suitable for our purpose */
3107             if ((   prog->saved_copy
3108                  && SvIsCOW(prog->saved_copy)
3109                  && SvPOKp(prog->saved_copy)
3110                  && SvIsCOW(sv)
3111                  && SvPOKp(sv)
3112                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3113             {
3114                 /* just reuse saved_copy SV */
3115                 if (RXp_MATCH_COPIED(prog)) {
3116                     Safefree(prog->subbeg);
3117                     RXp_MATCH_COPIED_off(prog);
3118                 }
3119             }
3120             else {
3121                 /* create new COW SV to share string */
3122                 RXp_MATCH_COPY_FREE(prog);
3123                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3124             }
3125             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3126             assert (SvPOKp(prog->saved_copy));
3127             prog->sublen  = strend - strbeg;
3128             prog->suboffset = 0;
3129             prog->subcoffset = 0;
3130         } else
3131 #endif
3132         {
3133             SSize_t min = 0;
3134             SSize_t max = strend - strbeg;
3135             SSize_t sublen;
3136
3137             if (    (flags & REXEC_COPY_SKIP_POST)
3138                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3139                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3140             ) { /* don't copy $' part of string */
3141                 U32 n = 0;
3142                 max = -1;
3143                 /* calculate the right-most part of the string covered
3144                  * by a capture. Due to lookahead, this may be to
3145                  * the right of $&, so we have to scan all captures */
3146                 while (n <= prog->lastparen) {
3147                     if (prog->offs[n].end > max)
3148                         max = prog->offs[n].end;
3149                     n++;
3150                 }
3151                 if (max == -1)
3152                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3153                             ? prog->offs[0].start
3154                             : 0;
3155                 assert(max >= 0 && max <= strend - strbeg);
3156             }
3157
3158             if (    (flags & REXEC_COPY_SKIP_PRE)
3159                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3160                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3161             ) { /* don't copy $` part of string */
3162                 U32 n = 0;
3163                 min = max;
3164                 /* calculate the left-most part of the string covered
3165                  * by a capture. Due to lookbehind, this may be to
3166                  * the left of $&, so we have to scan all captures */
3167                 while (min && n <= prog->lastparen) {
3168                     if (   prog->offs[n].start != -1
3169                         && prog->offs[n].start < min)
3170                     {
3171                         min = prog->offs[n].start;
3172                     }
3173                     n++;
3174                 }
3175                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3176                     && min >  prog->offs[0].end
3177                 )
3178                     min = prog->offs[0].end;
3179
3180             }
3181
3182             assert(min >= 0 && min <= max && min <= strend - strbeg);
3183             sublen = max - min;
3184
3185             if (RXp_MATCH_COPIED(prog)) {
3186                 if (sublen > prog->sublen)
3187                     prog->subbeg =
3188                             (char*)saferealloc(prog->subbeg, sublen+1);
3189             }
3190             else
3191                 prog->subbeg = (char*)safemalloc(sublen+1);
3192             Copy(strbeg + min, prog->subbeg, sublen, char);
3193             prog->subbeg[sublen] = '\0';
3194             prog->suboffset = min;
3195             prog->sublen = sublen;
3196             RXp_MATCH_COPIED_on(prog);
3197         }
3198         prog->subcoffset = prog->suboffset;
3199         if (prog->suboffset && utf8_target) {
3200             /* Convert byte offset to chars.
3201              * XXX ideally should only compute this if @-/@+
3202              * has been seen, a la PL_sawampersand ??? */
3203
3204             /* If there's a direct correspondence between the
3205              * string which we're matching and the original SV,
3206              * then we can use the utf8 len cache associated with
3207              * the SV. In particular, it means that under //g,
3208              * sv_pos_b2u() will use the previously cached
3209              * position to speed up working out the new length of
3210              * subcoffset, rather than counting from the start of
3211              * the string each time. This stops
3212              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3213              * from going quadratic */
3214             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3215                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3216                                                 SV_GMAGIC|SV_CONST_RETURN);
3217             else
3218                 prog->subcoffset = utf8_length((U8*)strbeg,
3219                                     (U8*)(strbeg+prog->suboffset));
3220         }
3221     }
3222     else {
3223         RXp_MATCH_COPY_FREE(prog);
3224         prog->subbeg = strbeg;
3225         prog->suboffset = 0;
3226         prog->subcoffset = 0;
3227         prog->sublen = strend - strbeg;
3228     }
3229 }
3230
3231
3232
3233
3234 /*
3235  - regexec_flags - match a regexp against a string
3236  */
3237 I32
3238 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3239               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3240 /* stringarg: the point in the string at which to begin matching */
3241 /* strend:    pointer to null at end of string */
3242 /* strbeg:    real beginning of string */
3243 /* minend:    end of match must be >= minend bytes after stringarg. */
3244 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3245  *            itself is accessed via the pointers above */
3246 /* data:      May be used for some additional optimizations.
3247               Currently unused. */
3248 /* flags:     For optimizations. See REXEC_* in regexp.h */
3249
3250 {
3251     struct regexp *const prog = ReANY(rx);
3252     char *s;
3253     regnode *c;
3254     char *startpos;
3255     SSize_t minlen;             /* must match at least this many chars */
3256     SSize_t dontbother = 0;     /* how many characters not to try at end */
3257     const bool utf8_target = cBOOL(DO_UTF8(sv));
3258     I32 multiline;
3259     RXi_GET_DECL(prog,progi);
3260     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3261     regmatch_info *const reginfo = &reginfo_buf;
3262     regexp_paren_pair *swap = NULL;
3263     I32 oldsave;
3264     GET_RE_DEBUG_FLAGS_DECL;
3265
3266     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3267     PERL_UNUSED_ARG(data);
3268
3269     /* Be paranoid... */
3270     if (prog == NULL) {
3271         Perl_croak(aTHX_ "NULL regexp parameter");
3272     }
3273
3274     DEBUG_EXECUTE_r(
3275         debug_start_match(rx, utf8_target, stringarg, strend,
3276         "Matching");
3277     );
3278
3279     startpos = stringarg;
3280
3281     /* set these early as they may be used by the HOP macros below */
3282     reginfo->strbeg = strbeg;
3283     reginfo->strend = strend;
3284     reginfo->is_utf8_target = cBOOL(utf8_target);
3285
3286     if (prog->intflags & PREGf_GPOS_SEEN) {
3287         MAGIC *mg;
3288
3289         /* set reginfo->ganch, the position where \G can match */
3290
3291         reginfo->ganch =
3292             (flags & REXEC_IGNOREPOS)
3293             ? stringarg /* use start pos rather than pos() */
3294             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3295               /* Defined pos(): */
3296             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3297             : strbeg; /* pos() not defined; use start of string */
3298
3299         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3300             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3301
3302         /* in the presence of \G, we may need to start looking earlier in
3303          * the string than the suggested start point of stringarg:
3304          * if prog->gofs is set, then that's a known, fixed minimum
3305          * offset, such as
3306          * /..\G/:   gofs = 2
3307          * /ab|c\G/: gofs = 1
3308          * or if the minimum offset isn't known, then we have to go back
3309          * to the start of the string, e.g. /w+\G/
3310          */
3311
3312         if (prog->intflags & PREGf_ANCH_GPOS) {
3313             if (prog->gofs) {
3314                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3315                 if (!startpos ||
3316                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3317                 {
3318                     DEBUG_r(Perl_re_printf( aTHX_
3319                             "fail: ganch-gofs before earliest possible start\n"));
3320                     return 0;
3321                 }
3322             }
3323             else
3324                 startpos = reginfo->ganch;
3325         }
3326         else if (prog->gofs) {
3327             startpos = HOPBACKc(startpos, prog->gofs);
3328             if (!startpos)
3329                 startpos = strbeg;
3330         }
3331         else if (prog->intflags & PREGf_GPOS_FLOAT)
3332             startpos = strbeg;
3333     }
3334
3335     minlen = prog->minlen;
3336     if ((startpos + minlen) > strend || startpos < strbeg) {
3337         DEBUG_r(Perl_re_printf( aTHX_
3338                     "Regex match can't succeed, so not even tried\n"));
3339         return 0;
3340     }
3341
3342     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3343      * which will call destuctors to reset PL_regmatch_state, free higher
3344      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3345      * regmatch_info_aux_eval */
3346
3347     oldsave = PL_savestack_ix;
3348
3349     s = startpos;
3350
3351     if ((prog->extflags & RXf_USE_INTUIT)
3352         && !(flags & REXEC_CHECKED))
3353     {
3354         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3355                                     flags, NULL);
3356         if (!s)
3357             return 0;
3358
3359         if (prog->extflags & RXf_CHECK_ALL) {
3360             /* we can match based purely on the result of INTUIT.
3361              * Set up captures etc just for $& and $-[0]
3362              * (an intuit-only match wont have $1,$2,..) */
3363             assert(!prog->nparens);
3364
3365             /* s/// doesn't like it if $& is earlier than where we asked it to
3366              * start searching (which can happen on something like /.\G/) */
3367             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3368                     && (s < stringarg))
3369             {
3370                 /* this should only be possible under \G */
3371                 assert(prog->intflags & PREGf_GPOS_SEEN);
3372                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3373                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3374                 goto phooey;
3375             }
3376
3377             /* match via INTUIT shouldn't have any captures.
3378              * Let @-, @+, $^N know */
3379             prog->lastparen = prog->lastcloseparen = 0;
3380             RXp_MATCH_UTF8_set(prog, utf8_target);
3381             prog->offs[0].start = s - strbeg;
3382             prog->offs[0].end = utf8_target
3383                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3384                 : s - strbeg + prog->minlenret;
3385             if ( !(flags & REXEC_NOT_FIRST) )
3386                 S_reg_set_capture_string(aTHX_ rx,
3387                                         strbeg, strend,
3388                                         sv, flags, utf8_target);
3389
3390             return 1;
3391         }
3392     }
3393
3394     multiline = prog->extflags & RXf_PMf_MULTILINE;
3395
3396     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3397         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3398                               "String too short [regexec_flags]...\n"));
3399         goto phooey;
3400     }
3401
3402     /* Check validity of program. */
3403     if (UCHARAT(progi->program) != REG_MAGIC) {
3404         Perl_croak(aTHX_ "corrupted regexp program");
3405     }
3406
3407     RXp_MATCH_TAINTED_off(prog);
3408     RXp_MATCH_UTF8_set(prog, utf8_target);
3409
3410     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
3411     reginfo->intuit = 0;
3412     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3413     reginfo->warned = FALSE;
3414     reginfo->sv = sv;
3415     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3416     /* see how far we have to get to not match where we matched before */
3417     reginfo->till = stringarg + minend;
3418
3419     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3420         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3421            S_cleanup_regmatch_info_aux has executed (registered by
3422            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3423            magic belonging to this SV.
3424            Not newSVsv, either, as it does not COW.
3425         */
3426         reginfo->sv = newSV(0);
3427         SvSetSV_nosteal(reginfo->sv, sv);
3428         SAVEFREESV(reginfo->sv);
3429     }
3430
3431     /* reserve next 2 or 3 slots in PL_regmatch_state:
3432      * slot N+0: may currently be in use: skip it
3433      * slot N+1: use for regmatch_info_aux struct
3434      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3435      * slot N+3: ready for use by regmatch()
3436      */
3437
3438     {
3439         regmatch_state *old_regmatch_state;
3440         regmatch_slab  *old_regmatch_slab;
3441         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3442
3443         /* on first ever match, allocate first slab */
3444         if (!PL_regmatch_slab) {
3445             Newx(PL_regmatch_slab, 1, regmatch_slab);
3446             PL_regmatch_slab->prev = NULL;
3447             PL_regmatch_slab->next = NULL;
3448             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3449         }
3450
3451         old_regmatch_state = PL_regmatch_state;
3452         old_regmatch_slab  = PL_regmatch_slab;
3453
3454         for (i=0; i <= max; i++) {
3455             if (i == 1)
3456                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3457             else if (i ==2)
3458                 reginfo->info_aux_eval =
3459                 reginfo->info_aux->info_aux_eval =
3460                             &(PL_regmatch_state->u.info_aux_eval);
3461
3462             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3463                 PL_regmatch_state = S_push_slab(aTHX);
3464         }
3465
3466         /* note initial PL_regmatch_state position; at end of match we'll
3467          * pop back to there and free any higher slabs */
3468
3469         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3470         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3471         reginfo->info_aux->poscache = NULL;
3472
3473         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3474
3475         if ((prog->extflags & RXf_EVAL_SEEN))
3476             S_setup_eval_state(aTHX_ reginfo);
3477         else
3478             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3479     }
3480
3481     /* If there is a "must appear" string, look for it. */
3482
3483     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3484         /* We have to be careful. If the previous successful match
3485            was from this regex we don't want a subsequent partially
3486            successful match to clobber the old results.
3487            So when we detect this possibility we add a swap buffer
3488            to the re, and switch the buffer each match. If we fail,
3489            we switch it back; otherwise we leave it swapped.
3490         */
3491         swap = prog->offs;
3492         /* do we need a save destructor here for eval dies? */
3493         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3494         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3495             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3496             0,
3497             PTR2UV(prog),
3498             PTR2UV(swap),
3499             PTR2UV(prog->offs)
3500         ));
3501     }
3502
3503     if (prog->recurse_locinput)
3504         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3505
3506     /* Simplest case: anchored match need be tried only once, or with
3507      * MBOL, only at the beginning of each line.
3508      *
3509      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3510      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3511      * match at the start of the string then it won't match anywhere else
3512      * either; while with /.*.../, if it doesn't match at the beginning,
3513      * the earliest it could match is at the start of the next line */
3514
3515     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3516         char *end;
3517
3518         if (regtry(reginfo, &s))
3519             goto got_it;
3520
3521         if (!(prog->intflags & PREGf_ANCH_MBOL))
3522             goto phooey;
3523
3524         /* didn't match at start, try at other newline positions */
3525
3526         if (minlen)
3527             dontbother = minlen - 1;
3528         end = HOP3c(strend, -dontbother, strbeg) - 1;
3529
3530         /* skip to next newline */
3531
3532         while (s <= end) { /* note it could be possible to match at the end of the string */
3533             /* NB: newlines are the same in unicode as they are in latin */
3534             if (*s++ != '\n')
3535                 continue;
3536             if (prog->check_substr || prog->check_utf8) {
3537             /* note that with PREGf_IMPLICIT, intuit can only fail
3538              * or return the start position, so it's of limited utility.
3539              * Nevertheless, I made the decision that the potential for
3540              * quick fail was still worth it - DAPM */
3541                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3542                 if (!s)
3543                     goto phooey;
3544             }
3545             if (regtry(reginfo, &s))
3546                 goto got_it;
3547         }
3548         goto phooey;
3549     } /* end anchored search */
3550
3551     if (prog->intflags & PREGf_ANCH_GPOS)
3552     {
3553         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3554         assert(prog->intflags & PREGf_GPOS_SEEN);
3555         /* For anchored \G, the only position it can match from is
3556          * (ganch-gofs); we already set startpos to this above; if intuit
3557          * moved us on from there, we can't possibly succeed */
3558         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3559         if (s == startpos && regtry(reginfo, &s))
3560             goto got_it;
3561         goto phooey;
3562     }
3563
3564     /* Messy cases:  unanchored match. */
3565     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3566         /* we have /x+whatever/ */
3567         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3568         char ch;
3569 #ifdef DEBUGGING
3570         int did_match = 0;
3571 #endif
3572         if (utf8_target) {
3573             if (! prog->anchored_utf8) {
3574                 to_utf8_substr(prog);
3575             }
3576             ch = SvPVX_const(prog->anchored_utf8)[0];
3577             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3578                 if (*s == ch) {
3579                     DEBUG_EXECUTE_r( did_match = 1 );
3580                     if (regtry(reginfo, &s)) goto got_it;
3581                     s += UTF8SKIP(s);
3582                     while (s < strend && *s == ch)
3583                         s += UTF8SKIP(s);
3584                 }
3585             );
3586
3587         }
3588         else {
3589             if (! prog->anchored_substr) {
3590                 if (! to_byte_substr(prog)) {
3591                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3592                 }
3593             }
3594             ch = SvPVX_const(prog->anchored_substr)[0];
3595             REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3596                 if (*s == ch) {
3597                     DEBUG_EXECUTE_r( did_match = 1 );
3598                     if (regtry(reginfo, &s)) goto got_it;
3599                     s++;
3600                     while (s < strend && *s == ch)
3601                         s++;
3602                 }
3603             );
3604         }
3605         DEBUG_EXECUTE_r(if (!did_match)
3606                 Perl_re_printf( aTHX_
3607                                   "Did not find anchored character...\n")
3608                );
3609     }
3610     else if (prog->anchored_substr != NULL
3611               || prog->anchored_utf8 != NULL
3612               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3613                   && prog->float_max_offset < strend - s)) {
3614         SV *must;
3615         SSize_t back_max;
3616         SSize_t back_min;
3617         char *last;
3618         char *last1;            /* Last position checked before */
3619 #ifdef DEBUGGING
3620         int did_match = 0;
3621 #endif
3622         if (prog->anchored_substr || prog->anchored_utf8) {
3623             if (utf8_target) {
3624                 if (! prog->anchored_utf8) {
3625                     to_utf8_substr(prog);
3626                 }
3627                 must = prog->anchored_utf8;
3628             }
3629             else {
3630                 if (! prog->anchored_substr) {
3631                     if (! to_byte_substr(prog)) {
3632                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3633                     }
3634                 }
3635                 must = prog->anchored_substr;
3636             }
3637             back_max = back_min = prog->anchored_offset;
3638         } else {
3639             if (utf8_target) {
3640                 if (! prog->float_utf8) {
3641                     to_utf8_substr(prog);
3642                 }
3643                 must = prog->float_utf8;
3644             }
3645             else {
3646                 if (! prog->float_substr) {
3647                     if (! to_byte_substr(prog)) {
3648                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3649                     }
3650                 }
3651                 must = prog->float_substr;
3652             }
3653             back_max = prog->float_max_offset;
3654             back_min = prog->float_min_offset;
3655         }
3656
3657         if (back_min<0) {
3658             last = strend;
3659         } else {
3660             last = HOP3c(strend,        /* Cannot start after this */
3661                   -(SSize_t)(CHR_SVLEN(must)
3662                          - (SvTAIL(must) != 0) + back_min), strbeg);
3663         }
3664         if (s > reginfo->strbeg)
3665             last1 = HOPc(s, -1);
3666         else
3667             last1 = s - 1;      /* bogus */
3668
3669         /* XXXX check_substr already used to find "s", can optimize if
3670            check_substr==must. */
3671         dontbother = 0;
3672         strend = HOPc(strend, -dontbother);
3673         while ( (s <= last) &&
3674                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3675                                   (unsigned char*)strend, must,
3676                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3677             DEBUG_EXECUTE_r( did_match = 1 );
3678             if (HOPc(s, -back_max) > last1) {
3679                 last1 = HOPc(s, -back_min);
3680                 s = HOPc(s, -back_max);
3681             }
3682             else {
3683                 char * const t = (last1 >= reginfo->strbeg)
3684                                     ? HOPc(last1, 1) : last1 + 1;
3685
3686                 last1 = HOPc(s, -back_min);
3687                 s = t;
3688             }
3689             if (utf8_target) {
3690                 while (s <= last1) {
3691                     if (regtry(reginfo, &s))
3692                         goto got_it;
3693                     if (s >= last1) {
3694                         s++; /* to break out of outer loop */
3695                         break;
3696                     }
3697                     s += UTF8SKIP(s);
3698                 }
3699             }
3700             else {
3701                 while (s <= last1) {
3702                     if (regtry(reginfo, &s))
3703                         goto got_it;
3704                     s++;
3705                 }
3706             }
3707         }
3708         DEBUG_EXECUTE_r(if (!did_match) {
3709             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3710                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3711             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3712                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3713                                ? "anchored" : "floating"),
3714                 quoted, RE_SV_TAIL(must));
3715         });
3716         goto phooey;
3717     }
3718     else if ( (c = progi->regstclass) ) {
3719         if (minlen) {
3720             const OPCODE op = OP(progi->regstclass);
3721             /* don't bother with what can't match */
3722             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3723                 strend = HOPc(strend, -(minlen - 1));
3724         }
3725         DEBUG_EXECUTE_r({
3726             SV * const prop = sv_newmortal();
3727             regprop(prog, prop, c, reginfo, NULL);
3728             {
3729                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3730                     s,strend-s,PL_dump_re_max_len);
3731                 Perl_re_printf( aTHX_
3732                     "Matching stclass %.*s against %s (%d bytes)\n",
3733                     (int)SvCUR(prop), SvPVX_const(prop),
3734                      quoted, (int)(strend - s));
3735             }
3736         });
3737         if (find_byclass(prog, c, s, strend, reginfo))
3738             goto got_it;
3739         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3740     }
3741     else {
3742         dontbother = 0;
3743         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3744             /* Trim the end. */
3745             char *last= NULL;
3746             SV* float_real;
3747             STRLEN len;
3748             const char *little;
3749
3750             if (utf8_target) {
3751                 if (! prog->float_utf8) {
3752                     to_utf8_substr(prog);
3753                 }
3754                 float_real = prog->float_utf8;
3755             }
3756             else {
3757                 if (! prog->float_substr) {
3758                     if (! to_byte_substr(prog)) {
3759                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3760                     }
3761                 }
3762                 float_real = prog->float_substr;
3763             }
3764
3765             little = SvPV_const(float_real, len);
3766             if (SvTAIL(float_real)) {
3767                     /* This means that float_real contains an artificial \n on
3768                      * the end due to the presence of something like this:
3769                      * /foo$/ where we can match both "foo" and "foo\n" at the
3770                      * end of the string.  So we have to compare the end of the
3771                      * string first against the float_real without the \n and
3772                      * then against the full float_real with the string.  We
3773                      * have to watch out for cases where the string might be
3774                      * smaller than the float_real or the float_real without
3775                      * the \n. */
3776                     char *checkpos= strend - len;
3777                     DEBUG_OPTIMISE_r(
3778                         Perl_re_printf( aTHX_
3779                             "%sChecking for float_real.%s\n",
3780                             PL_colors[4], PL_colors[5]));
3781                     if (checkpos + 1 < strbeg) {
3782                         /* can't match, even if we remove the trailing \n
3783                          * string is too short to match */
3784                         DEBUG_EXECUTE_r(
3785                             Perl_re_printf( aTHX_
3786                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3787                                 PL_colors[4], PL_colors[5]));
3788                         goto phooey;
3789                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3790                         /* can match, the end of the string matches without the
3791                          * "\n" */
3792                         last = checkpos + 1;
3793                     } else if (checkpos < strbeg) {
3794                         /* cant match, string is too short when the "\n" is
3795                          * included */
3796                         DEBUG_EXECUTE_r(
3797                             Perl_re_printf( aTHX_
3798                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3799                                 PL_colors[4], PL_colors[5]));
3800                         goto phooey;
3801                     } else if (!multiline) {
3802                         /* non multiline match, so compare with the "\n" at the
3803                          * end of the string */
3804                         if (memEQ(checkpos, little, len)) {
3805                             last= checkpos;
3806                         } else {
3807                             DEBUG_EXECUTE_r(
3808                                 Perl_re_printf( aTHX_
3809                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3810                                     PL_colors[4], PL_colors[5]));
3811                             goto phooey;
3812                         }
3813                     } else {
3814                         /* multiline match, so we have to search for a place
3815                          * where the full string is located */
3816                         goto find_last;
3817                     }
3818             } else {
3819                   find_last:
3820                     if (len)
3821                         last = rninstr(s, strend, little, little + len);
3822                     else
3823                         last = strend;  /* matching "$" */
3824             }
3825             if (!last) {
3826                 /* at one point this block contained a comment which was
3827                  * probably incorrect, which said that this was a "should not
3828                  * happen" case.  Even if it was true when it was written I am
3829                  * pretty sure it is not anymore, so I have removed the comment
3830                  * and replaced it with this one. Yves */
3831                 DEBUG_EXECUTE_r(
3832                     Perl_re_printf( aTHX_
3833                         "%sString does not contain required substring, cannot match.%s\n",
3834                         PL_colors[4], PL_colors[5]
3835                     ));
3836                 goto phooey;
3837             }
3838             dontbother = strend - last + prog->float_min_offset;
3839         }
3840         if (minlen && (dontbother < minlen))
3841             dontbother = minlen - 1;
3842         strend -= dontbother;              /* this one's always in bytes! */
3843         /* We don't know much -- general case. */
3844         if (utf8_target) {
3845             for (;;) {
3846                 if (regtry(reginfo, &s))
3847                     goto got_it;
3848                 if (s >= strend)
3849                     break;
3850                 s += UTF8SKIP(s);
3851             };
3852         }
3853         else {
3854             do {
3855                 if (regtry(reginfo, &s))
3856                     goto got_it;
3857             } while (s++ < strend);
3858         }
3859     }
3860
3861     /* Failure. */
3862     goto phooey;
3863
3864   got_it:
3865     /* s/// doesn't like it if $& is earlier than where we asked it to
3866      * start searching (which can happen on something like /.\G/) */
3867     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3868             && (prog->offs[0].start < stringarg - strbeg))
3869     {
3870         /* this should only be possible under \G */
3871         assert(prog->intflags & PREGf_GPOS_SEEN);
3872         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3873             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3874         goto phooey;
3875     }
3876
3877     DEBUG_BUFFERS_r(
3878         if (swap)
3879             Perl_re_exec_indentf( aTHX_
3880                 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3881                 0,
3882                 PTR2UV(prog),
3883                 PTR2UV(swap)
3884             );
3885     );
3886     Safefree(swap);
3887
3888     /* clean up; this will trigger destructors that will free all slabs
3889      * above the current one, and cleanup the regmatch_info_aux
3890      * and regmatch_info_aux_eval sructs */
3891
3892     LEAVE_SCOPE(oldsave);
3893
3894     if (RXp_PAREN_NAMES(prog))
3895         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3896
3897     /* make sure $`, $&, $', and $digit will work later */
3898     if ( !(flags & REXEC_NOT_FIRST) )
3899         S_reg_set_capture_string(aTHX_ rx,
3900                                     strbeg, reginfo->strend,
3901                                     sv, flags, utf8_target);
3902
3903     return 1;
3904
3905   phooey:
3906     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3907                           PL_colors[4], PL_colors[5]));
3908
3909     /* clean up; this will trigger destructors that will free all slabs
3910      * above the current one, and cleanup the regmatch_info_aux
3911      * and regmatch_info_aux_eval sructs */
3912
3913     LEAVE_SCOPE(oldsave);
3914
3915     if (swap) {
3916         /* we failed :-( roll it back */
3917         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3918             "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3919             0,
3920             PTR2UV(prog),
3921             PTR2UV(prog->offs),
3922             PTR2UV(swap)
3923         ));
3924         Safefree(prog->offs);
3925         prog->offs = swap;
3926     }
3927     return 0;
3928 }
3929
3930
3931 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3932  * Do inc before dec, in case old and new rex are the same */
3933 #define SET_reg_curpm(Re2)                          \
3934     if (reginfo->info_aux_eval) {                   \
3935         (void)ReREFCNT_inc(Re2);                    \
3936         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3937         PM_SETRE((PL_reg_curpm), (Re2));            \
3938     }
3939
3940
3941 /*
3942  - regtry - try match at specific point
3943  */
3944 STATIC bool                     /* 0 failure, 1 success */
3945 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3946 {
3947     CHECKPOINT lastcp;
3948     REGEXP *const rx = reginfo->prog;
3949     regexp *const prog = ReANY(rx);
3950     SSize_t result;
3951 #ifdef DEBUGGING
3952     U32 depth = 0; /* used by REGCP_SET */
3953 #endif
3954     RXi_GET_DECL(prog,progi);
3955     GET_RE_DEBUG_FLAGS_DECL;
3956
3957     PERL_ARGS_ASSERT_REGTRY;
3958
3959     reginfo->cutpoint=NULL;
3960
3961     prog->offs[0].start = *startposp - reginfo->strbeg;
3962     prog->lastparen = 0;
3963     prog->lastcloseparen = 0;
3964
3965     /* XXXX What this code is doing here?!!!  There should be no need
3966        to do this again and again, prog->lastparen should take care of
3967        this!  --ilya*/
3968
3969     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3970      * Actually, the code in regcppop() (which Ilya may be meaning by
3971      * prog->lastparen), is not needed at all by the test suite
3972      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3973      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3974      * Meanwhile, this code *is* needed for the
3975      * above-mentioned test suite tests to succeed.  The common theme
3976      * on those tests seems to be returning null fields from matches.
3977      * --jhi updated by dapm */
3978
3979     /* After encountering a variant of the issue mentioned above I think
3980      * the point Ilya was making is that if we properly unwind whenever
3981      * we set lastparen to a smaller value then we should not need to do
3982      * this every time, only when needed. So if we have tests that fail if
3983      * we remove this, then it suggests somewhere else we are improperly
3984      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3985      * places it is called, and related regcp() routines. - Yves */
3986 #if 1
3987     if (prog->nparens) {
3988         regexp_paren_pair *pp = prog->offs;
3989         I32 i;
3990         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3991             ++pp;
3992             pp->start = -1;
3993             pp->end = -1;
3994         }
3995     }
3996 #endif
3997     REGCP_SET(lastcp);
3998     result = regmatch(reginfo, *startposp, progi->program + 1);
3999     if (result != -1) {
4000         prog->offs[0].end = result;
4001         return 1;
4002     }
4003     if (reginfo->cutpoint)
4004         *startposp= reginfo->cutpoint;
4005     REGCP_UNWIND(lastcp);
4006     return 0;
4007 }
4008
4009
4010 #define sayYES goto yes
4011 #define sayNO goto no
4012 #define sayNO_SILENT goto no_silent
4013
4014 /* we dont use STMT_START/END here because it leads to
4015    "unreachable code" warnings, which are bogus, but distracting. */
4016 #define CACHEsayNO \
4017     if (ST.cache_mask) \
4018        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4019     sayNO
4020
4021 /* this is used to determine how far from the left messages like
4022    'failed...' are printed in regexec.c. It should be set such that
4023    messages are inline with the regop output that created them.
4024 */
4025 #define REPORT_CODE_OFF 29
4026 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4027 #ifdef DEBUGGING
4028 int
4029 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4030 {
4031     va_list ap;
4032     int result;
4033     PerlIO *f= Perl_debug_log;
4034     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4035     va_start(ap, depth);
4036     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4037     result = PerlIO_vprintf(f, fmt, ap);
4038     va_end(ap);
4039     return result;
4040 }
4041 #endif /* DEBUGGING */
4042
4043
4044 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4045 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4046 #define CHRTEST_NOT_A_CP_1 -999
4047 #define CHRTEST_NOT_A_CP_2 -998
4048
4049 /* grab a new slab and return the first slot in it */
4050
4051 STATIC regmatch_state *
4052 S_push_slab(pTHX)
4053 {
4054     regmatch_slab *s = PL_regmatch_slab->next;
4055     if (!s) {
4056         Newx(s, 1, regmatch_slab);
4057         s->prev = PL_regmatch_slab;
4058         s->next = NULL;
4059         PL_regmatch_slab->next = s;
4060     }
4061     PL_regmatch_slab = s;
4062     return SLAB_FIRST(s);
4063 }
4064
4065
4066 /* push a new state then goto it */
4067
4068 #define PUSH_STATE_GOTO(state, node, input) \
4069     pushinput = input; \
4070     scan = node; \
4071     st->resume_state = state; \
4072     goto push_state;
4073
4074 /* push a new state with success backtracking, then goto it */
4075
4076 #define PUSH_YES_STATE_GOTO(state, node, input) \
4077     pushinput = input; \
4078     scan = node; \
4079     st->resume_state = state; \
4080     goto push_yes_state;
4081
4082
4083
4084
4085 /*
4086
4087 regmatch() - main matching routine
4088
4089 This is basically one big switch statement in a loop. We execute an op,
4090 set 'next' to point the next op, and continue. If we come to a point which
4091 we may need to backtrack to on failure such as (A|B|C), we push a
4092 backtrack state onto the backtrack stack. On failure, we pop the top
4093 state, and re-enter the loop at the state indicated. If there are no more
4094 states to pop, we return failure.
4095
4096 Sometimes we also need to backtrack on success; for example /A+/, where
4097 after successfully matching one A, we need to go back and try to
4098 match another one; similarly for lookahead assertions: if the assertion
4099 completes successfully, we backtrack to the state just before the assertion
4100 and then carry on.  In these cases, the pushed state is marked as
4101 'backtrack on success too'. This marking is in fact done by a chain of
4102 pointers, each pointing to the previous 'yes' state. On success, we pop to
4103 the nearest yes state, discarding any intermediate failure-only states.
4104 Sometimes a yes state is pushed just to force some cleanup code to be
4105 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4106 it to free the inner regex.
4107
4108 Note that failure backtracking rewinds the cursor position, while
4109 success backtracking leaves it alone.
4110
4111 A pattern is complete when the END op is executed, while a subpattern
4112 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4113 ops trigger the "pop to last yes state if any, otherwise return true"
4114 behaviour.
4115
4116 A common convention in this function is to use A and B to refer to the two
4117 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4118 the subpattern to be matched possibly multiple times, while B is the entire
4119 rest of the pattern. Variable and state names reflect this convention.
4120
4121 The states in the main switch are the union of ops and failure/success of
4122 substates associated with with that op.  For example, IFMATCH is the op
4123 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4124 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4125 successfully matched A and IFMATCH_A_fail is a state saying that we have
4126 just failed to match A. Resume states always come in pairs. The backtrack
4127 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4128 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4129 on success or failure.
4130
4131 The struct that holds a backtracking state is actually a big union, with
4132 one variant for each major type of op. The variable st points to the
4133 top-most backtrack struct. To make the code clearer, within each
4134 block of code we #define ST to alias the relevant union.
4135
4136 Here's a concrete example of a (vastly oversimplified) IFMATCH
4137 implementation:
4138
4139     switch (state) {
4140     ....
4141
4142 #define ST st->u.ifmatch
4143
4144     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4145         ST.foo = ...; // some state we wish to save
4146         ...
4147         // push a yes backtrack state with a resume value of
4148         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4149         // first node of A:
4150         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4151         // NOTREACHED
4152
4153     case IFMATCH_A: // we have successfully executed A; now continue with B
4154         next = B;
4155         bar = ST.foo; // do something with the preserved value
4156         break;
4157
4158     case IFMATCH_A_fail: // A failed, so the assertion failed
4159         ...;   // do some housekeeping, then ...
4160         sayNO; // propagate the failure
4161
4162 #undef ST
4163
4164     ...
4165     }
4166
4167 For any old-timers reading this who are familiar with the old recursive
4168 approach, the code above is equivalent to:
4169
4170     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4171     {
4172         int foo = ...
4173         ...
4174         if (regmatch(A)) {
4175             next = B;
4176             bar = foo;
4177             break;
4178         }
4179         ...;   // do some housekeeping, then ...
4180         sayNO; // propagate the failure
4181     }
4182
4183 The topmost backtrack state, pointed to by st, is usually free. If you
4184 want to claim it, populate any ST.foo fields in it with values you wish to
4185 save, then do one of
4186
4187         PUSH_STATE_GOTO(resume_state, node, newinput);
4188         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4189
4190 which sets that backtrack state's resume value to 'resume_state', pushes a
4191 new free entry to the top of the backtrack stack, then goes to 'node'.
4192 On backtracking, the free slot is popped, and the saved state becomes the
4193 new free state. An ST.foo field in this new top state can be temporarily
4194 accessed to retrieve values, but once the main loop is re-entered, it
4195 becomes available for reuse.
4196
4197 Note that the depth of the backtrack stack constantly increases during the
4198 left-to-right execution of the pattern, rather than going up and down with
4199 the pattern nesting. For example the stack is at its maximum at Z at the
4200 end of the pattern, rather than at X in the following:
4201
4202     /(((X)+)+)+....(Y)+....Z/
4203
4204 The only exceptions to this are lookahead/behind assertions and the cut,
4205 (?>A), which pop all the backtrack states associated with A before
4206 continuing.
4207
4208 Backtrack state structs are allocated in slabs of about 4K in size.
4209 PL_regmatch_state and st always point to the currently active state,
4210 and PL_regmatch_slab points to the slab currently containing
4211 PL_regmatch_state.  The first time regmatch() is called, the first slab is
4212 allocated, and is never freed until interpreter destruction. When the slab
4213 is full, a new one is allocated and chained to the end. At exit from
4214 regmatch(), slabs allocated since entry are freed.
4215
4216 */
4217
4218
4219 #define DEBUG_STATE_pp(pp)                                  \
4220     DEBUG_STATE_r({                                         \
4221         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
4222         Perl_re_printf( aTHX_                                           \
4223             "%*s" pp " %s%s%s%s%s\n",                       \
4224             INDENT_CHARS(depth), "",                        \
4225             PL_reg_name[st->resume_state],                  \
4226             ((st==yes_state||st==mark_state) ? "[" : ""),   \
4227             ((st==yes_state) ? "Y" : ""),                   \
4228             ((st==mark_state) ? "M" : ""),                  \
4229             ((st==yes_state||st==mark_state) ? "]" : "")    \
4230         );                                                  \
4231     });
4232
4233
4234 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4235
4236 #ifdef DEBUGGING
4237
4238 STATIC void
4239 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4240     const char *start, const char *end, const char *blurb)
4241 {
4242     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4243
4244     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4245
4246     if (!PL_colorset)
4247             reginitcolors();
4248     {
4249         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4250             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4251
4252         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4253             start, end - start, PL_dump_re_max_len);
4254
4255         Perl_re_printf( aTHX_
4256             "%s%s REx%s %s against %s\n",
4257                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4258
4259         if (utf8_target||utf8_pat)
4260             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4261                 utf8_pat ? "pattern" : "",
4262                 utf8_pat && utf8_target ? " and " : "",
4263                 utf8_target ? "string" : ""
4264             );
4265     }
4266 }
4267
4268 STATIC void
4269 S_dump_exec_pos(pTHX_ const char *locinput,
4270                       const regnode *scan,
4271                       const char *loc_regeol,
4272                       const char *loc_bostr,
4273                       const char *loc_reg_starttry,
4274                       const bool utf8_target,
4275                       const U32 depth
4276                 )
4277 {
4278     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4279     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4280     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4281     /* The part of the string before starttry has one color
4282        (pref0_len chars), between starttry and current
4283        position another one (pref_len - pref0_len chars),
4284        after the current position the third one.
4285        We assume that pref0_len <= pref_len, otherwise we
4286        decrease pref0_len.  */
4287     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4288         ? (5 + taill) - l : locinput - loc_bostr;
4289     int pref0_len;
4290
4291     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4292
4293     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4294         pref_len++;
4295     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4296     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4297         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4298               ? (5 + taill) - pref_len : loc_regeol - locinput);
4299     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4300         l--;
4301     if (pref0_len < 0)
4302         pref0_len = 0;
4303     if (pref0_len > pref_len)
4304         pref0_len = pref_len;
4305     {
4306         const int is_uni = utf8_target ? 1 : 0;
4307
4308         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4309             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4310
4311         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4312                     (locinput - pref_len + pref0_len),
4313                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4314
4315         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4316                     locinput, loc_regeol - locinput, 10, 0, 1);
4317
4318         const STRLEN tlen=len0+len1+len2;
4319         Perl_re_printf( aTHX_
4320                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4321                     (IV)(locinput - loc_bostr),
4322                     len0, s0,
4323                     len1, s1,
4324                     (docolor ? "" : "> <"),
4325                     len2, s2,
4326                     (int)(tlen > 19 ? 0 :  19 - tlen),
4327                     "",
4328                     depth);
4329     }
4330 }
4331
4332 #endif
4333
4334 /* reg_check_named_buff_matched()
4335  * Checks to see if a named buffer has matched. The data array of
4336  * buffer numbers corresponding to the buffer is expected to reside
4337  * in the regexp->data->data array in the slot stored in the ARG() of
4338  * node involved. Note that this routine doesn't actually care about the
4339  * name, that information is not preserved from compilation to execution.
4340  * Returns the index of the leftmost defined buffer with the given name
4341  * or 0 if non of the buffers matched.
4342  */
4343 STATIC I32
4344 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4345 {
4346     I32 n;
4347     RXi_GET_DECL(rex,rexi);
4348     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4349     I32 *nums=(I32*)SvPVX(sv_dat);
4350
4351     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4352
4353     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4354         if ((I32)rex->lastparen >= nums[n] &&
4355             rex->offs[nums[n]].end != -1)
4356         {
4357             return nums[n];
4358         }
4359     }
4360     return 0;
4361 }
4362
4363
4364 static bool
4365 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4366         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4367 {
4368     /* This function determines if there are one or two characters that match
4369      * the first character of the passed-in EXACTish node <text_node>, and if
4370      * so, returns them in the passed-in pointers.
4371      *
4372      * If it determines that no possible character in the target string can
4373      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4374      * the first character in <text_node> requires UTF-8 to represent, and the
4375      * target string isn't in UTF-8.)
4376      *
4377      * If there are more than two characters that could match the beginning of
4378      * <text_node>, or if more context is required to determine a match or not,
4379      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4380      *
4381      * The motiviation behind this function is to allow the caller to set up
4382      * tight loops for matching.  If <text_node> is of type EXACT, there is
4383      * only one possible character that can match its first character, and so
4384      * the situation is quite simple.  But things get much more complicated if
4385      * folding is involved.  It may be that the first character of an EXACTFish
4386      * node doesn't participate in any possible fold, e.g., punctuation, so it
4387      * can be matched only by itself.  The vast majority of characters that are
4388      * in folds match just two things, their lower and upper-case equivalents.
4389      * But not all are like that; some have multiple possible matches, or match
4390      * sequences of more than one character.  This function sorts all that out.
4391      *
4392      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4393      * loop of trying to match A*, we know we can't exit where the thing
4394      * following it isn't a B.  And something can't be a B unless it is the
4395      * beginning of B.  By putting a quick test for that beginning in a tight
4396      * loop, we can rule out things that can't possibly be B without having to
4397      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4398      * character, we can make a tight loop matching A*, using the outputs of
4399      * this function.
4400      *
4401      * If the target string to match isn't in UTF-8, and there aren't
4402      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4403      * the one or two possible octets (which are characters in this situation)
4404      * that can match.  In all cases, if there is only one character that can
4405      * match, *<c1p> and *<c2p> will be identical.
4406      *
4407      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4408      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4409      * can match the beginning of <text_node>.  They should be declared with at
4410      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4411      * undefined what these contain.)  If one or both of the buffers are
4412      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4413      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4414      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4415      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4416      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4417
4418     const bool utf8_target = reginfo->is_utf8_target;
4419
4420     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4421     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4422     bool use_chrtest_void = FALSE;
4423     const bool is_utf8_pat = reginfo->is_utf8_pat;
4424
4425     /* Used when we have both utf8 input and utf8 output, to avoid converting
4426      * to/from code points */
4427     bool utf8_has_been_setup = FALSE;
4428
4429     dVAR;
4430
4431     U8 *pat = (U8*)STRING(text_node);
4432     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4433
4434     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4435
4436         /* In an exact node, only one thing can be matched, that first
4437          * character.  If both the pat and the target are UTF-8, we can just
4438          * copy the input to the output, avoiding finding the code point of
4439          * that character */
4440         if (!is_utf8_pat) {
4441             c2 = c1 = *pat;
4442         }
4443         else if (utf8_target) {
4444             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4445             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4446             utf8_has_been_setup = TRUE;
4447         }
4448         else {
4449             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4450         }
4451     }
4452     else { /* an EXACTFish node */
4453         U8 *pat_end = pat + STR_LEN(text_node);
4454
4455         /* An EXACTFL node has at least some characters unfolded, because what
4456          * they match is not known until now.  So, now is the time to fold
4457          * the first few of them, as many as are needed to determine 'c1' and
4458          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4459          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4460          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4461          * need to fold as many characters as a single character can fold to,
4462          * so that later we can check if the first ones are such a multi-char
4463          * fold.  But, in such a pattern only locale-problematic characters
4464          * aren't folded, so we can skip this completely if the first character
4465          * in the node isn't one of the tricky ones */
4466         if (OP(text_node) == EXACTFL) {
4467
4468             if (! is_utf8_pat) {
4469                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4470                 {
4471                     folded[0] = folded[1] = 's';
4472                     pat = folded;
4473                     pat_end = folded + 2;
4474                 }
4475             }
4476             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4477                 U8 *s = pat;
4478                 U8 *d = folded;
4479                 int i;
4480
4481                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4482                     if (isASCII(*s)) {
4483                         *(d++) = (U8) toFOLD_LC(*s);
4484                         s++;
4485                     }
4486                     else {
4487                         STRLEN len;
4488                         _toFOLD_utf8_flags(s,
4489                                            pat_end,
4490                                            d,
4491                                            &len,
4492                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4493                         d += len;
4494                         s += UTF8SKIP(s);
4495                     }
4496                 }
4497
4498                 pat = folded;
4499                 pat_end = d;
4500             }
4501         }
4502
4503         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4504              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4505         {
4506             /* Multi-character folds require more context to sort out.  Also
4507              * PL_utf8_foldclosures used below doesn't handle them, so have to
4508              * be handled outside this routine */
4509             use_chrtest_void = TRUE;
4510         }
4511         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4512             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4513             if (c1 > 255) {
4514                 const unsigned int * remaining_folds_to_list;
4515                 unsigned int first_folds_to;
4516
4517                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4518                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4519                 Size_t folds_to_count = _inverse_folds(c1,
4520                                                      &first_folds_to,
4521                                                      &remaining_folds_to_list);
4522                 if (folds_to_count == 0) {
4523                     c2 = c1;    /* there is only a single character that could
4524                                    match */
4525                 }
4526                 else if (folds_to_count != 1) {
4527                     /* If there aren't exactly two folds to this (itself and
4528                      * another), it is outside the scope of this function */
4529                     use_chrtest_void = TRUE;
4530                 }
4531                 else {  /* There are two.  We already have one, get the other */
4532                     c2 = first_folds_to;
4533
4534                     /* Folds that cross the 255/256 boundary are forbidden if
4535                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4536                      * ASCIII.  The only other match to c1 is c2, and since c1
4537                      * is above 255, c2 better be as well under these
4538                      * circumstances.  If it isn't, it means the only legal
4539                      * match of c1 is itself. */
4540                     if (    c2 < 256
4541                         && (   (   OP(text_node) == EXACTFL
4542                                 && ! IN_UTF8_CTYPE_LOCALE)
4543                             || ((     OP(text_node) == EXACTFAA
4544                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4545                                 && (isASCII(c1) || isASCII(c2)))))
4546                     {
4547                         c2 = c1;
4548                     }
4549                 }
4550             }
4551             else /* Here, c1 is <= 255 */
4552                 if (utf8_target
4553                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4554                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4555                     && ((OP(text_node) != EXACTFAA
4556                         && OP(text_node) != EXACTFAA_NO_TRIE)
4557                         || ! isASCII(c1)))
4558             {
4559                 /* Here, there could be something above Latin1 in the target
4560                  * which folds to this character in the pattern.  All such
4561                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4562                  * than two characters involved in their folds, so are outside
4563                  * the scope of this function */
4564                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4565                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4566                 }
4567                 else {
4568                     use_chrtest_void = TRUE;
4569                 }
4570             }
4571             else { /* Here nothing above Latin1 can fold to the pattern
4572                       character */
4573                 switch (OP(text_node)) {
4574
4575                     case EXACTFL:   /* /l rules */
4576                         c2 = PL_fold_locale[c1];
4577                         break;
4578
4579                     case EXACTF:   /* This node only generated for non-utf8
4580                                     patterns */
4581                         assert(! is_utf8_pat);
4582                         if (! utf8_target) {    /* /d rules */
4583                             c2 = PL_fold[c1];
4584                             break;
4585                         }
4586                         /* FALLTHROUGH */
4587                         /* /u rules for all these.  This happens to work for
4588                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4589                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4590                                                 non-utf8 patterns */
4591                         assert(! is_utf8_pat);
4592                         /* FALLTHROUGH */
4593                     case EXACTFAA:
4594                     case EXACTFU_SS:
4595                     case EXACTFU:
4596                         c2 = PL_fold_latin1[c1];
4597                         break;
4598
4599                     default:
4600                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4601                         NOT_REACHED; /* NOTREACHED */
4602                 }
4603             }
4604         }
4605     }
4606
4607     /* Here have figured things out.  Set up the returns */
4608     if (use_chrtest_void) {
4609         *c2p = *c1p = CHRTEST_VOID;
4610     }
4611     else if (utf8_target) {
4612         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4613             uvchr_to_utf8(c1_utf8, c1);
4614             uvchr_to_utf8(c2_utf8, c2);
4615         }
4616
4617         /* Invariants are stored in both the utf8 and byte outputs; Use
4618          * negative numbers otherwise for the byte ones.  Make sure that the
4619          * byte ones are the same iff the utf8 ones are the same */
4620         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4621         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4622                 ? *c2_utf8
4623                 : (c1 == c2)
4624                   ? CHRTEST_NOT_A_CP_1
4625                   : CHRTEST_NOT_A_CP_2;
4626     }
4627     else if (c1 > 255) {
4628        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4629                            can represent */
4630            return FALSE;
4631        }
4632
4633        *c1p = *c2p = c2;    /* c2 is the only representable value */
4634     }
4635     else {  /* c1 is representable; see about c2 */
4636        *c1p = c1;
4637        *c2p = (c2 < 256) ? c2 : c1;
4638     }
4639
4640     return TRUE;
4641 }
4642
4643 STATIC bool
4644 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4645 {
4646     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4647      * between the inputs.  See http://www.unicode.org/reports/tr29/. */
4648
4649     PERL_ARGS_ASSERT_ISGCB;
4650
4651     switch (GCB_table[before][after]) {
4652         case GCB_BREAKABLE:
4653             return TRUE;
4654
4655         case GCB_NOBREAK:
4656             return FALSE;
4657
4658         case GCB_RI_then_RI:
4659             {
4660                 int RI_count = 1;
4661                 U8 * temp_pos = (U8 *) curpos;
4662
4663                 /* Do not break within emoji flag sequences. That is, do not
4664                  * break between regional indicator (RI) symbols if there is an
4665                  * odd number of RI characters before the break point.
4666                  *  GB12   sot (RI RI)* RI × RI
4667                  *  GB13 [^RI] (RI RI)* RI × RI */
4668
4669                 while (backup_one_GCB(strbeg,
4670                                     &temp_pos,
4671                                     utf8_target) == GCB_Regional_Indicator)
4672                 {
4673                     RI_count++;
4674                 }
4675
4676                 return RI_count % 2 != 1;
4677             }
4678
4679         case GCB_EX_then_EM:
4680
4681             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4682             {
4683                 U8 * temp_pos = (U8 *) curpos;
4684                 GCB_enum prev;
4685
4686                 do {
4687                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4688                 }
4689                 while (prev == GCB_Extend);
4690
4691                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4692             }
4693
4694         default:
4695             break;
4696     }
4697
4698 #ifdef DEBUGGING
4699     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4700                                   before, after, GCB_table[before][after]);
4701     assert(0);
4702 #endif
4703     return TRUE;
4704 }
4705
4706 STATIC GCB_enum
4707 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4708 {
4709     GCB_enum gcb;
4710
4711     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4712
4713     if (*curpos < strbeg) {
4714         return GCB_EDGE;
4715     }
4716
4717     if (utf8_target) {
4718         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4719         U8 * prev_prev_char_pos;
4720
4721         if (! prev_char_pos) {
4722             return GCB_EDGE;
4723         }
4724
4725         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4726             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4727             *curpos = prev_char_pos;
4728             prev_char_pos = prev_prev_char_pos;
4729         }
4730         else {
4731             *curpos = (U8 *) strbeg;
4732             return GCB_EDGE;
4733         }
4734     }
4735     else {
4736         if (*curpos - 2 < strbeg) {
4737             *curpos = (U8 *) strbeg;
4738             return GCB_EDGE;
4739         }
4740         (*curpos)--;
4741         gcb = getGCB_VAL_CP(*(*curpos - 1));
4742     }
4743
4744     return gcb;
4745 }
4746
4747 /* Combining marks attach to most classes that precede them, but this defines
4748  * the exceptions (from TR14) */
4749 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4750                                      || prev == LB_Mandatory_Break      \
4751                                      || prev == LB_Carriage_Return      \
4752                                      || prev == LB_Line_Feed            \
4753                                      || prev == LB_Next_Line            \
4754                                      || prev == LB_Space                \
4755                                      || prev == LB_ZWSpace))
4756
4757 STATIC bool
4758 S_isLB(pTHX_ LB_enum before,
4759              LB_enum after,
4760              const U8 * const strbeg,
4761              const U8 * const curpos,
4762              const U8 * const strend,
4763              const bool utf8_target)
4764 {
4765     U8 * temp_pos = (U8 *) curpos;
4766     LB_enum prev = before;
4767
4768     /* Is the boundary between 'before' and 'after' line-breakable?
4769      * Most of this is just a table lookup of a generated table from Unicode
4770      * rules.  But some rules require context to decide, and so have to be
4771      * implemented in code */
4772
4773     PERL_ARGS_ASSERT_ISLB;
4774
4775     /* Rule numbers in the comments below are as of Unicode 9.0 */
4776
4777   redo:
4778     before = prev;
4779     switch (LB_table[before][after]) {
4780         case LB_BREAKABLE:
4781             return TRUE;
4782
4783         case LB_NOBREAK:
4784         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4785             return FALSE;
4786
4787         case LB_SP_foo + LB_BREAKABLE:
4788         case LB_SP_foo + LB_NOBREAK:
4789         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4790
4791             /* When we have something following a SP, we have to look at the
4792              * context in order to know what to do.
4793              *
4794              * SP SP should not reach here because LB7: Do not break before
4795              * spaces.  (For two spaces in a row there is nothing that
4796              * overrides that) */
4797             assert(after != LB_Space);
4798
4799             /* Here we have a space followed by a non-space.  Mostly this is a
4800              * case of LB18: "Break after spaces".  But there are complications
4801              * as the handling of spaces is somewhat tricky.  They are in a
4802              * number of rules, which have to be applied in priority order, but
4803              * something earlier in the string can cause a rule to be skipped
4804              * and a lower priority rule invoked.  A prime example is LB7 which
4805              * says don't break before a space.  But rule LB8 (lower priority)
4806              * says that the first break opportunity after a ZW is after any
4807              * span of spaces immediately after it.  If a ZW comes before a SP
4808              * in the input, rule LB8 applies, and not LB7.  Other such rules
4809              * involve combining marks which are rules 9 and 10, but they may
4810              * override higher priority rules if they come earlier in the
4811              * string.  Since we're doing random access into the middle of the
4812              * string, we have to look for rules that should get applied based
4813              * on both string position and priority.  Combining marks do not
4814              * attach to either ZW nor SP, so we don't have to consider them
4815              * until later.
4816              *
4817              * To check for LB8, we have to find the first non-space character
4818              * before this span of spaces */
4819             do {
4820                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4821             }
4822             while (prev == LB_Space);
4823
4824             /* LB8 Break before any character following a zero-width space,
4825              * even if one or more spaces intervene.
4826              *      ZW SP* ÷
4827              * So if we have a ZW just before this span, and to get here this
4828              * is the final space in the span. */
4829             if (prev == LB_ZWSpace) {
4830                 return TRUE;
4831             }
4832
4833             /* Here, not ZW SP+.  There are several rules that have higher
4834              * priority than LB18 and can be resolved now, as they don't depend
4835              * on anything earlier in the string (except ZW, which we have
4836              * already handled).  One of these rules is LB11 Do not break
4837              * before Word joiner, but we have specially encoded that in the
4838              * lookup table so it is caught by the single test below which
4839              * catches the other ones. */
4840             if (LB_table[LB_Space][after] - LB_SP_foo
4841                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4842             {
4843                 return FALSE;
4844             }
4845
4846             /* If we get here, we have to XXX consider combining marks. */
4847             if (prev == LB_Combining_Mark) {
4848
4849                 /* What happens with these depends on the character they
4850                  * follow.  */
4851                 do {
4852                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4853                 }
4854                 while (prev == LB_Combining_Mark);
4855
4856                 /* Most times these attach to and inherit the characteristics
4857                  * of that character, but not always, and when not, they are to
4858                  * be treated as AL by rule LB10. */
4859                 if (! LB_CM_ATTACHES_TO(prev)) {
4860                     prev = LB_Alphabetic;
4861                 }
4862             }
4863
4864             /* Here, we have the character preceding the span of spaces all set
4865              * up.  We follow LB18: "Break after spaces" unless the table shows
4866              * that is overriden */
4867             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4868
4869         case LB_CM_ZWJ_foo:
4870
4871             /* We don't know how to treat the CM except by looking at the first
4872              * non-CM character preceding it.  ZWJ is treated as CM */
4873             do {
4874                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4875             }
4876             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4877
4878             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4879              * attatches to it, then it inherits the behavior of 'prev'.  If it
4880              * doesn't attach, it is to be treated as an AL */
4881             if (! LB_CM_ATTACHES_TO(prev)) {
4882                 prev = LB_Alphabetic;
4883             }
4884
4885             goto redo;
4886
4887         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4888         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4889
4890             /* LB21a Don't break after Hebrew + Hyphen.
4891              * HL (HY | BA) × */
4892
4893             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4894                                                           == LB_Hebrew_Letter)
4895             {
4896                 return FALSE;
4897             }
4898
4899             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4900
4901         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4902         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4903
4904             /* LB25a (PR | PO) × ( OP | HY )? NU */
4905             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4906                 return FALSE;
4907             }
4908
4909             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4910                                                                 == LB_BREAKABLE;
4911
4912         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4913         case LB_SY_or_IS_then_various + LB_NOBREAK:
4914         {
4915             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4916
4917             LB_enum temp = prev;
4918             do {
4919                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4920             }
4921             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4922             if (temp == LB_Numeric) {
4923                 return FALSE;
4924             }
4925
4926             return LB_table[prev][after] - LB_SY_or_IS_then_various
4927                                                                == LB_BREAKABLE;
4928         }
4929
4930         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4931         case LB_various_then_PO_or_PR + LB_NOBREAK:
4932         {
4933             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4934
4935             LB_enum temp = prev;
4936             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4937             {
4938                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4939             }
4940             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4941                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4942             }
4943             if (temp == LB_Numeric) {
4944                 return FALSE;
4945             }
4946             return LB_various_then_PO_or_PR;
4947         }
4948
4949         case LB_RI_then_RI + LB_NOBREAK:
4950         case LB_RI_then_RI + LB_BREAKABLE:
4951             {
4952                 int RI_count = 1;
4953
4954                 /* LB30a Break between two regional indicator symbols if and
4955                  * only if there are an even number of regional indicators
4956                  * preceding the position of the break.
4957                  *
4958                  *    sot (RI RI)* RI × RI
4959                  *  [^RI] (RI RI)* RI × RI */
4960
4961                 while (backup_one_LB(strbeg,
4962                                      &temp_pos,
4963                                      utf8_target) == LB_Regional_Indicator)
4964                 {
4965                     RI_count++;
4966                 }
4967
4968                 return RI_count % 2 == 0;
4969             }
4970
4971         default:
4972             break;
4973     }
4974
4975 #ifdef DEBUGGING
4976     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4977                                   before, after, LB_table[before][after]);
4978     assert(0);
4979 #endif
4980     return TRUE;
4981 }
4982
4983 STATIC LB_enum
4984 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4985 {
4986     LB_enum lb;
4987
4988     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4989
4990     if (*curpos >= strend) {
4991         return LB_EDGE;
4992     }
4993
4994     if (utf8_target) {
4995         *curpos += UTF8SKIP(*curpos);
4996         if (*curpos >= strend) {
4997             return LB_EDGE;
4998         }
4999         lb = getLB_VAL_UTF8(*curpos, strend);
5000     }
5001     else {
5002         (*curpos)++;
5003         if (*curpos >= strend) {
5004             return LB_EDGE;
5005         }
5006         lb = getLB_VAL_CP(**curpos);
5007     }
5008
5009     return lb;
5010 }
5011
5012 STATIC LB_enum
5013 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5014 {
5015     LB_enum lb;
5016
5017     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5018
5019     if (*curpos < strbeg) {
5020         return LB_EDGE;
5021     }
5022
5023     if (utf8_target) {
5024         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5025         U8 * prev_prev_char_pos;
5026
5027         if (! prev_char_pos) {
5028             return LB_EDGE;
5029         }
5030
5031         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5032             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5033             *curpos = prev_char_pos;
5034             prev_char_pos = prev_prev_char_pos;
5035         }
5036         else {
5037             *curpos = (U8 *) strbeg;
5038             return LB_EDGE;
5039         }
5040     }
5041     else {
5042         if (*curpos - 2 < strbeg) {
5043             *curpos = (U8 *) strbeg;
5044             return LB_EDGE;
5045         }
5046         (*curpos)--;
5047         lb = getLB_VAL_CP(*(*curpos - 1));
5048     }
5049
5050     return lb;
5051 }
5052
5053 STATIC bool
5054 S_isSB(pTHX_ SB_enum before,
5055              SB_enum after,
5056              const U8 * const strbeg,
5057              const U8 * const curpos,
5058              const U8 * const strend,
5059              const bool utf8_target)
5060 {
5061     /* returns a boolean indicating if there is a Sentence Boundary Break
5062      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
5063
5064     U8 * lpos = (U8 *) curpos;
5065     bool has_para_sep = FALSE;
5066     bool has_sp = FALSE;
5067
5068     PERL_ARGS_ASSERT_ISSB;
5069
5070     /* Break at the start and end of text.
5071         SB1.  sot  ÷
5072         SB2.  ÷  eot
5073       But unstated in Unicode is don't break if the text is empty */
5074     if (before == SB_EDGE || after == SB_EDGE) {
5075         return before != after;
5076     }
5077
5078     /* SB 3: Do not break within CRLF. */
5079     if (before == SB_CR && after == SB_LF) {
5080         return FALSE;
5081     }
5082
5083     /* Break after paragraph separators.  CR and LF are considered
5084      * so because Unicode views text as like word processing text where there
5085      * are no newlines except between paragraphs, and the word processor takes
5086      * care of wrapping without there being hard line-breaks in the text *./
5087        SB4.  Sep | CR | LF  ÷ */
5088     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5089         return TRUE;
5090     }
5091
5092     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5093      * (See Section 6.2, Replacing Ignore Rules.)
5094         SB5.  X (Extend | Format)*  →  X */
5095     if (after == SB_Extend || after == SB_Format) {
5096
5097         /* Implied is that the these characters attach to everything
5098          * immediately prior to them except for those separator-type
5099          * characters.  And the rules earlier have already handled the case
5100          * when one of those immediately precedes the extend char */
5101         return FALSE;
5102     }
5103
5104     if (before == SB_Extend || before == SB_Format) {
5105         U8 * temp_pos = lpos;
5106         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5107         if (   backup != SB_EDGE
5108             && backup != SB_Sep
5109             && backup != SB_CR
5110             && backup != SB_LF)
5111         {
5112             before = backup;
5113             lpos = temp_pos;
5114         }
5115
5116         /* Here, both 'before' and 'backup' are these types; implied is that we
5117          * don't break between them */
5118         if (backup == SB_Extend || backup == SB_Format) {
5119             return FALSE;
5120         }
5121     }
5122
5123     /* Do not break after ambiguous terminators like period, if they are
5124      * immediately followed by a number or lowercase letter, if they are
5125      * between uppercase letters, if the first following letter (optionally
5126      * after certain punctuation) is lowercase, or if they are followed by
5127      * "continuation" punctuation such as comma, colon, or semicolon. For
5128      * example, a period may be an abbreviation or numeric period, and thus may
5129      * not mark the end of a sentence.
5130
5131      * SB6. ATerm  ×  Numeric */
5132     if (before == SB_ATerm && after == SB_Numeric) {
5133         return FALSE;
5134     }
5135
5136     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5137     if (before == SB_ATerm && after == SB_Upper) {
5138         U8 * temp_pos = lpos;
5139         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5140         if (backup == SB_Upper || backup == SB_Lower) {
5141             return FALSE;
5142         }
5143     }
5144
5145     /* The remaining rules that aren't the final one, all require an STerm or
5146      * an ATerm after having backed up over some Close* Sp*, and in one case an
5147      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5148      * So do that backup now, setting flags if either Sp or a paragraph
5149      * separator are found */
5150
5151     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5152         has_para_sep = TRUE;
5153         before = backup_one_SB(strbeg, &lpos, utf8_target);
5154     }
5155
5156     if (before == SB_Sp) {
5157         has_sp = TRUE;
5158         do {
5159             before = backup_one_SB(strbeg, &lpos, utf8_target);
5160         }
5161         while (before == SB_Sp);
5162     }
5163
5164     while (before == SB_Close) {
5165         before = backup_one_SB(strbeg, &lpos, utf8_target);
5166     }
5167
5168     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5169      * most cases an STerm */
5170     if (before == SB_STerm || before == SB_ATerm) {
5171
5172         /* So, here the lhs matches
5173          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5174          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5175          * The rules that apply here are:
5176          *
5177          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5178                                            | LF | STerm | ATerm) )* Lower
5179            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5180            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5181            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5182            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5183          */
5184
5185         /* And all but SB11 forbid having seen a paragraph separator */
5186         if (! has_para_sep) {
5187             if (before == SB_ATerm) {          /* SB8 */
5188                 U8 * rpos = (U8 *) curpos;
5189                 SB_enum later = after;
5190
5191                 while (    later != SB_OLetter
5192                         && later != SB_Upper
5193                         && later != SB_Lower
5194                         && later != SB_Sep
5195                         && later != SB_CR
5196                         && later != SB_LF
5197                         && later != SB_STerm
5198                         && later != SB_ATerm
5199                         && later != SB_EDGE)
5200                 {
5201                     later = advance_one_SB(&rpos, strend, utf8_target);
5202                 }
5203                 if (later == SB_Lower) {
5204                     return FALSE;
5205                 }
5206             }
5207
5208             if (   after == SB_SContinue    /* SB8a */
5209                 || after == SB_STerm
5210                 || after == SB_ATerm)
5211             {
5212                 return FALSE;
5213             }
5214
5215             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5216                 if (   after == SB_Close
5217                     || after == SB_Sp
5218                     || after == SB_Sep
5219                     || after == SB_CR
5220                     || after == SB_LF)
5221                 {
5222                     return FALSE;
5223                 }
5224             }
5225
5226             /* SB10.  This and SB9 could probably be combined some way, but khw
5227              * has decided to follow the Unicode rule book precisely for
5228              * simplified maintenance */
5229             if (   after == SB_Sp
5230                 || after == SB_Sep
5231                 || after == SB_CR
5232                 || after == SB_LF)
5233             {
5234                 return FALSE;
5235             }
5236         }
5237
5238         /* SB11.  */
5239         return TRUE;
5240     }
5241
5242     /* Otherwise, do not break.
5243     SB12.  Any  ×  Any */
5244
5245     return FALSE;
5246 }
5247
5248 STATIC SB_enum
5249 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5250 {
5251     SB_enum sb;
5252
5253     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5254
5255     if (*curpos >= strend) {
5256         return SB_EDGE;
5257     }
5258
5259     if (utf8_target) {
5260         do {
5261             *curpos += UTF8SKIP(*curpos);
5262             if (*curpos >= strend) {
5263                 return SB_EDGE;
5264             }
5265             sb = getSB_VAL_UTF8(*curpos, strend);
5266         } while (sb == SB_Extend || sb == SB_Format);
5267     }
5268     else {
5269         do {
5270             (*curpos)++;
5271             if (*curpos >= strend) {
5272                 return SB_EDGE;
5273             }
5274             sb = getSB_VAL_CP(**curpos);
5275         } while (sb == SB_Extend || sb == SB_Format);
5276     }
5277
5278     return sb;
5279 }
5280
5281 STATIC SB_enum
5282 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5283 {
5284     SB_enum sb;
5285
5286     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5287
5288     if (*curpos < strbeg) {
5289         return SB_EDGE;
5290     }
5291
5292     if (utf8_target) {
5293         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5294         if (! prev_char_pos) {
5295             return SB_EDGE;
5296         }
5297
5298         /* Back up over Extend and Format.  curpos is always just to the right
5299          * of the characater whose value we are getting */
5300         do {
5301             U8 * prev_prev_char_pos;
5302             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5303                                                                       strbeg)))
5304             {
5305                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5306                 *curpos = prev_char_pos;
5307                 prev_char_pos = prev_prev_char_pos;
5308             }
5309             else {
5310                 *curpos = (U8 *) strbeg;
5311                 return SB_EDGE;
5312             }
5313         } while (sb == SB_Extend || sb == SB_Format);
5314     }
5315     else {
5316         do {
5317             if (*curpos - 2 < strbeg) {
5318                 *curpos = (U8 *) strbeg;
5319                 return SB_EDGE;
5320             }
5321             (*curpos)--;
5322             sb = getSB_VAL_CP(*(*curpos - 1));
5323         } while (sb == SB_Extend || sb == SB_Format);
5324     }
5325
5326     return sb;
5327 }
5328
5329 STATIC bool
5330 S_isWB(pTHX_ WB_enum previous,
5331              WB_enum before,
5332              WB_enum after,
5333              const U8 * const strbeg,
5334              const U8 * const curpos,
5335              const U8 * const strend,
5336              const bool utf8_target)
5337 {
5338     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5339      *  a Unicode word break, using their published algorithm, but tailored for
5340      *  Perl by treating spans of white space as one unit.  Context may be
5341      *  needed to make this determination.  If the value for the character
5342      *  before 'before' is known, it is passed as 'previous'; otherwise that
5343      *  should be set to WB_UNKNOWN.  The other input parameters give the
5344      *  boundaries and current position in the matching of the string.  That
5345      *  is, 'curpos' marks the position where the character whose wb value is
5346      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5347
5348     U8 * before_pos = (U8 *) curpos;
5349     U8 * after_pos = (U8 *) curpos;
5350     WB_enum prev = before;
5351     WB_enum next;
5352
5353     PERL_ARGS_ASSERT_ISWB;
5354
5355     /* Rule numbers in the comments below are as of Unicode 9.0 */
5356
5357   redo:
5358     before = prev;
5359     switch (WB_table[before][after]) {
5360         case WB_BREAKABLE:
5361             return TRUE;
5362
5363         case WB_NOBREAK:
5364             return FALSE;
5365
5366         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5367             next = advance_one_WB(&after_pos, strend, utf8_target,
5368                                  FALSE /* Don't skip Extend nor Format */ );
5369             /* A space immediately preceeding an Extend or Format is attached
5370              * to by them, and hence gets separated from previous spaces.
5371              * Otherwise don't break between horizontal white space */
5372             return next == WB_Extend || next == WB_Format;
5373
5374         /* WB4 Ignore Format and Extend characters, except when they appear at
5375          * the beginning of a region of text.  This code currently isn't
5376          * general purpose, but it works as the rules are currently and likely
5377          * to be laid out.  The reason it works is that when 'they appear at
5378          * the beginning of a region of text', the rule is to break before
5379          * them, just like any other character.  Therefore, the default rule
5380          * applies and we don't have to look in more depth.  Should this ever
5381          * change, we would have to have 2 'case' statements, like in the rules
5382          * below, and backup a single character (not spacing over the extend
5383          * ones) and then see if that is one of the region-end characters and
5384          * go from there */
5385         case WB_Ex_or_FO_or_ZWJ_then_foo:
5386             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5387             goto redo;
5388
5389         case WB_DQ_then_HL + WB_BREAKABLE:
5390         case WB_DQ_then_HL + WB_NOBREAK:
5391
5392             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5393
5394             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5395                                                             == WB_Hebrew_Letter)
5396             {
5397                 return FALSE;
5398             }
5399
5400              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5401
5402         case WB_HL_then_DQ + WB_BREAKABLE:
5403         case WB_HL_then_DQ + WB_NOBREAK:
5404
5405             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5406
5407             if (advance_one_WB(&after_pos, strend, utf8_target,
5408                                        TRUE /* Do skip Extend and Format */ )
5409                                                             == WB_Hebrew_Letter)
5410             {
5411                 return FALSE;
5412             }
5413
5414             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5415
5416         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5417         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5418
5419             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5420              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5421
5422             next = advance_one_WB(&after_pos, strend, utf8_target,
5423                                        TRUE /* Do skip Extend and Format */ );
5424
5425             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5426             {
5427                 return FALSE;
5428             }
5429
5430             return WB_table[before][after]
5431                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5432
5433         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5434         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5435
5436             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5437              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5438
5439             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5440             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5441             {
5442                 return FALSE;
5443             }
5444
5445             return WB_table[before][after]
5446                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5447
5448         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5449         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5450
5451             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5452              * */
5453
5454             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5455                                                             == WB_Numeric)
5456             {
5457                 return FALSE;
5458             }
5459
5460             return WB_table[before][after]
5461                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5462
5463         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5464         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5465
5466             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5467
5468             if (advance_one_WB(&after_pos, strend, utf8_target,
5469                                        TRUE /* Do skip Extend and Format */ )
5470                                                             == WB_Numeric)
5471             {
5472                 return FALSE;
5473             }
5474
5475             return WB_table[before][after]
5476                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5477
5478         case WB_RI_then_RI + WB_NOBREAK:
5479         case WB_RI_then_RI + WB_BREAKABLE:
5480             {
5481                 int RI_count = 1;
5482
5483                 /* Do not break within emoji flag sequences. That is, do not
5484                  * break between regional indicator (RI) symbols if there is an
5485                  * odd number of RI characters before the potential break
5486                  * point.
5487                  *
5488                  * WB15   sot (RI RI)* RI × RI
5489                  * WB16 [^RI] (RI RI)* RI × RI */
5490
5491                 while (backup_one_WB(&previous,
5492                                      strbeg,
5493                                      &before_pos,
5494                                      utf8_target) == WB_Regional_Indicator)
5495                 {
5496                     RI_count++;
5497                 }
5498
5499                 return RI_count % 2 != 1;
5500             }
5501
5502         default:
5503             break;
5504     }
5505
5506 #ifdef DEBUGGING
5507     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5508                                   before, after, WB_table[before][after]);
5509     assert(0);
5510 #endif
5511     return TRUE;
5512 }
5513
5514 STATIC WB_enum
5515 S_advance_one_WB(pTHX_ U8 ** curpos,
5516                        const U8 * const strend,
5517                        const bool utf8_target,
5518                        const bool skip_Extend_Format)
5519 {
5520     WB_enum wb;
5521
5522     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5523
5524     if (*curpos >= strend) {
5525         return WB_EDGE;
5526     }
5527
5528     if (utf8_target) {
5529
5530         /* Advance over Extend and Format */
5531         do {
5532             *curpos += UTF8SKIP(*curpos);
5533             if (*curpos >= strend) {
5534                 return WB_EDGE;
5535             }
5536             wb = getWB_VAL_UTF8(*curpos, strend);
5537         } while (    skip_Extend_Format
5538                  && (wb == WB_Extend || wb == WB_Format));
5539     }
5540     else {
5541         do {
5542             (*curpos)++;
5543             if (*curpos >= strend) {
5544                 return WB_EDGE;
5545             }
5546             wb = getWB_VAL_CP(**curpos);
5547         } while (    skip_Extend_Format
5548                  && (wb == WB_Extend || wb == WB_Format));
5549     }
5550
5551     return wb;
5552 }
5553
5554 STATIC WB_enum
5555 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5556 {
5557     WB_enum wb;
5558
5559     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5560
5561     /* If we know what the previous character's break value is, don't have
5562         * to look it up */
5563     if (*previous != WB_UNKNOWN) {
5564         wb = *previous;
5565
5566         /* But we need to move backwards by one */
5567         if (utf8_target) {
5568             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5569             if (! *curpos) {
5570                 *previous = WB_EDGE;
5571                 *curpos = (U8 *) strbeg;
5572             }
5573             else {
5574                 *previous = WB_UNKNOWN;
5575             }
5576         }
5577         else {
5578             (*curpos)--;
5579             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5580         }
5581
5582         /* And we always back up over these three types */
5583         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5584             return wb;
5585         }
5586     }
5587
5588     if (*curpos < strbeg) {
5589         return WB_EDGE;
5590     }
5591
5592     if (utf8_target) {
5593         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5594         if (! prev_char_pos) {
5595             return WB_EDGE;
5596         }
5597
5598         /* Back up over Extend and Format.  curpos is always just to the right
5599          * of the characater whose value we are getting */
5600         do {
5601             U8 * prev_prev_char_pos;
5602             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5603                                                    -1,
5604                                                    strbeg)))
5605             {
5606                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5607                 *curpos = prev_char_pos;
5608                 prev_char_pos = prev_prev_char_pos;
5609             }
5610             else {
5611                 *curpos = (U8 *) strbeg;
5612                 return WB_EDGE;
5613             }
5614         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5615     }
5616     else {
5617         do {
5618             if (*curpos - 2 < strbeg) {
5619                 *curpos = (U8 *) strbeg;
5620                 return WB_EDGE;
5621             }
5622             (*curpos)--;
5623             wb = getWB_VAL_CP(*(*curpos - 1));
5624         } while (wb == WB_Extend || wb == WB_Format);
5625     }
5626
5627     return wb;
5628 }
5629
5630 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5631 (                                                           \
5632     (   ( st )                                         ) && \
5633     (   ( st )->u.eval.close_paren                     ) && \
5634     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5635 )
5636
5637 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5638 (                                                           \
5639     (   ( st )                                         ) && \
5640     (   ( st )->u.eval.close_paren                     ) && \
5641     (   ( expr )                                       ) && \
5642     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5643 )
5644
5645
5646 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5647     (st)->u.eval.close_paren = ( (expr) + 1 )
5648
5649 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5650     (st)->u.eval.close_paren = 0
5651
5652 /* returns -1 on failure, $+[0] on success */
5653 STATIC SSize_t
5654 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5655 {
5656     dVAR;
5657     const bool utf8_target = reginfo->is_utf8_target;
5658     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5659     REGEXP *rex_sv = reginfo->prog;
5660     regexp *rex = ReANY(rex_sv);
5661     RXi_GET_DECL(rex,rexi);
5662     /* the current state. This is a cached copy of PL_regmatch_state */
5663     regmatch_state *st;
5664     /* cache heavy used fields of st in registers */
5665     regnode *scan;
5666     regnode *next;
5667     U32 n = 0;  /* general value; init to avoid compiler warning */
5668     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5669     SSize_t endref = 0; /* offset of end of backref when ln is start */
5670     char *locinput = startpos;
5671     char *pushinput; /* where to continue after a PUSH */
5672     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5673
5674     bool result = 0;        /* return value of S_regmatch */
5675     U32 depth = 0;            /* depth of backtrack stack */
5676     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5677     const U32 max_nochange_depth =
5678         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5679         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5680     regmatch_state *yes_state = NULL; /* state to pop to on success of
5681                                                             subpattern */
5682     /* mark_state piggy backs on the yes_state logic so that when we unwind
5683        the stack on success we can update the mark_state as we go */
5684     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5685     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5686     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5687     U32 state_num;
5688     bool no_final = 0;      /* prevent failure from backtracking? */
5689     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5690     char *startpoint = locinput;
5691     SV *popmark = NULL;     /* are we looking for a mark? */
5692     SV *sv_commit = NULL;   /* last mark name seen in failure */
5693     SV *sv_yes_mark = NULL; /* last mark name we have seen
5694                                during a successful match */
5695     U32 lastopen = 0;       /* last open we saw */
5696     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5697     SV* const oreplsv = GvSVn(PL_replgv);
5698     /* these three flags are set by various ops to signal information to
5699      * the very next op. They have a useful lifetime of exactly one loop
5700      * iteration, and are not preserved or restored by state pushes/pops
5701      */
5702     bool sw = 0;            /* the condition value in (?(cond)a|b) */
5703     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
5704     int logical = 0;        /* the following EVAL is:
5705                                 0: (?{...})
5706                                 1: (?(?{...})X|Y)
5707                                 2: (??{...})
5708                                or the following IFMATCH/UNLESSM is:
5709                                 false: plain (?=foo)
5710                                 true:  used as a condition: (?(?=foo))
5711                             */
5712     PAD* last_pad = NULL;
5713     dMULTICALL;
5714     U8 gimme = G_SCALAR;
5715     CV *caller_cv = NULL;       /* who called us */
5716     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
5717     U32 maxopenparen = 0;       /* max '(' index seen so far */
5718     int to_complement;  /* Invert the result? */
5719     _char_class_number classnum;
5720     bool is_utf8_pat = reginfo->is_utf8_pat;
5721     bool match = FALSE;
5722     I32 orig_savestack_ix = PL_savestack_ix;
5723     U8 * script_run_begin = NULL;
5724
5725 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5726 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5727 #  define SOLARIS_BAD_OPTIMIZER
5728     const U32 *pl_charclass_dup = PL_charclass;
5729 #  define PL_charclass pl_charclass_dup
5730 #endif
5731
5732 #ifdef DEBUGGING
5733     GET_RE_DEBUG_FLAGS_DECL;
5734 #endif
5735
5736     /* protect against undef(*^R) */
5737     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5738
5739     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5740     multicall_oldcatch = 0;
5741     PERL_UNUSED_VAR(multicall_cop);
5742
5743     PERL_ARGS_ASSERT_REGMATCH;
5744
5745     st = PL_regmatch_state;
5746
5747     /* Note that nextchr is a byte even in UTF */
5748     SET_nextchr;
5749     scan = prog;
5750
5751     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5752             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5753             Perl_re_printf( aTHX_ "regmatch start\n" );
5754     }));
5755
5756     while (scan != NULL) {
5757         next = scan + NEXT_OFF(scan);
5758         if (next == scan)
5759             next = NULL;
5760         state_num = OP(scan);
5761
5762       reenter_switch:
5763         DEBUG_EXECUTE_r(
5764             if (state_num <= REGNODE_MAX) {
5765                 SV * const prop = sv_newmortal();
5766                 regnode *rnext = regnext(scan);
5767
5768                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5769                 regprop(rex, prop, scan, reginfo, NULL);
5770                 Perl_re_printf( aTHX_
5771                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5772                     INDENT_CHARS(depth), "",
5773                     (IV)(scan - rexi->program),
5774                     SvPVX_const(prop),
5775                     (PL_regkind[OP(scan)] == END || !rnext) ?
5776                         0 : (IV)(rnext - rexi->program));
5777             }
5778         );
5779
5780         to_complement = 0;
5781
5782         SET_nextchr;
5783         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5784
5785         switch (state_num) {
5786         case SBOL: /*  /^../ and /\A../  */
5787             if (locinput == reginfo->strbeg)
5788                 break;
5789             sayNO;
5790
5791         case MBOL: /*  /^../m  */
5792             if (locinput == reginfo->strbeg ||
5793                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5794             {
5795                 break;
5796             }
5797             sayNO;
5798
5799         case GPOS: /*  \G  */
5800             if (locinput == reginfo->ganch)
5801                 break;
5802             sayNO;
5803
5804         case KEEPS: /*   \K  */
5805             /* update the startpoint */
5806             st->u.keeper.val = rex->offs[0].start;
5807             rex->offs[0].start = locinput - reginfo->strbeg;
5808             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5809             NOT_REACHED; /* NOTREACHED */
5810
5811         case KEEPS_next_fail:
5812             /* rollback the start point change */
5813             rex->offs[0].start = st->u.keeper.val;
5814             sayNO_SILENT;
5815             NOT_REACHED; /* NOTREACHED */
5816
5817         case MEOL: /* /..$/m  */
5818             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5819                 sayNO;
5820             break;
5821
5822         case SEOL: /* /..$/  */
5823             if (!NEXTCHR_IS_EOS && nextchr != '\n')
5824                 sayNO;
5825             if (reginfo->strend - locinput > 1)
5826                 sayNO;
5827             break;
5828
5829         case EOS: /*  \z  */
5830             if (!NEXTCHR_IS_EOS)
5831                 sayNO;
5832             break;
5833
5834         case SANY: /*  /./s  */
5835             if (NEXTCHR_IS_EOS)
5836                 sayNO;
5837             goto increment_locinput;
5838
5839         case REG_ANY: /*  /./  */
5840             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5841                 sayNO;
5842             goto increment_locinput;
5843
5844
5845 #undef  ST
5846 #define ST st->u.trie
5847         case TRIEC: /* (ab|cd) with known charclass */
5848             /* In this case the charclass data is available inline so
5849                we can fail fast without a lot of extra overhead.
5850              */
5851             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5852                 DEBUG_EXECUTE_r(
5853                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5854                               depth, PL_colors[4], PL_colors[5])
5855                 );
5856                 sayNO_SILENT;
5857                 NOT_REACHED; /* NOTREACHED */
5858             }
5859             /* FALLTHROUGH */
5860         case TRIE:  /* (ab|cd)  */
5861             /* the basic plan of execution of the trie is:
5862              * At the beginning, run though all the states, and
5863              * find the longest-matching word. Also remember the position
5864              * of the shortest matching word. For example, this pattern:
5865              *    1  2 3 4    5
5866              *    ab|a|x|abcd|abc
5867              * when matched against the string "abcde", will generate
5868              * accept states for all words except 3, with the longest
5869              * matching word being 4, and the shortest being 2 (with
5870              * the position being after char 1 of the string).
5871              *
5872              * Then for each matching word, in word order (i.e. 1,2,4,5),
5873              * we run the remainder of the pattern; on each try setting
5874              * the current position to the character following the word,
5875              * returning to try the next word on failure.
5876              *
5877              * We avoid having to build a list of words at runtime by
5878              * using a compile-time structure, wordinfo[].prev, which
5879              * gives, for each word, the previous accepting word (if any).
5880              * In the case above it would contain the mappings 1->2, 2->0,
5881              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5882              * the longest word (4 above), a list of all words, by
5883              * following the list of prev pointers; this gives us the
5884              * unordered list 4,5,1,2. Then given the current word we have
5885              * just tried, we can go through the list and find the
5886              * next-biggest word to try (so if we just failed on word 2,
5887              * the next in the list is 4).
5888              *
5889              * Since at runtime we don't record the matching position in
5890              * the string for each word, we have to work that out for
5891              * each word we're about to process. The wordinfo table holds
5892              * the character length of each word; given that we recorded
5893              * at the start: the position of the shortest word and its
5894              * length in chars, we just need to move the pointer the
5895              * difference between the two char lengths. Depending on
5896              * Unicode status and folding, that's cheap or expensive.
5897              *
5898              * This algorithm is optimised for the case where are only a
5899              * small number of accept states, i.e. 0,1, or maybe 2.
5900              * With lots of accepts states, and having to try all of them,
5901              * it becomes quadratic on number of accept states to find all
5902              * the next words.
5903              */
5904
5905             {
5906                 /* what type of TRIE am I? (utf8 makes this contextual) */
5907                 DECL_TRIE_TYPE(scan);
5908
5909                 /* what trie are we using right now */
5910                 reg_trie_data * const trie
5911                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5912                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5913                 U32 state = trie->startstate;
5914
5915                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5916                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5917                     if (utf8_target
5918                         && ! NEXTCHR_IS_EOS
5919                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5920                         && scan->flags == EXACTL)
5921                     {
5922                         /* We only output for EXACTL, as we let the folder
5923                          * output this message for EXACTFLU8 to avoid
5924                          * duplication */
5925                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5926                                                                reginfo->strend);
5927                     }
5928                 }
5929                 if (   trie->bitmap
5930                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5931                 {
5932                     if (trie->states[ state ].wordnum) {
5933                          DEBUG_EXECUTE_r(
5934                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
5935                                           depth, PL_colors[4], PL_colors[5])
5936                         );
5937                         if (!trie->jump)
5938                             break;
5939                     } else {
5940                         DEBUG_EXECUTE_r(
5941                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5942                                           depth, PL_colors[4], PL_colors[5])
5943                         );
5944                         sayNO_SILENT;
5945                    }
5946                 }
5947
5948             {
5949                 U8 *uc = ( U8* )locinput;
5950
5951                 STRLEN len = 0;
5952                 STRLEN foldlen = 0;
5953                 U8 *uscan = (U8*)NULL;
5954                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5955                 U32 charcount = 0; /* how many input chars we have matched */
5956                 U32 accepted = 0; /* have we seen any accepting states? */
5957
5958                 ST.jump = trie->jump;
5959                 ST.me = scan;
5960                 ST.firstpos = NULL;
5961                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5962                 ST.nextword = 0;
5963
5964                 /* fully traverse the TRIE; note the position of the
5965                    shortest accept state and the wordnum of the longest
5966                    accept state */
5967
5968                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5969                     U32 base = trie->states[ state ].trans.base;
5970                     UV uvc = 0;
5971                     U16 charid = 0;
5972                     U16 wordnum;
5973                     wordnum = trie->states[ state ].wordnum;
5974
5975                     if (wordnum) { /* it's an accept state */
5976                         if (!accepted) {
5977                             accepted = 1;
5978                             /* record first match position */
5979                             if (ST.longfold) {
5980                                 ST.firstpos = (U8*)locinput;
5981                                 ST.firstchars = 0;
5982                             }
5983                             else {
5984                                 ST.firstpos = uc;
5985                                 ST.firstchars = charcount;
5986                             }
5987                         }
5988                         if (!ST.nextword || wordnum < ST.nextword)
5989                             ST.nextword = wordnum;
5990                         ST.topword = wordnum;
5991                     }
5992
5993                     DEBUG_TRIE_EXECUTE_r({
5994                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5995                                 /* HERE */
5996                                 PerlIO_printf( Perl_debug_log,
5997                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
5998                                     INDENT_CHARS(depth), "", PL_colors[4],
5999                                     (UV)state, (accepted ? 'Y' : 'N'));
6000                     });
6001
6002                     /* read a char and goto next state */
6003                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6004                         I32 offset;
6005                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6006                                              uscan, len, uvc, charid, foldlen,
6007                                              foldbuf, uniflags);
6008                         charcount++;
6009                         if (foldlen>0)
6010                             ST.longfold = TRUE;
6011                         if (charid &&
6012                              ( ((offset =
6013                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6014
6015                              && ((U32)offset < trie->lasttrans)
6016                              && trie->trans[offset].check == state)
6017                         {
6018                             state = trie->trans[offset].next;
6019                         }
6020                         else {
6021                             state = 0;
6022                         }
6023                         uc += len;
6024
6025                     }
6026                     else {
6027                         state = 0;
6028                     }
6029                     DEBUG_TRIE_EXECUTE_r(
6030                         Perl_re_printf( aTHX_
6031                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6032                             charid, uvc, (UV)state, PL_colors[5] );
6033                     );
6034                 }
6035                 if (!accepted)
6036                    sayNO;
6037
6038                 /* calculate total number of accept states */
6039                 {
6040                     U16 w = ST.topword;
6041                     accepted = 0;
6042                     while (w) {
6043                         w = trie->wordinfo[w].prev;
6044                         accepted++;
6045                     }
6046                     ST.accepted = accepted;
6047                 }
6048
6049                 DEBUG_EXECUTE_r(
6050                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6051                         depth,
6052                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6053                 );
6054                 goto trie_first_try; /* jump into the fail handler */
6055             }}
6056             NOT_REACHED; /* NOTREACHED */
6057
6058         case TRIE_next_fail: /* we failed - try next alternative */
6059         {
6060             U8 *uc;
6061             if ( ST.jump ) {
6062                 /* undo any captures done in the tail part of a branch,
6063                  * e.g.
6064                  *    /(?:X(.)(.)|Y(.)).../
6065                  * where the trie just matches X then calls out to do the
6066                  * rest of the branch */
6067                 REGCP_UNWIND(ST.cp);
6068                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6069             }
6070             if (!--ST.accepted) {
6071                 DEBUG_EXECUTE_r({
6072                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6073                         depth,
6074                         PL_colors[4],
6075                         PL_colors[5] );
6076                 });
6077                 sayNO_SILENT;
6078             }
6079             {
6080                 /* Find next-highest word to process.  Note that this code
6081                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6082                 U16 min = 0;
6083                 U16 word;
6084                 U16 const nextword = ST.nextword;
6085                 reg_trie_wordinfo * const wordinfo
6086                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6087                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6088                     if (word > nextword && (!min || word < min))
6089                         min = word;
6090                 }
6091                 ST.nextword = min;
6092             }
6093
6094           trie_first_try:
6095             if (do_cutgroup) {
6096                 do_cutgroup = 0;
6097                 no_final = 0;
6098             }
6099
6100             if ( ST.jump ) {
6101                 ST.lastparen = rex->lastparen;
6102                 ST.lastcloseparen = rex->lastcloseparen;
6103                 REGCP_SET(ST.cp);
6104             }
6105
6106             /* find start char of end of current word */
6107             {
6108                 U32 chars; /* how many chars to skip */
6109                 reg_trie_data * const trie
6110                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6111
6112                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6113                             >=  ST.firstchars);
6114                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6115                             - ST.firstchars;
6116                 uc = ST.firstpos;
6117
6118                 if (ST.longfold) {
6119                     /* the hard option - fold each char in turn and find
6120                      * its folded length (which may be different */
6121                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6122                     STRLEN foldlen;
6123                     STRLEN len;
6124                     UV uvc;
6125                     U8 *uscan;
6126
6127                     while (chars) {
6128                         if (utf8_target) {
6129                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6130                                                     uniflags);
6131                             uc += len;
6132                         }
6133                         else {
6134                             uvc = *uc;
6135                             uc++;
6136                         }
6137                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6138                         uscan = foldbuf;
6139                         while (foldlen) {
6140                             if (!--chars)
6141                                 break;
6142                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
6143                                             uniflags);
6144                             uscan += len;
6145                             foldlen -= len;
6146                         }
6147                     }
6148                 }
6149                 else {
6150                     if (utf8_target)
6151                         while (chars--)
6152                             uc += UTF8SKIP(uc);
6153                     else
6154                         uc += chars;
6155                 }
6156             }
6157
6158             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6159                             ? ST.jump[ST.nextword]
6160                             : NEXT_OFF(ST.me));
6161
6162             DEBUG_EXECUTE_r({
6163                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6164                     depth,
6165                     PL_colors[4],
6166                     ST.nextword,
6167                     PL_colors[5]
6168                     );
6169             });
6170
6171             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6172                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6173                 NOT_REACHED; /* NOTREACHED */
6174             }
6175             /* only one choice left - just continue */
6176             DEBUG_EXECUTE_r({
6177                 AV *const trie_words
6178                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6179                 SV ** const tmp = trie_words
6180                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6181                 SV *sv= tmp ? sv_newmortal() : NULL;
6182
6183                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6184                     depth, PL_colors[4],
6185                     ST.nextword,
6186                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6187                             PL_colors[0], PL_colors[1],
6188                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6189                         )
6190                     : "not compiled under -Dr",
6191                     PL_colors[5] );
6192             });
6193
6194             locinput = (char*)uc;
6195             continue; /* execute rest of RE */
6196             /* NOTREACHED */
6197         }
6198 #undef  ST
6199
6200         case EXACTL:             /*  /abc/l       */
6201             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6202
6203             /* Complete checking would involve going through every character
6204              * matched by the string to see if any is above latin1.  But the
6205              * comparision otherwise might very well be a fast assembly
6206              * language routine, and I (khw) don't think slowing things down
6207              * just to check for this warning is worth it.  So this just checks
6208              * the first character */
6209             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6210                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6211             }
6212             /* FALLTHROUGH */
6213         case EXACT: {            /*  /abc/        */
6214             char *s = STRING(scan);
6215             ln = STR_LEN(scan);
6216             if (utf8_target != is_utf8_pat) {
6217                 /* The target and the pattern have differing utf8ness. */
6218                 char *l = locinput;
6219                 const char * const e = s + ln;
6220
6221                 if (utf8_target) {
6222                     /* The target is utf8, the pattern is not utf8.
6223                      * Above-Latin1 code points can't match the pattern;
6224                      * invariants match exactly, and the other Latin1 ones need
6225                      * to be downgraded to a single byte in order to do the
6226                      * comparison.  (If we could be confident that the target
6227                      * is not malformed, this could be refactored to have fewer
6228                      * tests by just assuming that if the first bytes match, it
6229                      * is an invariant, but there are tests in the test suite
6230                      * dealing with (??{...}) which violate this) */
6231                     while (s < e) {
6232                         if (l >= reginfo->strend
6233                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6234                         {
6235                             sayNO;
6236                         }
6237                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6238                             if (*l != *s) {
6239                                 sayNO;
6240                             }
6241                             l++;
6242                         }
6243                         else {
6244                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6245                             {
6246                                 sayNO;
6247                             }
6248                             l += 2;
6249                         }
6250                         s++;
6251                     }
6252                 }
6253                 else {
6254                     /* The target is not utf8, the pattern is utf8. */
6255                     while (s < e) {
6256                         if (l >= reginfo->strend
6257                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6258                         {
6259                             sayNO;
6260                         }
6261                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6262                             if (*s != *l) {
6263                                 sayNO;
6264                             }
6265                             s++;
6266                         }
6267                         else {
6268                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6269                             {
6270                                 sayNO;
6271                             }
6272                             s += 2;
6273                         }
6274                         l++;
6275                     }
6276                 }
6277                 locinput = l;
6278             }
6279             else {
6280                 /* The target and the pattern have the same utf8ness. */
6281                 /* Inline the first character, for speed. */
6282                 if (reginfo->strend - locinput < ln
6283                     || UCHARAT(s) != nextchr
6284                     || (ln > 1 && memNE(s, locinput, ln)))
6285                 {
6286                     sayNO;
6287                 }
6288                 locinput += ln;
6289             }
6290             break;
6291             }
6292
6293         case EXACTFL: {          /*  /abc/il      */
6294             re_fold_t folder;
6295             const U8 * fold_array;
6296             const char * s;
6297             U32 fold_utf8_flags;
6298
6299             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6300             folder = foldEQ_locale;
6301             fold_array = PL_fold_locale;
6302             fold_utf8_flags = FOLDEQ_LOCALE;
6303             goto do_exactf;
6304
6305         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6306                                       is effectively /u; hence to match, target
6307                                       must be UTF-8. */
6308             if (! utf8_target) {
6309                 sayNO;
6310             }
6311             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6312                                              | FOLDEQ_S1_FOLDS_SANE;
6313             folder = foldEQ_latin1;
6314             fold_array = PL_fold_latin1;
6315             goto do_exactf;
6316
6317         case EXACTFU_SS:         /*  /\x{df}/iu   */
6318         case EXACTFU:            /*  /abc/iu      */
6319             folder = foldEQ_latin1;
6320             fold_array = PL_fold_latin1;
6321             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6322             goto do_exactf;
6323
6324         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6325                                    patterns */
6326             assert(! is_utf8_pat);
6327             /* FALLTHROUGH */
6328         case EXACTFAA:            /*  /abc/iaa     */
6329             folder = foldEQ_latin1;
6330             fold_array = PL_fold_latin1;
6331             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6332             goto do_exactf;
6333
6334         case EXACTF:             /*  /abc/i    This node only generated for
6335                                                non-utf8 patterns */
6336             assert(! is_utf8_pat);
6337             folder = foldEQ;
6338             fold_array = PL_fold;
6339             fold_utf8_flags = 0;
6340
6341           do_exactf:
6342             s = STRING(scan);
6343             ln = STR_LEN(scan);
6344
6345             if (utf8_target
6346                 || is_utf8_pat
6347                 || state_num == EXACTFU_SS
6348                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6349             {
6350               /* Either target or the pattern are utf8, or has the issue where
6351                * the fold lengths may differ. */
6352                 const char * const l = locinput;
6353                 char *e = reginfo->strend;
6354
6355                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
6356                                         l, &e, 0,  utf8_target, fold_utf8_flags))
6357                 {
6358                     sayNO;
6359                 }
6360                 locinput = e;
6361                 break;
6362             }
6363
6364             /* Neither the target nor the pattern are utf8 */
6365             if (UCHARAT(s) != nextchr
6366                 && !NEXTCHR_IS_EOS
6367                 && UCHARAT(s) != fold_array[nextchr])
6368             {
6369                 sayNO;
6370             }
6371             if (reginfo->strend - locinput < ln)
6372                 sayNO;
6373             if (ln > 1 && ! folder(s, locinput, ln))
6374                 sayNO;
6375             locinput += ln;
6376             break;
6377         }
6378
6379         case NBOUNDL: /*  /\B/l  */
6380             to_complement = 1;
6381             /* FALLTHROUGH */
6382
6383         case BOUNDL:  /*  /\b/l  */
6384         {
6385             bool b1, b2;
6386             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6387
6388             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6389                 if (! IN_UTF8_CTYPE_LOCALE) {
6390                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6391                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6392                 }
6393                 goto boundu;
6394             }
6395
6396             if (utf8_target) {
6397                 if (locinput == reginfo->strbeg)
6398                     b1 = isWORDCHAR_LC('\n');
6399                 else {
6400                     b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6401                                                         (U8*)(reginfo->strbeg)),
6402                                                  (U8*)(reginfo->strend));
6403                 }
6404                 b2 = (NEXTCHR_IS_EOS)
6405                     ? isWORDCHAR_LC('\n')
6406                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6407                                               (U8*) reginfo->strend);
6408             }
6409             else { /* Here the string isn't utf8 */
6410                 b1 = (locinput == reginfo->strbeg)
6411                      ? isWORDCHAR_LC('\n')
6412                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6413                 b2 = (NEXTCHR_IS_EOS)
6414                     ? isWORDCHAR_LC('\n')
6415                     : isWORDCHAR_LC(nextchr);
6416             }
6417             if (to_complement ^ (b1 == b2)) {
6418                 sayNO;
6419             }
6420             break;
6421         }
6422
6423         case NBOUND:  /*  /\B/   */
6424             to_complement = 1;
6425             /* FALLTHROUGH */
6426
6427         case BOUND:   /*  /\b/   */
6428             if (utf8_target) {
6429                 goto bound_utf8;
6430             }
6431             goto bound_ascii_match_only;
6432
6433         case NBOUNDA: /*  /\B/a  */
6434             to_complement = 1;
6435             /* FALLTHROUGH */
6436
6437         case BOUNDA:  /*  /\b/a  */
6438         {
6439             bool b1, b2;
6440
6441           bound_ascii_match_only:
6442             /* Here the string isn't utf8, or is utf8 and only ascii characters
6443              * are to match \w.  In the latter case looking at the byte just
6444              * prior to the current one may be just the final byte of a
6445              * multi-byte character.  This is ok.  There are two cases:
6446              * 1) it is a single byte character, and then the test is doing
6447              *    just what it's supposed to.
6448              * 2) it is a multi-byte character, in which case the final byte is
6449              *    never mistakable for ASCII, and so the test will say it is
6450              *    not a word character, which is the correct answer. */
6451             b1 = (locinput == reginfo->strbeg)
6452                  ? isWORDCHAR_A('\n')
6453                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6454             b2 = (NEXTCHR_IS_EOS)
6455                 ? isWORDCHAR_A('\n')
6456                 : isWORDCHAR_A(nextchr);
6457             if (to_complement ^ (b1 == b2)) {
6458                 sayNO;
6459             }
6460             break;
6461         }
6462
6463         case NBOUNDU: /*  /\B/u  */
6464             to_complement = 1;
6465             /* FALLTHROUGH */
6466
6467         case BOUNDU:  /*  /\b/u  */
6468
6469           boundu:
6470             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6471                 match = FALSE;
6472             }
6473             else if (utf8_target) {
6474               bound_utf8:
6475                 switch((bound_type) FLAGS(scan)) {
6476                     case TRADITIONAL_BOUND:
6477                     {
6478                         bool b1, b2;
6479                         b1 = (locinput == reginfo->strbeg)
6480                              ? 0 /* isWORDCHAR_L1('\n') */
6481                              : isWORDCHAR_utf8_safe(
6482                                                reghop3((U8*)locinput,
6483                                                        -1,
6484                                                        (U8*)(reginfo->strbeg)),
6485                                                     (U8*) reginfo->strend);
6486                         b2 = (NEXTCHR_IS_EOS)
6487                             ? 0 /* isWORDCHAR_L1('\n') */
6488                             : isWORDCHAR_utf8_safe((U8*)locinput,
6489                                                    (U8*) reginfo->strend);
6490                         match = cBOOL(b1 != b2);
6491                         break;
6492                     }
6493                     case GCB_BOUND:
6494                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6495                             match = TRUE; /* GCB always matches at begin and
6496                                              end */
6497                         }
6498                         else {
6499                             /* Find the gcb values of previous and current
6500                              * chars, then see if is a break point */
6501                             match = isGCB(getGCB_VAL_UTF8(
6502                                                 reghop3((U8*)locinput,
6503                                                         -1,
6504                                                         (U8*)(reginfo->strbeg)),
6505                                                 (U8*) reginfo->strend),
6506                                           getGCB_VAL_UTF8((U8*) locinput,
6507                                                         (U8*) reginfo->strend),
6508                                           (U8*) reginfo->strbeg,
6509                                           (U8*) locinput,
6510                                           utf8_target);
6511                         }
6512                         break;
6513
6514                     case LB_BOUND:
6515                         if (locinput == reginfo->strbeg) {
6516                             match = FALSE;
6517                         }
6518                         else if (NEXTCHR_IS_EOS) {
6519                             match = TRUE;
6520                         }
6521                         else {
6522                             match = isLB(getLB_VAL_UTF8(
6523                                                 reghop3((U8*)locinput,
6524                                                         -1,
6525                                                         (U8*)(reginfo->strbeg)),
6526                                                 (U8*) reginfo->strend),
6527                                           getLB_VAL_UTF8((U8*) locinput,
6528                                                         (U8*) reginfo->strend),
6529                                           (U8*) reginfo->strbeg,
6530                                           (U8*) locinput,
6531                                           (U8*) reginfo->strend,
6532                                           utf8_target);
6533                         }
6534                         break;
6535
6536                     case SB_BOUND: /* Always matches at begin and end */
6537                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6538                             match = TRUE;
6539                         }
6540                         else {
6541                             match = isSB(getSB_VAL_UTF8(
6542                                                 reghop3((U8*)locinput,
6543                                                         -1,
6544                                                         (U8*)(reginfo->strbeg)),
6545                                                 (U8*) reginfo->strend),
6546                                           getSB_VAL_UTF8((U8*) locinput,
6547                                                         (U8*) reginfo->strend),
6548                                           (U8*) reginfo->strbeg,
6549                                           (U8*) locinput,
6550                                           (U8*) reginfo->strend,
6551                                           utf8_target);
6552                         }
6553                         break;
6554
6555                     case WB_BOUND:
6556                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6557                             match = TRUE;
6558                         }
6559                         else {
6560                             match = isWB(WB_UNKNOWN,
6561                                          getWB_VAL_UTF8(
6562                                                 reghop3((U8*)locinput,
6563                                                         -1,
6564                                                         (U8*)(reginfo->strbeg)),
6565                                                 (U8*) reginfo->strend),
6566                                           getWB_VAL_UTF8((U8*) locinput,
6567                                                         (U8*) reginfo->strend),
6568                                           (U8*) reginfo->strbeg,
6569                                           (U8*) locinput,
6570                                           (U8*) reginfo->strend,
6571                                           utf8_target);
6572                         }
6573                         break;
6574                 }
6575             }
6576             else {  /* Not utf8 target */
6577                 switch((bound_type) FLAGS(scan)) {
6578                     case TRADITIONAL_BOUND:
6579                     {
6580                         bool b1, b2;
6581                         b1 = (locinput == reginfo->strbeg)
6582                             ? 0 /* isWORDCHAR_L1('\n') */
6583                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6584                         b2 = (NEXTCHR_IS_EOS)
6585                             ? 0 /* isWORDCHAR_L1('\n') */
6586                             : isWORDCHAR_L1(nextchr);
6587                         match = cBOOL(b1 != b2);
6588                         break;
6589                     }
6590
6591                     case GCB_BOUND:
6592                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6593                             match = TRUE; /* GCB always matches at begin and
6594                                              end */
6595                         }
6596                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6597                                    range */
6598                             match =    UCHARAT(locinput - 1) != '\r'
6599                                     || UCHARAT(locinput) != '\n';
6600                         }
6601                         break;
6602
6603                     case LB_BOUND:
6604                         if (locinput == reginfo->strbeg) {
6605                             match = FALSE;
6606                         }
6607                         else if (NEXTCHR_IS_EOS) {
6608                             match = TRUE;
6609                         }
6610                         else {
6611                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6612                                          getLB_VAL_CP(UCHARAT(locinput)),
6613                                          (U8*) reginfo->strbeg,
6614                                          (U8*) locinput,
6615                                          (U8*) reginfo->strend,
6616                                          utf8_target);
6617                         }
6618                         break;
6619
6620                     case SB_BOUND: /* Always matches at begin and end */
6621                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6622                             match = TRUE;
6623                         }
6624                         else {
6625                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6626                                          getSB_VAL_CP(UCHARAT(locinput)),
6627                                          (U8*) reginfo->strbeg,
6628                                          (U8*) locinput,
6629                                          (U8*) reginfo->strend,
6630                                          utf8_target);
6631                         }
6632                         break;
6633
6634                     case WB_BOUND:
6635                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6636                             match = TRUE;
6637                         }
6638                         else {
6639                             match = isWB(WB_UNKNOWN,
6640                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6641                                          getWB_VAL_CP(UCHARAT(locinput)),
6642                                          (U8*) reginfo->strbeg,
6643                                          (U8*) locinput,
6644                                          (U8*) reginfo->strend,
6645                                          utf8_target);
6646                         }
6647                         break;
6648                 }
6649             }
6650
6651             if (to_complement ^ ! match) {
6652                 sayNO;
6653             }
6654             break;
6655
6656         case ANYOFL:  /*  /[abc]/l      */
6657             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6658
6659             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6660             {
6661               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6662             }
6663             /* FALLTHROUGH */
6664         case ANYOFD:  /*   /[abc]/d       */
6665         case ANYOF:  /*   /[abc]/       */
6666             if (NEXTCHR_IS_EOS)
6667                 sayNO;
6668             if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6669                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6670                                                                    utf8_target))
6671                     sayNO;
6672                 locinput += UTF8SKIP(locinput);
6673             }
6674             else {
6675                 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6676                     sayNO;
6677                 locinput++;
6678             }
6679             break;
6680
6681         case ANYOFM:
6682             if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6683                 sayNO;
6684             }
6685             locinput++;
6686             break;
6687
6688         case ASCII:
6689             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6690                 sayNO;
6691             }
6692
6693             locinput++;     /* ASCII is always single byte */
6694             break;
6695
6696         case NASCII:
6697             if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6698                 sayNO;
6699             }
6700
6701             goto increment_locinput;
6702             break;
6703
6704         /* The argument (FLAGS) to all the POSIX node types is the class number
6705          * */
6706
6707         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6708             to_complement = 1;
6709             /* FALLTHROUGH */
6710
6711         case POSIXL:    /* \w or [:punct:] etc. under /l */
6712             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6713             if (NEXTCHR_IS_EOS)
6714                 sayNO;
6715
6716             /* Use isFOO_lc() for characters within Latin1.  (Note that
6717              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6718              * wouldn't be invariant) */
6719             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6720                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6721                     sayNO;
6722                 }
6723
6724                 locinput++;
6725                 break;
6726             }
6727
6728             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6729                 /* An above Latin-1 code point, or malformed */
6730                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6731                                                        reginfo->strend);
6732                 goto utf8_posix_above_latin1;
6733             }
6734
6735             /* Here is a UTF-8 variant code point below 256 and the target is
6736              * UTF-8 */
6737             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6738                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6739                                             *(locinput + 1))))))
6740             {
6741                 sayNO;
6742             }
6743
6744             goto increment_locinput;
6745
6746         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
6747             to_complement = 1;
6748             /* FALLTHROUGH */
6749
6750         case POSIXD:    /* \w or [:punct:] etc. under /d */
6751             if (utf8_target) {
6752                 goto utf8_posix;
6753             }
6754             goto posixa;
6755
6756         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
6757
6758             if (NEXTCHR_IS_EOS) {
6759                 sayNO;
6760             }
6761
6762             /* All UTF-8 variants match */
6763             if (! UTF8_IS_INVARIANT(nextchr)) {
6764                 goto increment_locinput;
6765             }
6766
6767             to_complement = 1;
6768             goto join_nposixa;
6769
6770         case POSIXA:    /* \w or [:punct:] etc. under /a */
6771
6772           posixa:
6773             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6774              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6775              * character is a single byte */
6776
6777             if (NEXTCHR_IS_EOS) {
6778                 sayNO;
6779             }
6780
6781           join_nposixa:
6782
6783             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6784                                                                 FLAGS(scan)))))
6785             {
6786                 sayNO;
6787             }
6788
6789             /* Here we are either not in utf8, or we matched a utf8-invariant,
6790              * so the next char is the next byte */
6791             locinput++;
6792             break;
6793
6794         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
6795             to_complement = 1;
6796             /* FALLTHROUGH */
6797
6798         case POSIXU:    /* \w or [:punct:] etc. under /u */
6799           utf8_posix:
6800             if (NEXTCHR_IS_EOS) {
6801                 sayNO;
6802             }
6803
6804             /* Use _generic_isCC() for characters within Latin1.  (Note that
6805              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6806              * wouldn't be invariant) */
6807             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6808                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6809                                                            FLAGS(scan)))))
6810                 {
6811                     sayNO;
6812                 }
6813                 locinput++;
6814             }
6815             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6816                 if (! (to_complement
6817                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6818                                                                *(locinput + 1)),
6819                                              FLAGS(scan)))))
6820                 {
6821                     sayNO;
6822                 }
6823                 locinput += 2;
6824             }
6825             else {  /* Handle above Latin-1 code points */
6826               utf8_posix_above_latin1:
6827                 classnum = (_char_class_number) FLAGS(scan);
6828                 switch (classnum) {
6829                     default:
6830                         if (! (to_complement
6831                            ^ cBOOL(_invlist_contains_cp(
6832                                       PL_XPosix_ptrs[classnum],
6833                                       utf8_to_uvchr_buf((U8 *) locinput,
6834                                                         (U8 *) reginfo->strend,
6835                                                         NULL)))))
6836                         {
6837                             sayNO;
6838                         }
6839                         break;
6840                     case _CC_ENUM_SPACE:
6841                         if (! (to_complement
6842                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
6843                         {
6844                             sayNO;
6845                         }
6846                         break;
6847                     case _CC_ENUM_BLANK:
6848                         if (! (to_complement
6849                                         ^ cBOOL(is_HORIZWS_high(locinput))))
6850                         {
6851                             sayNO;
6852                         }
6853                         break;
6854                     case _CC_ENUM_XDIGIT:
6855                         if (! (to_complement
6856                                         ^ cBOOL(is_XDIGIT_high(locinput))))
6857                         {
6858                             sayNO;
6859                         }
6860                         break;
6861                     case _CC_ENUM_VERTSPACE:
6862                         if (! (to_complement
6863                                         ^ cBOOL(is_VERTWS_high(locinput))))
6864                         {
6865                             sayNO;
6866                         }
6867                         break;
6868                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
6869                     case _CC_ENUM_ASCII:
6870                         if (! to_complement) {
6871                             sayNO;
6872                         }
6873                         break;
6874                 }
6875                 locinput += UTF8SKIP(locinput);
6876             }
6877             break;
6878
6879         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
6880                        a Unicode extended Grapheme Cluster */
6881             if (NEXTCHR_IS_EOS)
6882                 sayNO;
6883             if  (! utf8_target) {
6884
6885                 /* Match either CR LF  or '.', as all the other possibilities
6886                  * require utf8 */
6887                 locinput++;         /* Match the . or CR */
6888                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6889                                        match the LF */
6890                     && locinput < reginfo->strend
6891                     && UCHARAT(locinput) == '\n')
6892                 {
6893                     locinput++;
6894                 }
6895             }
6896             else {
6897
6898                 /* Get the gcb type for the current character */
6899                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6900                                                        (U8*) reginfo->strend);
6901
6902                 /* Then scan through the input until we get to the first
6903                  * character whose type is supposed to be a gcb with the
6904                  * current character.  (There is always a break at the
6905                  * end-of-input) */
6906                 locinput += UTF8SKIP(locinput);
6907                 while (locinput < reginfo->strend) {
6908                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6909                                                          (U8*) reginfo->strend);
6910                     if (isGCB(prev_gcb, cur_gcb,
6911                               (U8*) reginfo->strbeg, (U8*) locinput,
6912                               utf8_target))
6913                     {
6914                         break;
6915                     }
6916
6917                     prev_gcb = cur_gcb;
6918                     locinput += UTF8SKIP(locinput);
6919                 }
6920
6921
6922             }
6923             break;
6924
6925         case NREFFL:  /*  /\g{name}/il  */
6926         {   /* The capture buffer cases.  The ones beginning with N for the
6927                named buffers just convert to the equivalent numbered and
6928                pretend they were called as the corresponding numbered buffer
6929                op.  */
6930             /* don't initialize these in the declaration, it makes C++
6931                unhappy */
6932             const char *s;
6933             char type;
6934             re_fold_t folder;
6935             const U8 *fold_array;
6936             UV utf8_fold_flags;
6937
6938             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6939             folder = foldEQ_locale;
6940             fold_array = PL_fold_locale;
6941             type = REFFL;
6942             utf8_fold_flags = FOLDEQ_LOCALE;
6943             goto do_nref;
6944
6945         case NREFFA:  /*  /\g{name}/iaa  */
6946             folder = foldEQ_latin1;
6947             fold_array = PL_fold_latin1;
6948             type = REFFA;
6949             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6950             goto do_nref;
6951
6952         case NREFFU:  /*  /\g{name}/iu  */
6953             folder = foldEQ_latin1;
6954             fold_array = PL_fold_latin1;
6955             type = REFFU;
6956             utf8_fold_flags = 0;
6957             goto do_nref;
6958
6959         case NREFF:  /*  /\g{name}/i  */
6960             folder = foldEQ;
6961             fold_array = PL_fold;
6962             type = REFF;
6963             utf8_fold_flags = 0;
6964             goto do_nref;
6965
6966         case NREF:  /*  /\g{name}/   */
6967             type = REF;
6968             folder = NULL;
6969             fold_array = NULL;
6970             utf8_fold_flags = 0;
6971           do_nref:
6972
6973             /* For the named back references, find the corresponding buffer
6974              * number */
6975             n = reg_check_named_buff_matched(rex,scan);
6976
6977             if ( ! n ) {
6978                 sayNO;
6979             }
6980             goto do_nref_ref_common;
6981
6982         case REFFL:  /*  /\1/il  */
6983             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6984             folder = foldEQ_locale;
6985             fold_array = PL_fold_locale;
6986             utf8_fold_flags = FOLDEQ_LOCALE;
6987             goto do_ref;
6988
6989         case REFFA:  /*  /\1/iaa  */
6990             folder = foldEQ_latin1;
6991             fold_array = PL_fold_latin1;
6992             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6993             goto do_ref;
6994
6995         case REFFU:  /*  /\1/iu  */
6996             folder = foldEQ_latin1;
6997             fold_array = PL_fold_latin1;
6998             utf8_fold_flags = 0;
6999             goto do_ref;
7000
7001         case REFF:  /*  /\1/i  */
7002             folder = foldEQ;
7003             fold_array = PL_fold;
7004             utf8_fold_flags = 0;
7005             goto do_ref;
7006
7007         case REF:  /*  /\1/    */
7008             folder = NULL;
7009             fold_array = NULL;
7010             utf8_fold_flags = 0;
7011
7012           do_ref:
7013             type = OP(scan);
7014             n = ARG(scan);  /* which paren pair */
7015
7016           do_nref_ref_common:
7017             ln = rex->offs[n].start;
7018             endref = rex->offs[n].end;
7019             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7020             if (rex->lastparen < n || ln == -1 || endref == -1)
7021                 sayNO;                  /* Do not match unless seen CLOSEn. */
7022             if (ln == endref)
7023                 break;
7024
7025             s = reginfo->strbeg + ln;
7026             if (type != REF     /* REF can do byte comparison */
7027                 && (utf8_target || type == REFFU || type == REFFL))
7028             {
7029                 char * limit = reginfo->strend;
7030
7031                 /* This call case insensitively compares the entire buffer
7032                     * at s, with the current input starting at locinput, but
7033                     * not going off the end given by reginfo->strend, and
7034                     * returns in <limit> upon success, how much of the
7035                     * current input was matched */
7036                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7037                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7038                 {
7039                     sayNO;
7040                 }
7041                 locinput = limit;
7042                 break;
7043             }
7044
7045             /* Not utf8:  Inline the first character, for speed. */
7046             if (!NEXTCHR_IS_EOS &&
7047                 UCHARAT(s) != nextchr &&
7048                 (type == REF ||
7049                  UCHARAT(s) != fold_array[nextchr]))
7050                 sayNO;
7051             ln = endref - ln;
7052             if (locinput + ln > reginfo->strend)
7053                 sayNO;
7054             if (ln > 1 && (type == REF
7055                            ? memNE(s, locinput, ln)
7056                            : ! folder(s, locinput, ln)))
7057                 sayNO;
7058             locinput += ln;
7059             break;
7060         }
7061
7062         case NOTHING: /* null op; e.g. the 'nothing' following
7063                        * the '*' in m{(a+|b)*}' */
7064             break;
7065         case TAIL: /* placeholder while compiling (A|B|C) */
7066             break;
7067
7068 #undef  ST
7069 #define ST st->u.eval
7070 #define CUR_EVAL cur_eval->u.eval
7071
7072         {
7073             SV *ret;
7074             REGEXP *re_sv;
7075             regexp *re;
7076             regexp_internal *rei;
7077             regnode *startpoint;
7078             U32 arg;
7079
7080         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7081             arg= (U32)ARG(scan);
7082             if (cur_eval && cur_eval->locinput == locinput) {
7083                 if ( ++nochange_depth > max_nochange_depth )
7084                     Perl_croak(aTHX_
7085                         "Pattern subroutine nesting without pos change"
7086                         " exceeded limit in regex");
7087             } else {
7088                 nochange_depth = 0;
7089             }
7090             re_sv = rex_sv;
7091             re = rex;
7092             rei = rexi;
7093             startpoint = scan + ARG2L(scan);
7094             EVAL_CLOSE_PAREN_SET( st, arg );
7095             /* Detect infinite recursion
7096              *
7097              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7098              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7099              * So we track the position in the string we are at each time
7100              * we recurse and if we try to enter the same routine twice from
7101              * the same position we throw an error.
7102              */
7103             if ( rex->recurse_locinput[arg] == locinput ) {
7104                 /* FIXME: we should show the regop that is failing as part
7105                  * of the error message. */
7106                 Perl_croak(aTHX_ "Infinite recursion in regex");
7107             } else {
7108                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7109                 rex->recurse_locinput[arg]= locinput;
7110
7111                 DEBUG_r({
7112                     GET_RE_DEBUG_FLAGS_DECL;
7113                     DEBUG_STACK_r({
7114                         Perl_re_exec_indentf( aTHX_
7115                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7116                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7117                         );
7118                     });
7119                 });
7120             }
7121
7122             /* Save all the positions seen so far. */
7123             ST.cp = regcppush(rex, 0, maxopenparen);
7124             REGCP_SET(ST.lastcp);
7125
7126             /* and then jump to the code we share with EVAL */
7127             goto eval_recurse_doit;
7128             /* NOTREACHED */
7129
7130         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7131             if (cur_eval && cur_eval->locinput==locinput) {
7132                 if ( ++nochange_depth > max_nochange_depth )
7133                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7134             } else {
7135                 nochange_depth = 0;
7136             }
7137             {
7138                 /* execute the code in the {...} */
7139
7140                 dSP;
7141                 IV before;
7142                 OP * const oop = PL_op;
7143                 COP * const ocurcop = PL_curcop;
7144                 OP *nop;
7145                 CV *newcv;
7146
7147                 /* save *all* paren positions */
7148                 regcppush(rex, 0, maxopenparen);
7149                 REGCP_SET(ST.lastcp);
7150
7151                 if (!caller_cv)
7152                     caller_cv = find_runcv(NULL);
7153
7154                 n = ARG(scan);
7155
7156                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7157                     newcv = (ReANY(
7158                                     (REGEXP*)(rexi->data->data[n])
7159                             ))->qr_anoncv;
7160                     nop = (OP*)rexi->data->data[n+1];
7161                 }
7162                 else if (rexi->data->what[n] == 'l') { /* literal code */
7163                     newcv = caller_cv;
7164                     nop = (OP*)rexi->data->data[n];
7165                     assert(CvDEPTH(newcv));
7166                 }
7167                 else {
7168                     /* literal with own CV */
7169                     assert(rexi->data->what[n] == 'L');
7170                     newcv = rex->qr_anoncv;
7171                     nop = (OP*)rexi->data->data[n];
7172                 }
7173
7174                 /* Some notes about MULTICALL and the context and save stacks.
7175                  *
7176                  * In something like
7177                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7178                  * since codeblocks don't introduce a new scope (so that
7179                  * local() etc accumulate), at the end of a successful
7180                  * match there will be a SAVEt_CLEARSV on the savestack
7181                  * for each of $x, $y, $z. If the three code blocks above
7182                  * happen to have come from different CVs (e.g. via
7183                  * embedded qr//s), then we must ensure that during any
7184                  * savestack unwinding, PL_comppad always points to the
7185                  * right pad at each moment. We achieve this by
7186                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7187                  * there is a change of pad.
7188                  * In theory whenever we call a code block, we should
7189                  * push a CXt_SUB context, then pop it on return from
7190                  * that code block. This causes a bit of an issue in that
7191                  * normally popping a context also clears the savestack
7192                  * back to cx->blk_oldsaveix, but here we specifically
7193                  * don't want to clear the save stack on exit from the
7194                  * code block.
7195                  * Also for efficiency we don't want to keep pushing and
7196                  * popping the single SUB context as we backtrack etc.
7197                  * So instead, we push a single context the first time
7198                  * we need, it, then hang onto it until the end of this
7199                  * function. Whenever we encounter a new code block, we
7200                  * update the CV etc if that's changed. During the times
7201                  * in this function where we're not executing a code
7202                  * block, having the SUB context still there is a bit
7203                  * naughty - but we hope that no-one notices.
7204                  * When the SUB context is initially pushed, we fake up
7205                  * cx->blk_oldsaveix to be as if we'd pushed this context
7206                  * on first entry to S_regmatch rather than at some random
7207                  * point during the regexe execution. That way if we
7208                  * croak, popping the context stack will ensure that
7209                  * *everything* SAVEd by this function is undone and then
7210                  * the context popped, rather than e.g., popping the
7211                  * context (and restoring the original PL_comppad) then
7212                  * popping more of the savestack and restoring a bad
7213                  * PL_comppad.
7214                  */
7215
7216                 /* If this is the first EVAL, push a MULTICALL. On
7217                  * subsequent calls, if we're executing a different CV, or
7218                  * if PL_comppad has got messed up from backtracking
7219                  * through SAVECOMPPADs, then refresh the context.
7220                  */
7221                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7222                 {
7223                     U8 flags = (CXp_SUB_RE |
7224                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7225                     SAVECOMPPAD();
7226                     if (last_pushed_cv) {
7227                         CHANGE_MULTICALL_FLAGS(newcv, flags);
7228                     }
7229                     else {
7230                         PUSH_MULTICALL_FLAGS(newcv, flags);
7231                     }
7232                     /* see notes above */
7233                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7234
7235                     last_pushed_cv = newcv;
7236                 }
7237                 else {
7238                     /* these assignments are just to silence compiler
7239                      * warnings */
7240                     multicall_cop = NULL;
7241                 }
7242                 last_pad = PL_comppad;
7243
7244                 /* the initial nextstate you would normally execute
7245                  * at the start of an eval (which would cause error
7246                  * messages to come from the eval), may be optimised
7247                  * away from the execution path in the regex code blocks;
7248                  * so manually set PL_curcop to it initially */
7249                 {
7250                     OP *o = cUNOPx(nop)->op_first;
7251                     assert(o->op_type == OP_NULL);
7252                     if (o->op_targ == OP_SCOPE) {
7253                         o = cUNOPo->op_first;
7254                     }
7255                     else {
7256                         assert(o->op_targ == OP_LEAVE);
7257                         o = cUNOPo->op_first;
7258                         assert(o->op_type == OP_ENTER);
7259                         o = OpSIBLING(o);
7260                     }
7261
7262                     if (o->op_type != OP_STUB) {
7263                         assert(    o->op_type == OP_NEXTSTATE
7264                                 || o->op_type == OP_DBSTATE
7265                                 || (o->op_type == OP_NULL
7266                                     &&  (  o->op_targ == OP_NEXTSTATE
7267                                         || o->op_targ == OP_DBSTATE
7268                                         )
7269                                     )
7270                         );
7271                         PL_curcop = (COP*)o;
7272                     }
7273                 }
7274                 nop = nop->op_next;
7275
7276                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7277                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7278
7279                 rex->offs[0].end = locinput - reginfo->strbeg;
7280                 if (reginfo->info_aux_eval->pos_magic)
7281                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7282                                   reginfo->sv, reginfo->strbeg,
7283                                   locinput - reginfo->strbeg);
7284
7285                 if (sv_yes_mark) {
7286                     SV *sv_mrk = get_sv("REGMARK", 1);
7287                     sv_setsv(sv_mrk, sv_yes_mark);
7288                 }
7289
7290                 /* we don't use MULTICALL here as we want to call the
7291                  * first op of the block of interest, rather than the
7292                  * first op of the sub. Also, we don't want to free
7293                  * the savestack frame */
7294                 before = (IV)(SP-PL_stack_base);
7295                 PL_op = nop;
7296                 CALLRUNOPS(aTHX);                       /* Scalar context. */
7297                 SPAGAIN;
7298                 if ((IV)(SP-PL_stack_base) == before)
7299                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7300                 else {
7301                     ret = POPs;
7302                     PUTBACK;
7303                 }
7304
7305                 /* before restoring everything, evaluate the returned
7306                  * value, so that 'uninit' warnings don't use the wrong
7307                  * PL_op or pad. Also need to process any magic vars
7308                  * (e.g. $1) *before* parentheses are restored */
7309
7310                 PL_op = NULL;
7311
7312                 re_sv = NULL;
7313                 if (logical == 0)        /*   (?{})/   */
7314                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7315                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7316                     sw = cBOOL(SvTRUE_NN(ret));
7317                     logical = 0;
7318                 }
7319                 else {                   /*  /(??{})  */
7320                     /*  if its overloaded, let the regex compiler handle
7321                      *  it; otherwise extract regex, or stringify  */
7322                     if (SvGMAGICAL(ret))
7323                         ret = sv_mortalcopy(ret);
7324                     if (!SvAMAGIC(ret)) {
7325                         SV *sv = ret;
7326                         if (SvROK(sv))
7327                             sv = SvRV(sv);
7328                         if (SvTYPE(sv) == SVt_REGEXP)
7329                             re_sv = (REGEXP*) sv;
7330                         else if (SvSMAGICAL(ret)) {
7331                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7332                             if (mg)
7333                                 re_sv = (REGEXP *) mg->mg_obj;
7334                         }
7335
7336                         /* force any undef warnings here */
7337                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7338                             ret = sv_mortalcopy(ret);
7339                             (void) SvPV_force_nolen(ret);
7340                         }
7341                     }
7342
7343                 }
7344
7345                 /* *** Note that at this point we don't restore
7346                  * PL_comppad, (or pop the CxSUB) on the assumption it may
7347                  * be used again soon. This is safe as long as nothing
7348                  * in the regexp code uses the pad ! */
7349                 PL_op = oop;
7350                 PL_curcop = ocurcop;
7351                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7352                 PL_curpm_under = PL_curpm;
7353                 PL_curpm = PL_reg_curpm;
7354
7355                 if (logical != 2) {
7356                     PUSH_STATE_GOTO(EVAL_B, next, locinput);
7357                     /* NOTREACHED */
7358                 }
7359             }
7360
7361                 /* only /(??{})/  from now on */
7362                 logical = 0;
7363                 {
7364                     /* extract RE object from returned value; compiling if
7365                      * necessary */
7366
7367                     if (re_sv) {
7368                         re_sv = reg_temp_copy(NULL, re_sv);
7369                     }
7370                     else {
7371                         U32 pm_flags = 0;
7372
7373                         if (SvUTF8(ret) && IN_BYTES) {
7374                             /* In use 'bytes': make a copy of the octet
7375                              * sequence, but without the flag on */
7376                             STRLEN len;
7377                             const char *const p = SvPV(ret, len);
7378                             ret = newSVpvn_flags(p, len, SVs_TEMP);
7379                         }
7380                         if (rex->intflags & PREGf_USE_RE_EVAL)
7381                             pm_flags |= PMf_USE_RE_EVAL;
7382
7383                         /* if we got here, it should be an engine which
7384                          * supports compiling code blocks and stuff */
7385                         assert(rex->engine && rex->engine->op_comp);
7386                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7387                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7388                                     rex->engine, NULL, NULL,
7389                                     /* copy /msixn etc to inner pattern */
7390                                     ARG2L(scan),
7391                                     pm_flags);
7392
7393                         if (!(SvFLAGS(ret)
7394                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
7395                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7396                             /* This isn't a first class regexp. Instead, it's
7397                                caching a regexp onto an existing, Perl visible
7398                                scalar.  */
7399                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7400                         }
7401                     }
7402                     SAVEFREESV(re_sv);
7403                     re = ReANY(re_sv);
7404                 }
7405                 RXp_MATCH_COPIED_off(re);
7406                 re->subbeg = rex->subbeg;
7407                 re->sublen = rex->sublen;
7408                 re->suboffset = rex->suboffset;
7409                 re->subcoffset = rex->subcoffset;
7410                 re->lastparen = 0;
7411                 re->lastcloseparen = 0;
7412                 rei = RXi_GET(re);
7413                 DEBUG_EXECUTE_r(
7414                     debug_start_match(re_sv, utf8_target, locinput,
7415                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7416                 );
7417                 startpoint = rei->program + 1;
7418                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7419                                              * close_paren only for GOSUB */
7420                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7421                 /* Save all the seen positions so far. */
7422                 ST.cp = regcppush(rex, 0, maxopenparen);
7423                 REGCP_SET(ST.lastcp);
7424                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7425                 maxopenparen = 0;
7426                 /* run the pattern returned from (??{...}) */
7427
7428               eval_recurse_doit: /* Share code with GOSUB below this line
7429                             * At this point we expect the stack context to be
7430                             * set up correctly */
7431
7432                 /* invalidate the S-L poscache. We're now executing a
7433                  * different set of WHILEM ops (and their associated
7434                  * indexes) against the same string, so the bits in the
7435                  * cache are meaningless. Setting maxiter to zero forces
7436                  * the cache to be invalidated and zeroed before reuse.
7437                  * XXX This is too dramatic a measure. Ideally we should
7438                  * save the old cache and restore when running the outer
7439                  * pattern again */
7440                 reginfo->poscache_maxiter = 0;
7441
7442                 /* the new regexp might have a different is_utf8_pat than we do */
7443                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7444
7445                 ST.prev_rex = rex_sv;
7446                 ST.prev_curlyx = cur_curlyx;
7447                 rex_sv = re_sv;
7448                 SET_reg_curpm(rex_sv);
7449                 rex = re;
7450                 rexi = rei;
7451                 cur_curlyx = NULL;
7452                 ST.B = next;
7453                 ST.prev_eval = cur_eval;
7454                 cur_eval = st;
7455                 /* now continue from first node in postoned RE */
7456                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7457                 NOT_REACHED; /* NOTREACHED */
7458         }
7459
7460         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7461             /* note: this is called twice; first after popping B, then A */
7462             DEBUG_STACK_r({
7463                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7464                     depth, cur_eval, ST.prev_eval);
7465             });
7466
7467 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7468             if ( cur_eval && CUR_EVAL.close_paren ) {\
7469                 DEBUG_STACK_r({ \
7470                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7471                         depth,    \
7472                         CUR_EVAL.close_paren - 1,\
7473                         cur_eval, \
7474                         VAL);     \
7475                 });               \
7476                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7477             }
7478
7479             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7480
7481             rex_sv = ST.prev_rex;
7482             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7483             SET_reg_curpm(rex_sv);
7484             rex = ReANY(rex_sv);
7485             rexi = RXi_GET(rex);
7486             {
7487                 /* preserve $^R across LEAVE's. See Bug 121070. */
7488                 SV *save_sv= GvSV(PL_replgv);
7489                 SvREFCNT_inc(save_sv);
7490                 regcpblow(ST.cp); /* LEAVE in disguise */
7491                 sv_setsv(GvSV(PL_replgv), save_sv);
7492                 SvREFCNT_dec(save_sv);
7493             }
7494             cur_eval = ST.prev_eval;
7495             cur_curlyx = ST.prev_curlyx;
7496
7497             /* Invalidate cache. See "invalidate" comment above. */
7498             reginfo->poscache_maxiter = 0;
7499             if ( nochange_depth )
7500                 nochange_depth--;
7501
7502             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7503             sayYES;
7504
7505
7506         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7507             REGCP_UNWIND(ST.lastcp);
7508             sayNO;
7509
7510         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7511             /* note: this is called twice; first after popping B, then A */
7512             DEBUG_STACK_r({
7513                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7514                     depth, cur_eval, ST.prev_eval);
7515             });
7516
7517             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7518
7519             rex_sv = ST.prev_rex;
7520             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7521             SET_reg_curpm(rex_sv);
7522             rex = ReANY(rex_sv);
7523             rexi = RXi_GET(rex);
7524
7525             REGCP_UNWIND(ST.lastcp);
7526             regcppop(rex, &maxopenparen);
7527             cur_eval = ST.prev_eval;
7528             cur_curlyx = ST.prev_curlyx;
7529
7530             /* Invalidate cache. See "invalidate" comment above. */
7531             reginfo->poscache_maxiter = 0;
7532             if ( nochange_depth )
7533                 nochange_depth--;
7534
7535             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7536             sayNO_SILENT;
7537 #undef ST
7538
7539         case OPEN: /*  (  */
7540             n = ARG(scan);  /* which paren pair */
7541             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7542             if (n > maxopenparen)
7543                 maxopenparen = n;
7544             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7545                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7546                 depth,
7547                 PTR2UV(rex),
7548                 PTR2UV(rex->offs),
7549                 (UV)n,
7550                 (IV)rex->offs[n].start_tmp,
7551                 (UV)maxopenparen
7552             ));
7553             lastopen = n;
7554             break;
7555
7556         case SROPEN: /*  (*SCRIPT_RUN:  */
7557             script_run_begin = (U8 *) locinput;
7558             break;
7559
7560 /* XXX really need to log other places start/end are set too */
7561 #define CLOSE_CAPTURE                                                      \
7562     rex->offs[n].start = rex->offs[n].start_tmp;                           \
7563     rex->offs[n].end = locinput - reginfo->strbeg;                         \
7564     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
7565         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
7566         depth,                                                             \
7567         PTR2UV(rex),                                                       \
7568         PTR2UV(rex->offs),                                                 \
7569         (UV)n,                                                             \
7570         (IV)rex->offs[n].start,                                            \
7571         (IV)rex->offs[n].end                                               \
7572     ))
7573
7574         case CLOSE:  /*  )  */
7575             n = ARG(scan);  /* which paren pair */
7576             CLOSE_CAPTURE;
7577             if (n > rex->lastparen)
7578                 rex->lastparen = n;
7579             rex->lastcloseparen = n;
7580             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7581                 goto fake_end;
7582
7583             break;
7584
7585         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7586
7587             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7588             {
7589                 sayNO;
7590             }
7591
7592             break;
7593
7594
7595         case ACCEPT:  /*  (*ACCEPT)  */
7596             if (scan->flags)
7597                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7598             if (ARG2L(scan)){
7599                 regnode *cursor;
7600                 for (cursor=scan;
7601                      cursor && OP(cursor)!=END;
7602                      cursor=regnext(cursor))
7603                 {
7604                     if ( OP(cursor)==CLOSE ){
7605                         n = ARG(cursor);
7606                         if ( n <= lastopen ) {
7607                             CLOSE_CAPTURE;
7608                             if (n > rex->lastparen)
7609                                 rex->lastparen = n;
7610                             rex->lastcloseparen = n;
7611                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7612                                 break;
7613                         }
7614                     }
7615                 }
7616             }
7617             goto fake_end;
7618             /* NOTREACHED */
7619
7620         case GROUPP:  /*  (?(1))  */
7621             n = ARG(scan);  /* which paren pair */
7622             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7623             break;
7624
7625         case NGROUPP:  /*  (?(<name>))  */
7626             /* reg_check_named_buff_matched returns 0 for no match */
7627             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7628             break;
7629
7630         case INSUBP:   /*  (?(R))  */
7631             n = ARG(scan);
7632             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7633              * of SCAN is already set up as matches a eval.close_paren */
7634             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7635             break;
7636
7637         case DEFINEP:  /*  (?(DEFINE))  */
7638             sw = 0;
7639             break;
7640
7641         case IFTHEN:   /*  (?(cond)A|B)  */
7642             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7643             if (sw)
7644                 next = NEXTOPER(NEXTOPER(scan));
7645             else {
7646                 next = scan + ARG(scan);
7647                 if (OP(next) == IFTHEN) /* Fake one. */
7648                     next = NEXTOPER(NEXTOPER(next));
7649             }
7650             break;
7651
7652         case LOGICAL:  /* modifier for EVAL and IFMATCH */
7653             logical = scan->flags;
7654             break;
7655
7656 /*******************************************************************
7657
7658 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7659 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7660 STAR/PLUS/CURLY/CURLYN are used instead.)
7661
7662 A*B is compiled as <CURLYX><A><WHILEM><B>
7663
7664 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7665 state, which contains the current count, initialised to -1. It also sets
7666 cur_curlyx to point to this state, with any previous value saved in the
7667 state block.
7668
7669 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7670 since the pattern may possibly match zero times (i.e. it's a while {} loop
7671 rather than a do {} while loop).
7672
7673 Each entry to WHILEM represents a successful match of A. The count in the
7674 CURLYX block is incremented, another WHILEM state is pushed, and execution
7675 passes to A or B depending on greediness and the current count.
7676
7677 For example, if matching against the string a1a2a3b (where the aN are
7678 substrings that match /A/), then the match progresses as follows: (the
7679 pushed states are interspersed with the bits of strings matched so far):
7680
7681     <CURLYX cnt=-1>
7682     <CURLYX cnt=0><WHILEM>
7683     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7684     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7685     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7686     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7687
7688 (Contrast this with something like CURLYM, which maintains only a single
7689 backtrack state:
7690
7691     <CURLYM cnt=0> a1
7692     a1 <CURLYM cnt=1> a2
7693     a1 a2 <CURLYM cnt=2> a3
7694     a1 a2 a3 <CURLYM cnt=3> b
7695 )
7696
7697 Each WHILEM state block marks a point to backtrack to upon partial failure
7698 of A or B, and also contains some minor state data related to that
7699 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7700 overall state, such as the count, and pointers to the A and B ops.
7701
7702 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7703 must always point to the *current* CURLYX block, the rules are:
7704
7705 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7706 and set cur_curlyx to point the new block.
7707
7708 When popping the CURLYX block after a successful or unsuccessful match,
7709 restore the previous cur_curlyx.
7710
7711 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7712 to the outer one saved in the CURLYX block.
7713
7714 When popping the WHILEM block after a successful or unsuccessful B match,
7715 restore the previous cur_curlyx.
7716
7717 Here's an example for the pattern (AI* BI)*BO
7718 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7719
7720 cur_
7721 curlyx backtrack stack
7722 ------ ---------------
7723 NULL
7724 CO     <CO prev=NULL> <WO>
7725 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7726 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7727 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7728
7729 At this point the pattern succeeds, and we work back down the stack to
7730 clean up, restoring as we go:
7731
7732 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7733 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7734 CO     <CO prev=NULL> <WO>
7735 NULL
7736
7737 *******************************************************************/
7738
7739 #define ST st->u.curlyx
7740
7741         case CURLYX:    /* start of /A*B/  (for complex A) */
7742         {
7743             /* No need to save/restore up to this paren */
7744             I32 parenfloor = scan->flags;
7745
7746             assert(next); /* keep Coverity happy */
7747             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7748                 next += ARG(next);
7749
7750             /* XXXX Probably it is better to teach regpush to support
7751                parenfloor > maxopenparen ... */
7752             if (parenfloor > (I32)rex->lastparen)
7753                 parenfloor = rex->lastparen; /* Pessimization... */
7754
7755             ST.prev_curlyx= cur_curlyx;
7756             cur_curlyx = st;
7757             ST.cp = PL_savestack_ix;
7758
7759             /* these fields contain the state of the current curly.
7760              * they are accessed by subsequent WHILEMs */
7761             ST.parenfloor = parenfloor;
7762             ST.me = scan;
7763             ST.B = next;
7764             ST.minmod = minmod;
7765             minmod = 0;
7766             ST.count = -1;      /* this will be updated by WHILEM */
7767             ST.lastloc = NULL;  /* this will be updated by WHILEM */
7768
7769             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7770             NOT_REACHED; /* NOTREACHED */
7771         }
7772
7773         case CURLYX_end: /* just finished matching all of A*B */
7774             cur_curlyx = ST.prev_curlyx;
7775             sayYES;
7776             NOT_REACHED; /* NOTREACHED */
7777
7778         case CURLYX_end_fail: /* just failed to match all of A*B */
7779             regcpblow(ST.cp);
7780             cur_curlyx = ST.prev_curlyx;
7781             sayNO;
7782             NOT_REACHED; /* NOTREACHED */
7783
7784
7785 #undef ST
7786 #define ST st->u.whilem
7787
7788         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
7789         {
7790             /* see the discussion above about CURLYX/WHILEM */
7791             I32 n;
7792             int min, max;
7793             regnode *A;
7794
7795             assert(cur_curlyx); /* keep Coverity happy */
7796
7797             min = ARG1(cur_curlyx->u.curlyx.me);
7798             max = ARG2(cur_curlyx->u.curlyx.me);
7799             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7800             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7801             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7802             ST.cache_offset = 0;
7803             ST.cache_mask = 0;
7804
7805
7806             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
7807                   depth, (long)n, min, max)
7808             );
7809
7810             /* First just match a string of min A's. */
7811
7812             if (n < min) {
7813                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7814                 cur_curlyx->u.curlyx.lastloc = locinput;
7815                 REGCP_SET(ST.lastcp);
7816
7817                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7818                 NOT_REACHED; /* NOTREACHED */
7819             }
7820
7821             /* If degenerate A matches "", assume A done. */
7822
7823             if (locinput == cur_curlyx->u.curlyx.lastloc) {
7824                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
7825                    depth)
7826                 );
7827                 goto do_whilem_B_max;
7828             }
7829
7830             /* super-linear cache processing.
7831              *
7832              * The idea here is that for certain types of CURLYX/WHILEM -
7833              * principally those whose upper bound is infinity (and
7834              * excluding regexes that have things like \1 and other very
7835              * non-regular expresssiony things), then if a pattern like
7836              * /....A*.../ fails and we backtrack to the WHILEM, then we
7837              * make a note that this particular WHILEM op was at string
7838              * position 47 (say) when the rest of pattern failed. Then, if
7839              * we ever find ourselves back at that WHILEM, and at string
7840              * position 47 again, we can just fail immediately rather than
7841              * running the rest of the pattern again.
7842              *
7843              * This is very handy when patterns start to go
7844              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7845              * with a combinatorial explosion of backtracking.
7846              *
7847              * The cache is implemented as a bit array, with one bit per
7848              * string byte position per WHILEM op (up to 16) - so its
7849              * between 0.25 and 2x the string size.
7850              *
7851              * To avoid allocating a poscache buffer every time, we do an
7852              * initially countdown; only after we have  executed a WHILEM
7853              * op (string-length x #WHILEMs) times do we allocate the
7854              * cache.
7855              *
7856              * The top 4 bits of scan->flags byte say how many different
7857              * relevant CURLLYX/WHILEM op pairs there are, while the
7858              * bottom 4-bits is the identifying index number of this
7859              * WHILEM.
7860              */
7861
7862             if (scan->flags) {
7863
7864                 if (!reginfo->poscache_maxiter) {
7865                     /* start the countdown: Postpone detection until we
7866                      * know the match is not *that* much linear. */
7867                     reginfo->poscache_maxiter
7868                         =    (reginfo->strend - reginfo->strbeg + 1)
7869                            * (scan->flags>>4);
7870                     /* possible overflow for long strings and many CURLYX's */
7871                     if (reginfo->poscache_maxiter < 0)
7872                         reginfo->poscache_maxiter = I32_MAX;
7873                     reginfo->poscache_iter = reginfo->poscache_maxiter;
7874                 }
7875
7876                 if (reginfo->poscache_iter-- == 0) {
7877                     /* initialise cache */
7878                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7879                     regmatch_info_aux *const aux = reginfo->info_aux;
7880                     if (aux->poscache) {
7881                         if ((SSize_t)reginfo->poscache_size < size) {
7882                             Renew(aux->poscache, size, char);
7883                             reginfo->poscache_size = size;
7884                         }
7885                         Zero(aux->poscache, size, char);
7886                     }
7887                     else {
7888                         reginfo->poscache_size = size;
7889                         Newxz(aux->poscache, size, char);
7890                     }
7891                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7892       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7893                               PL_colors[4], PL_colors[5])
7894                     );
7895                 }
7896
7897                 if (reginfo->poscache_iter < 0) {
7898                     /* have we already failed at this position? */
7899                     SSize_t offset, mask;
7900
7901                     reginfo->poscache_iter = -1; /* stop eventual underflow */
7902                     offset  = (scan->flags & 0xf) - 1
7903                                 +   (locinput - reginfo->strbeg)
7904                                   * (scan->flags>>4);
7905                     mask    = 1 << (offset % 8);
7906                     offset /= 8;
7907                     if (reginfo->info_aux->poscache[offset] & mask) {
7908                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
7909                             depth)
7910                         );
7911                         cur_curlyx->u.curlyx.count--;
7912                         sayNO; /* cache records failure */
7913                     }
7914                     ST.cache_offset = offset;
7915                     ST.cache_mask   = mask;
7916                 }
7917             }
7918
7919             /* Prefer B over A for minimal matching. */
7920
7921             if (cur_curlyx->u.curlyx.minmod) {
7922                 ST.save_curlyx = cur_curlyx;
7923                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7924                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7925                                     locinput);
7926                 NOT_REACHED; /* NOTREACHED */
7927             }
7928
7929             /* Prefer A over B for maximal matching. */
7930
7931             if (n < max) { /* More greed allowed? */
7932                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7933                             maxopenparen);
7934                 cur_curlyx->u.curlyx.lastloc = locinput;
7935                 REGCP_SET(ST.lastcp);
7936                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7937                 NOT_REACHED; /* NOTREACHED */
7938             }
7939             goto do_whilem_B_max;
7940         }
7941         NOT_REACHED; /* NOTREACHED */
7942
7943         case WHILEM_B_min: /* just matched B in a minimal match */
7944         case WHILEM_B_max: /* just matched B in a maximal match */
7945             cur_curlyx = ST.save_curlyx;
7946             sayYES;
7947             NOT_REACHED; /* NOTREACHED */
7948
7949         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7950             cur_curlyx = ST.save_curlyx;
7951             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7952             cur_curlyx->u.curlyx.count--;
7953             CACHEsayNO;
7954             NOT_REACHED; /* NOTREACHED */
7955
7956         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7957             REGCP_UNWIND(ST.lastcp);
7958             regcppop(rex, &maxopenparen);
7959             /* FALLTHROUGH */
7960         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7961             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7962             cur_curlyx->u.curlyx.count--;
7963             CACHEsayNO;
7964             NOT_REACHED; /* NOTREACHED */
7965
7966         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7967             REGCP_UNWIND(ST.lastcp);
7968             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7969             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
7970                 depth)
7971             );
7972           do_whilem_B_max:
7973             if (cur_curlyx->u.curlyx.count >= REG_INFTY
7974                 && ckWARN(WARN_REGEXP)
7975                 && !reginfo->warned)
7976             {
7977                 reginfo->warned = TRUE;
7978                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7979                      "Complex regular subexpression recursion limit (%d) "
7980                      "exceeded",
7981                      REG_INFTY - 1);
7982             }
7983
7984             /* now try B */
7985             ST.save_curlyx = cur_curlyx;
7986             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7987             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7988                                 locinput);
7989             NOT_REACHED; /* NOTREACHED */
7990
7991         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7992             cur_curlyx = ST.save_curlyx;
7993
7994             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7995                 /* Maximum greed exceeded */
7996                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7997                     && ckWARN(WARN_REGEXP)
7998                     && !reginfo->warned)
7999                 {
8000                     reginfo->warned     = TRUE;
8001                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8002                         "Complex regular subexpression recursion "
8003                         "limit (%d) exceeded",
8004                         REG_INFTY - 1);
8005                 }
8006                 cur_curlyx->u.curlyx.count--;
8007                 CACHEsayNO;
8008             }
8009
8010             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8011             );
8012             /* Try grabbing another A and see if it helps. */
8013             cur_curlyx->u.curlyx.lastloc = locinput;
8014             PUSH_STATE_GOTO(WHILEM_A_min,
8015                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8016                 locinput);
8017             NOT_REACHED; /* NOTREACHED */
8018
8019 #undef  ST
8020 #define ST st->u.branch
8021
8022         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
8023             next = scan + ARG(scan);
8024             if (next == scan)
8025                 next = NULL;
8026             scan = NEXTOPER(scan);
8027             /* FALLTHROUGH */
8028
8029         case BRANCH:        /*  /(...|A|...)/ */
8030             scan = NEXTOPER(scan); /* scan now points to inner node */
8031             ST.lastparen = rex->lastparen;
8032             ST.lastcloseparen = rex->lastcloseparen;
8033             ST.next_branch = next;
8034             REGCP_SET(ST.cp);
8035
8036             /* Now go into the branch */
8037             if (has_cutgroup) {
8038                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8039             } else {
8040                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8041             }
8042             NOT_REACHED; /* NOTREACHED */
8043
8044         case CUTGROUP:  /*  /(*THEN)/  */
8045             sv_yes_mark = st->u.mark.mark_name = scan->flags
8046                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8047                 : NULL;
8048             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8049             NOT_REACHED; /* NOTREACHED */
8050
8051         case CUTGROUP_next_fail:
8052             do_cutgroup = 1;
8053             no_final = 1;
8054             if (st->u.mark.mark_name)
8055                 sv_commit = st->u.mark.mark_name;
8056             sayNO;
8057             NOT_REACHED; /* NOTREACHED */
8058
8059         case BRANCH_next:
8060             sayYES;
8061             NOT_REACHED; /* NOTREACHED */
8062
8063         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8064             if (do_cutgroup) {
8065                 do_cutgroup = 0;
8066                 no_final = 0;
8067             }
8068             REGCP_UNWIND(ST.cp);
8069             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8070             scan = ST.next_branch;
8071             /* no more branches? */
8072             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8073                 DEBUG_EXECUTE_r({
8074                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8075                         depth,
8076                         PL_colors[4],
8077                         PL_colors[5] );
8078                 });
8079                 sayNO_SILENT;
8080             }
8081             continue; /* execute next BRANCH[J] op */
8082             /* NOTREACHED */
8083
8084         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8085             minmod = 1;
8086             break;
8087
8088 #undef  ST
8089 #define ST st->u.curlym
8090
8091         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
8092
8093             /* This is an optimisation of CURLYX that enables us to push
8094              * only a single backtracking state, no matter how many matches
8095              * there are in {m,n}. It relies on the pattern being constant
8096              * length, with no parens to influence future backrefs
8097              */
8098
8099             ST.me = scan;
8100             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8101
8102             ST.lastparen      = rex->lastparen;
8103             ST.lastcloseparen = rex->lastcloseparen;
8104
8105             /* if paren positive, emulate an OPEN/CLOSE around A */
8106             if (ST.me->flags) {
8107                 U32 paren = ST.me->flags;
8108                 if (paren > maxopenparen)
8109                     maxopenparen = paren;
8110                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8111             }
8112             ST.A = scan;
8113             ST.B = next;
8114             ST.alen = 0;
8115             ST.count = 0;
8116             ST.minmod = minmod;
8117             minmod = 0;
8118             ST.c1 = CHRTEST_UNINIT;
8119             REGCP_SET(ST.cp);
8120
8121             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8122                 goto curlym_do_B;
8123
8124           curlym_do_A: /* execute the A in /A{m,n}B/  */
8125             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8126             NOT_REACHED; /* NOTREACHED */
8127
8128         case CURLYM_A: /* we've just matched an A */
8129             ST.count++;
8130             /* after first match, determine A's length: u.curlym.alen */
8131             if (ST.count == 1) {
8132                 if (reginfo->is_utf8_target) {
8133                     char *s = st->locinput;
8134                     while (s < locinput) {
8135                         ST.alen++;
8136                         s += UTF8SKIP(s);
8137                     }
8138                 }
8139                 else {
8140                     ST.alen = locinput - st->locinput;
8141                 }
8142                 if (ST.alen == 0)
8143                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8144             }
8145             DEBUG_EXECUTE_r(
8146                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8147                           depth, (IV) ST.count, (IV)ST.alen)
8148             );
8149
8150             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8151                 goto fake_end;
8152
8153             {
8154                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8155                 if ( max == REG_INFTY || ST.count < max )
8156                     goto curlym_do_A; /* try to match another A */
8157             }
8158             goto curlym_do_B; /* try to match B */
8159
8160         case CURLYM_A_fail: /* just failed to match an A */
8161             REGCP_UNWIND(ST.cp);
8162
8163
8164             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8165                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8166                 sayNO;
8167
8168           curlym_do_B: /* execute the B in /A{m,n}B/  */
8169             if (ST.c1 == CHRTEST_UNINIT) {
8170                 /* calculate c1 and c2 for possible match of 1st char
8171                  * following curly */
8172                 ST.c1 = ST.c2 = CHRTEST_VOID;
8173                 assert(ST.B);
8174                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8175                     regnode *text_node = ST.B;
8176                     if (! HAS_TEXT(text_node))
8177                         FIND_NEXT_IMPT(text_node);
8178                     /* this used to be
8179
8180                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8181
8182                         But the former is redundant in light of the latter.
8183
8184                         if this changes back then the macro for
8185                         IS_TEXT and friends need to change.
8186                      */
8187                     if (PL_regkind[OP(text_node)] == EXACT) {
8188                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8189                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8190                            reginfo))
8191                         {
8192                             sayNO;
8193                         }
8194                     }
8195                 }
8196             }
8197
8198             DEBUG_EXECUTE_r(
8199                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8200                     depth, (IV)ST.count)
8201                 );
8202             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8203                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8204                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8205                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8206                     {
8207                         /* simulate B failing */
8208                         DEBUG_OPTIMISE_r(
8209                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8210                                 depth,
8211                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8212                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8213                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8214                         );
8215                         state_num = CURLYM_B_fail;
8216                         goto reenter_switch;
8217                     }
8218                 }
8219                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8220                     /* simulate B failing */
8221                     DEBUG_OPTIMISE_r(
8222                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8223                             depth,
8224                             (int) nextchr, ST.c1, ST.c2)
8225                     );
8226                     state_num = CURLYM_B_fail;
8227                     goto reenter_switch;
8228                 }
8229             }
8230
8231             if (ST.me->flags) {
8232                 /* emulate CLOSE: mark current A as captured */
8233                 I32 paren = ST.me->flags;
8234                 if (ST.count) {
8235                     rex->offs[paren].start
8236                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
8237                     rex->offs[paren].end = locinput - reginfo->strbeg;
8238                     if ((U32)paren > rex->lastparen)
8239                         rex->lastparen = paren;
8240                     rex->lastcloseparen = paren;
8241                 }
8242                 else
8243                     rex->offs[paren].end = -1;
8244
8245                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8246                 {
8247                     if (ST.count)
8248                         goto fake_end;
8249                     else
8250                         sayNO;
8251                 }
8252             }
8253
8254             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8255             NOT_REACHED; /* NOTREACHED */
8256
8257         case CURLYM_B_fail: /* just failed to match a B */
8258             REGCP_UNWIND(ST.cp);
8259             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8260             if (ST.minmod) {
8261                 I32 max = ARG2(ST.me);
8262                 if (max != REG_INFTY && ST.count == max)
8263                     sayNO;
8264                 goto curlym_do_A; /* try to match a further A */
8265             }
8266             /* backtrack one A */
8267             if (ST.count == ARG1(ST.me) /* min */)
8268                 sayNO;
8269             ST.count--;
8270             SET_locinput(HOPc(locinput, -ST.alen));
8271             goto curlym_do_B; /* try to match B */
8272
8273 #undef ST
8274 #define ST st->u.curly
8275
8276 #define CURLY_SETPAREN(paren, success) \
8277     if (paren) { \
8278         if (success) { \
8279             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
8280             rex->offs[paren].end = locinput - reginfo->strbeg; \
8281             if (paren > rex->lastparen) \
8282                 rex->lastparen = paren; \
8283             rex->lastcloseparen = paren; \
8284         } \
8285         else { \
8286             rex->offs[paren].end = -1; \
8287             rex->lastparen      = ST.lastparen; \
8288             rex->lastcloseparen = ST.lastcloseparen; \
8289         } \
8290     }
8291
8292         case STAR:              /*  /A*B/ where A is width 1 char */
8293             ST.paren = 0;
8294             ST.min = 0;
8295             ST.max = REG_INFTY;
8296             scan = NEXTOPER(scan);
8297             goto repeat;
8298
8299         case PLUS:              /*  /A+B/ where A is width 1 char */
8300             ST.paren = 0;
8301             ST.min = 1;
8302             ST.max = REG_INFTY;
8303             scan = NEXTOPER(scan);
8304             goto repeat;
8305
8306         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
8307             ST.paren = scan->flags;     /* Which paren to set */
8308             ST.lastparen      = rex->lastparen;
8309             ST.lastcloseparen = rex->lastcloseparen;
8310             if (ST.paren > maxopenparen)
8311                 maxopenparen = ST.paren;
8312             ST.min = ARG1(scan);  /* min to match */
8313             ST.max = ARG2(scan);  /* max to match */
8314             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8315             {
8316                 ST.min=1;
8317                 ST.max=1;
8318             }
8319             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8320             goto repeat;
8321
8322         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
8323             ST.paren = 0;
8324             ST.min = ARG1(scan);  /* min to match */
8325             ST.max = ARG2(scan);  /* max to match */
8326             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8327           repeat:
8328             /*
8329             * Lookahead to avoid useless match attempts
8330             * when we know what character comes next.
8331             *
8332             * Used to only do .*x and .*?x, but now it allows
8333             * for )'s, ('s and (?{ ... })'s to be in the way
8334             * of the quantifier and the EXACT-like node.  -- japhy
8335             */
8336
8337             assert(ST.min <= ST.max);
8338             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8339                 ST.c1 = ST.c2 = CHRTEST_VOID;
8340             }
8341             else {
8342                 regnode *text_node = next;
8343
8344                 if (! HAS_TEXT(text_node))
8345                     FIND_NEXT_IMPT(text_node);
8346
8347                 if (! HAS_TEXT(text_node))
8348                     ST.c1 = ST.c2 = CHRTEST_VOID;
8349                 else {
8350                     if ( PL_regkind[OP(text_node)] != EXACT ) {
8351                         ST.c1 = ST.c2 = CHRTEST_VOID;
8352                     }
8353                     else {
8354
8355                     /*  Currently we only get here when
8356
8357                         PL_rekind[OP(text_node)] == EXACT
8358
8359                         if this changes back then the macro for IS_TEXT and
8360                         friends need to change. */
8361                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8362                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8363                            reginfo))
8364                         {
8365                             sayNO;
8366                         }
8367                     }
8368                 }
8369             }
8370
8371             ST.A = scan;
8372             ST.B = next;
8373             if (minmod) {
8374                 char *li = locinput;
8375                 minmod = 0;
8376                 if (ST.min &&
8377                         regrepeat(rex, &li, ST.A, reginfo, ST.min)
8378                             < ST.min)
8379                     sayNO;
8380                 SET_locinput(li);
8381                 ST.count = ST.min;
8382                 REGCP_SET(ST.cp);
8383                 if (ST.c1 == CHRTEST_VOID)
8384                     goto curly_try_B_min;
8385
8386                 ST.oldloc = locinput;
8387
8388                 /* set ST.maxpos to the furthest point along the
8389                  * string that could possibly match */
8390                 if  (ST.max == REG_INFTY) {
8391                     ST.maxpos = reginfo->strend - 1;
8392                     if (utf8_target)
8393                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8394                             ST.maxpos--;
8395                 }
8396                 else if (utf8_target) {
8397                     int m = ST.max - ST.min;
8398                     for (ST.maxpos = locinput;
8399                          m >0 && ST.maxpos < reginfo->strend; m--)
8400                         ST.maxpos += UTF8SKIP(ST.maxpos);
8401                 }
8402                 else {
8403                     ST.maxpos = locinput + ST.max - ST.min;
8404                     if (ST.maxpos >= reginfo->strend)
8405                         ST.maxpos = reginfo->strend - 1;
8406                 }
8407                 goto curly_try_B_min_known;
8408
8409             }
8410             else {
8411                 /* avoid taking address of locinput, so it can remain
8412                  * a register var */
8413                 char *li = locinput;
8414                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8415                 if (ST.count < ST.min)
8416                     sayNO;
8417                 SET_locinput(li);
8418                 if ((ST.count > ST.min)
8419                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8420                 {
8421                     /* A{m,n} must come at the end of the string, there's
8422                      * no point in backing off ... */
8423                     ST.min = ST.count;
8424                     /* ...except that $ and \Z can match before *and* after
8425                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8426                        We may back off by one in this case. */
8427                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8428                         ST.min--;
8429                 }
8430                 REGCP_SET(ST.cp);
8431                 goto curly_try_B_max;
8432             }
8433             NOT_REACHED; /* NOTREACHED */
8434
8435         case CURLY_B_min_known_fail:
8436             /* failed to find B in a non-greedy match where c1,c2 valid */
8437
8438             REGCP_UNWIND(ST.cp);
8439             if (ST.paren) {
8440                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8441             }
8442             /* Couldn't or didn't -- move forward. */
8443             ST.oldloc = locinput;
8444             if (utf8_target)
8445                 locinput += UTF8SKIP(locinput);
8446             else
8447                 locinput++;
8448             ST.count++;
8449           curly_try_B_min_known:
8450              /* find the next place where 'B' could work, then call B */
8451             {
8452                 int n;
8453                 if (utf8_target) {
8454                     n = (ST.oldloc == locinput) ? 0 : 1;
8455                     if (ST.c1 == ST.c2) {
8456                         /* set n to utf8_distance(oldloc, locinput) */
8457                         while (locinput <= ST.maxpos
8458                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8459                         {
8460                             locinput += UTF8SKIP(locinput);
8461                             n++;
8462                         }
8463                     }
8464                     else {
8465                         /* set n to utf8_distance(oldloc, locinput) */
8466                         while (locinput <= ST.maxpos
8467                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8468                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8469                         {
8470                             locinput += UTF8SKIP(locinput);
8471                             n++;
8472                         }
8473                     }
8474                 }
8475                 else {  /* Not utf8_target */
8476                     if (ST.c1 == ST.c2) {
8477                         locinput = (char *) memchr(locinput,
8478                                                    ST.c1,
8479                                                    ST.maxpos + 1 - locinput);
8480                         if (! locinput) {
8481                             locinput = ST.maxpos + 1;
8482                         }
8483                     }
8484                     else {
8485                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8486
8487                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8488                             while (   locinput <= ST.maxpos
8489                                    && UCHARAT(locinput) != ST.c1
8490                                    && UCHARAT(locinput) != ST.c2)
8491                             {
8492                                 locinput++;
8493                             }
8494                         }
8495                         else {
8496                             /* If c1 and c2 only differ by a single bit, we can
8497                              * avoid a conditional each time through the loop,
8498                              * at the expense of a little preliminary setup and
8499                              * an extra mask each iteration.  By masking out
8500                              * that bit, we match exactly two characters, c1
8501                              * and c2, and so we don't have to test for both.
8502                              * On both ASCII and EBCDIC platforms, most of the
8503                              * ASCII-range and Latin1-range folded equivalents
8504                              * differ only in a single bit, so this is actually
8505                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8506                              * 0x61). */
8507                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8508                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8509                             while (   locinput <= ST.maxpos
8510                                    && (UCHARAT(locinput) & c1_c2_mask)
8511                                                                 != c1_masked)
8512                             {
8513                                 locinput++;
8514                             }
8515                         }
8516                     }
8517                     n = locinput - ST.oldloc;
8518                 }
8519                 if (locinput > ST.maxpos)
8520                     sayNO;
8521                 if (n) {
8522                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8523                      * at b; check that everything between oldloc and
8524                      * locinput matches */
8525                     char *li = ST.oldloc;
8526                     ST.count += n;
8527                     if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8528                         sayNO;
8529                     assert(n == REG_INFTY || locinput == li);
8530                 }
8531                 CURLY_SETPAREN(ST.paren, ST.count);
8532                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8533                     goto fake_end;
8534                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8535             }
8536             NOT_REACHED; /* NOTREACHED */
8537
8538         case CURLY_B_min_fail:
8539             /* failed to find B in a non-greedy match where c1,c2 invalid */
8540
8541             REGCP_UNWIND(ST.cp);
8542             if (ST.paren) {
8543                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8544             }
8545             /* failed -- move forward one */
8546             {
8547                 char *li = locinput;
8548                 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8549                     sayNO;
8550                 }
8551                 locinput = li;
8552             }
8553             {
8554                 ST.count++;
8555                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8556                         ST.count > 0)) /* count overflow ? */
8557                 {
8558                   curly_try_B_min:
8559                     CURLY_SETPAREN(ST.paren, ST.count);
8560                     if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8561                         goto fake_end;
8562                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8563                 }
8564             }
8565             sayNO;
8566             NOT_REACHED; /* NOTREACHED */
8567
8568           curly_try_B_max:
8569             /* a successful greedy match: now try to match B */
8570             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8571                 goto fake_end;
8572             {
8573                 bool could_match = locinput < reginfo->strend;
8574
8575                 /* If it could work, try it. */
8576                 if (ST.c1 != CHRTEST_VOID && could_match) {
8577                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8578                     {
8579                         could_match = memEQ(locinput,
8580                                             ST.c1_utf8,
8581                                             UTF8SKIP(locinput))
8582                                     || memEQ(locinput,
8583                                              ST.c2_utf8,
8584                                              UTF8SKIP(locinput));
8585                     }
8586                     else {
8587                         could_match = UCHARAT(locinput) == ST.c1
8588                                       || UCHARAT(locinput) == ST.c2;
8589                     }
8590                 }
8591                 if (ST.c1 == CHRTEST_VOID || could_match) {
8592                     CURLY_SETPAREN(ST.paren, ST.count);
8593                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8594                     NOT_REACHED; /* NOTREACHED */
8595                 }
8596             }
8597             /* FALLTHROUGH */
8598
8599         case CURLY_B_max_fail:
8600             /* failed to find B in a greedy match */
8601
8602             REGCP_UNWIND(ST.cp);
8603             if (ST.paren) {
8604                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8605             }
8606             /*  back up. */
8607             if (--ST.count < ST.min)
8608                 sayNO;
8609             locinput = HOPc(locinput, -1);
8610             goto curly_try_B_max;
8611
8612 #undef ST
8613
8614         case END: /*  last op of main pattern  */
8615           fake_end:
8616             if (cur_eval) {
8617                 /* we've just finished A in /(??{A})B/; now continue with B */
8618                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8619                 st->u.eval.prev_rex = rex_sv;           /* inner */
8620
8621                 /* Save *all* the positions. */
8622                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8623                 rex_sv = CUR_EVAL.prev_rex;
8624                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8625                 SET_reg_curpm(rex_sv);
8626                 rex = ReANY(rex_sv);
8627                 rexi = RXi_GET(rex);
8628
8629                 st->u.eval.prev_curlyx = cur_curlyx;
8630                 cur_curlyx = CUR_EVAL.prev_curlyx;
8631
8632                 REGCP_SET(st->u.eval.lastcp);
8633
8634                 /* Restore parens of the outer rex without popping the
8635                  * savestack */
8636                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8637
8638                 st->u.eval.prev_eval = cur_eval;
8639                 cur_eval = CUR_EVAL.prev_eval;
8640                 DEBUG_EXECUTE_r(
8641                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8642                                       depth, cur_eval););
8643                 if ( nochange_depth )
8644                     nochange_depth--;
8645
8646                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8647
8648                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8649                                     locinput); /* match B */
8650             }
8651
8652             if (locinput < reginfo->till) {
8653                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8654                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8655                                       PL_colors[4],
8656                                       (long)(locinput - startpos),
8657                                       (long)(reginfo->till - startpos),
8658                                       PL_colors[5]));
8659
8660                 sayNO_SILENT;           /* Cannot match: too short. */
8661             }
8662             sayYES;                     /* Success! */
8663
8664         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8665             DEBUG_EXECUTE_r(
8666             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8667                 depth, PL_colors[4], PL_colors[5]));
8668             sayYES;                     /* Success! */
8669
8670 #undef  ST
8671 #define ST st->u.ifmatch
8672
8673         {
8674             char *newstart;
8675
8676         case SUSPEND:   /* (?>A) */
8677             ST.wanted = 1;
8678             newstart = locinput;
8679             goto do_ifmatch;
8680
8681         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
8682             ST.wanted = 0;
8683             goto ifmatch_trivial_fail_test;
8684
8685         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
8686             ST.wanted = 1;
8687           ifmatch_trivial_fail_test:
8688             if (scan->flags) {
8689                 char * const s = HOPBACKc(locinput, scan->flags);
8690                 if (!s) {
8691                     /* trivial fail */
8692                     if (logical) {
8693                         logical = 0;
8694                         sw = 1 - cBOOL(ST.wanted);
8695                     }
8696                     else if (ST.wanted)
8697                         sayNO;
8698                     next = scan + ARG(scan);
8699                     if (next == scan)
8700                         next = NULL;
8701                     break;
8702                 }
8703                 newstart = s;
8704             }
8705             else
8706                 newstart = locinput;
8707
8708           do_ifmatch:
8709             ST.me = scan;
8710             ST.logical = logical;
8711             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8712
8713             /* execute body of (?...A) */
8714             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8715             NOT_REACHED; /* NOTREACHED */
8716         }
8717
8718         case IFMATCH_A_fail: /* body of (?...A) failed */
8719             ST.wanted = !ST.wanted;
8720             /* FALLTHROUGH */
8721
8722         case IFMATCH_A: /* body of (?...A) succeeded */
8723             if (ST.logical) {
8724                 sw = cBOOL(ST.wanted);
8725             }
8726             else if (!ST.wanted)
8727                 sayNO;
8728
8729             if (OP(ST.me) != SUSPEND) {
8730                 /* restore old position except for (?>...) */
8731                 locinput = st->locinput;
8732             }
8733             scan = ST.me + ARG(ST.me);
8734             if (scan == ST.me)
8735                 scan = NULL;
8736             continue; /* execute B */
8737
8738 #undef ST
8739
8740         case LONGJMP: /*  alternative with many branches compiles to
8741                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8742             next = scan + ARG(scan);
8743             if (next == scan)
8744                 next = NULL;
8745             break;
8746
8747         case COMMIT:  /*  (*COMMIT)  */
8748             reginfo->cutpoint = reginfo->strend;
8749             /* FALLTHROUGH */
8750
8751         case PRUNE:   /*  (*PRUNE)   */
8752             if (scan->flags)
8753                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8754             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8755             NOT_REACHED; /* NOTREACHED */
8756
8757         case COMMIT_next_fail:
8758             no_final = 1;
8759             /* FALLTHROUGH */
8760             sayNO;
8761             NOT_REACHED; /* NOTREACHED */
8762
8763         case OPFAIL:   /* (*FAIL)  */
8764             if (scan->flags)
8765                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8766             if (logical) {
8767                 /* deal with (?(?!)X|Y) properly,
8768                  * make sure we trigger the no branch
8769                  * of the trailing IFTHEN structure*/
8770                 sw= 0;
8771                 break;
8772             } else {
8773                 sayNO;
8774             }
8775             NOT_REACHED; /* NOTREACHED */
8776
8777 #define ST st->u.mark
8778         case MARKPOINT: /*  (*MARK:foo)  */
8779             ST.prev_mark = mark_state;
8780             ST.mark_name = sv_commit = sv_yes_mark
8781                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8782             mark_state = st;
8783             ST.mark_loc = locinput;
8784             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8785             NOT_REACHED; /* NOTREACHED */
8786
8787         case MARKPOINT_next:
8788             mark_state = ST.prev_mark;
8789             sayYES;
8790             NOT_REACHED; /* NOTREACHED */
8791
8792         case MARKPOINT_next_fail:
8793             if (popmark && sv_eq(ST.mark_name,popmark))
8794             {
8795                 if (ST.mark_loc > startpoint)
8796                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8797                 popmark = NULL; /* we found our mark */
8798                 sv_commit = ST.mark_name;
8799
8800                 DEBUG_EXECUTE_r({
8801                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8802                             depth,
8803                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8804                 });
8805             }
8806             mark_state = ST.prev_mark;
8807             sv_yes_mark = mark_state ?
8808                 mark_state->u.mark.mark_name : NULL;
8809             sayNO;
8810             NOT_REACHED; /* NOTREACHED */
8811
8812         case SKIP:  /*  (*SKIP)  */
8813             if (!scan->flags) {
8814                 /* (*SKIP) : if we fail we cut here*/
8815                 ST.mark_name = NULL;
8816                 ST.mark_loc = locinput;
8817                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8818             } else {
8819                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8820                    otherwise do nothing.  Meaning we need to scan
8821                  */
8822                 regmatch_state *cur = mark_state;
8823                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8824
8825                 while (cur) {
8826                     if ( sv_eq( cur->u.mark.mark_name,
8827                                 find ) )
8828                     {
8829                         ST.mark_name = find;
8830                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
8831                     }
8832                     cur = cur->u.mark.prev_mark;
8833                 }
8834             }
8835             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8836             break;
8837
8838         case SKIP_next_fail:
8839             if (ST.mark_name) {
8840                 /* (*CUT:NAME) - Set up to search for the name as we
8841                    collapse the stack*/
8842                 popmark = ST.mark_name;
8843             } else {
8844                 /* (*CUT) - No name, we cut here.*/
8845                 if (ST.mark_loc > startpoint)
8846                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8847                 /* but we set sv_commit to latest mark_name if there
8848                    is one so they can test to see how things lead to this
8849                    cut */
8850                 if (mark_state)
8851                     sv_commit=mark_state->u.mark.mark_name;
8852             }
8853             no_final = 1;
8854             sayNO;
8855             NOT_REACHED; /* NOTREACHED */
8856 #undef ST
8857
8858         case LNBREAK: /* \R */
8859             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8860                 locinput += n;
8861             } else
8862                 sayNO;
8863             break;
8864
8865         default:
8866             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8867                           PTR2UV(scan), OP(scan));
8868             Perl_croak(aTHX_ "regexp memory corruption");
8869
8870         /* this is a point to jump to in order to increment
8871          * locinput by one character */
8872           increment_locinput:
8873             assert(!NEXTCHR_IS_EOS);
8874             if (utf8_target) {
8875                 locinput += PL_utf8skip[nextchr];
8876                 /* locinput is allowed to go 1 char off the end (signifying
8877                  * EOS), but not 2+ */
8878                 if (locinput > reginfo->strend)
8879                     sayNO;
8880             }
8881             else
8882                 locinput++;
8883             break;
8884
8885         } /* end switch */
8886
8887         /* switch break jumps here */
8888         scan = next; /* prepare to execute the next op and ... */
8889         continue;    /* ... jump back to the top, reusing st */
8890         /* NOTREACHED */
8891
8892       push_yes_state:
8893         /* push a state that backtracks on success */
8894         st->u.yes.prev_yes_state = yes_state;
8895         yes_state = st;
8896         /* FALLTHROUGH */
8897       push_state:
8898         /* push a new regex state, then continue at scan  */
8899         {
8900             regmatch_state *newst;
8901
8902             DEBUG_STACK_r({
8903                 regmatch_state *cur = st;
8904                 regmatch_state *curyes = yes_state;
8905                 U32 i;
8906                 regmatch_slab *slab = PL_regmatch_slab;
8907                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8908                     if (cur < SLAB_FIRST(slab)) {
8909                         slab = slab->prev;
8910                         cur = SLAB_LAST(slab);
8911                     }
8912                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8913                         depth,
8914                         i ? "    " : "push",
8915                         depth - i, PL_reg_name[cur->resume_state],
8916                         (curyes == cur) ? "yes" : ""
8917                     );
8918                     if (curyes == cur)
8919                         curyes = cur->u.yes.prev_yes_state;
8920                 }
8921             } else
8922                 DEBUG_STATE_pp("push")
8923             );
8924             depth++;
8925             st->locinput = locinput;
8926             newst = st+1;
8927             if (newst >  SLAB_LAST(PL_regmatch_slab))
8928                 newst = S_push_slab(aTHX);
8929             PL_regmatch_state = newst;
8930
8931             locinput = pushinput;
8932             st = newst;
8933             continue;
8934             /* NOTREACHED */
8935         }
8936     }
8937 #ifdef SOLARIS_BAD_OPTIMIZER
8938 #  undef PL_charclass
8939 #endif
8940
8941     /*
8942     * We get here only if there's trouble -- normally "case END" is
8943     * the terminating point.
8944     */
8945     Perl_croak(aTHX_ "corrupted regexp pointers");
8946     NOT_REACHED; /* NOTREACHED */
8947
8948   yes:
8949     if (yes_state) {
8950         /* we have successfully completed a subexpression, but we must now
8951          * pop to the state marked by yes_state and continue from there */
8952         assert(st != yes_state);
8953 #ifdef DEBUGGING
8954         while (st != yes_state) {
8955             st--;
8956             if (st < SLAB_FIRST(PL_regmatch_slab)) {
8957                 PL_regmatch_slab = PL_regmatch_slab->prev;
8958                 st = SLAB_LAST(PL_regmatch_slab);
8959             }
8960             DEBUG_STATE_r({
8961                 if (no_final) {
8962                     DEBUG_STATE_pp("pop (no final)");
8963                 } else {
8964                     DEBUG_STATE_pp("pop (yes)");
8965                 }
8966             });
8967             depth--;
8968         }
8969 #else
8970         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8971             || yes_state > SLAB_LAST(PL_regmatch_slab))
8972         {
8973             /* not in this slab, pop slab */
8974             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8975             PL_regmatch_slab = PL_regmatch_slab->prev;
8976             st = SLAB_LAST(PL_regmatch_slab);
8977         }
8978         depth -= (st - yes_state);
8979 #endif
8980         st = yes_state;
8981         yes_state = st->u.yes.prev_yes_state;
8982         PL_regmatch_state = st;
8983
8984         if (no_final)
8985             locinput= st->locinput;
8986         state_num = st->resume_state + no_final;
8987         goto reenter_switch;
8988     }
8989
8990     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
8991                           PL_colors[4], PL_colors[5]));
8992
8993     if (reginfo->info_aux_eval) {
8994         /* each successfully executed (?{...}) block does the equivalent of
8995          *   local $^R = do {...}
8996          * When popping the save stack, all these locals would be undone;
8997          * bypass this by setting the outermost saved $^R to the latest
8998          * value */
8999         /* I dont know if this is needed or works properly now.
9000          * see code related to PL_replgv elsewhere in this file.
9001          * Yves
9002          */
9003         if (oreplsv != GvSV(PL_replgv))
9004             sv_setsv(oreplsv, GvSV(PL_replgv));
9005     }
9006     result = 1;
9007     goto final_exit;
9008
9009   no:
9010     DEBUG_EXECUTE_r(
9011         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9012             depth,
9013             PL_colors[4], PL_colors[5])
9014         );
9015
9016   no_silent:
9017     if (no_final) {
9018         if (yes_state) {
9019             goto yes;
9020         } else {
9021             goto final_exit;
9022         }
9023     }
9024     if (depth) {
9025         /* there's a previous state to backtrack to */
9026         st--;
9027         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9028             PL_regmatch_slab = PL_regmatch_slab->prev;
9029             st = SLAB_LAST(PL_regmatch_slab);
9030         }
9031         PL_regmatch_state = st;
9032         locinput= st->locinput;
9033
9034         DEBUG_STATE_pp("pop");
9035         depth--;
9036         if (yes_state == st)
9037             yes_state = st->u.yes.prev_yes_state;
9038
9039         state_num = st->resume_state + 1; /* failure = success + 1 */
9040         PERL_ASYNC_CHECK();
9041         goto reenter_switch;
9042     }
9043     result = 0;
9044
9045   final_exit:
9046     if (rex->intflags & PREGf_VERBARG_SEEN) {
9047         SV *sv_err = get_sv("REGERROR", 1);
9048         SV *sv_mrk = get_sv("REGMARK", 1);
9049         if (result) {
9050             sv_commit = &PL_sv_no;
9051             if (!sv_yes_mark)
9052                 sv_yes_mark = &PL_sv_yes;
9053         } else {
9054             if (!sv_commit)
9055                 sv_commit = &PL_sv_yes;
9056             sv_yes_mark = &PL_sv_no;
9057         }
9058         assert(sv_err);
9059         assert(sv_mrk);
9060         sv_setsv(sv_err, sv_commit);
9061         sv_setsv(sv_mrk, sv_yes_mark);
9062     }
9063
9064
9065     if (last_pushed_cv) {
9066         dSP;
9067         /* see "Some notes about MULTICALL" above */
9068         POP_MULTICALL;
9069         PERL_UNUSED_VAR(SP);
9070     }
9071     else
9072         LEAVE_SCOPE(orig_savestack_ix);
9073
9074     assert(!result ||  locinput - reginfo->strbeg >= 0);
9075     return result ?  locinput - reginfo->strbeg : -1;
9076 }
9077
9078 /*
9079  - regrepeat - repeatedly match something simple, report how many
9080  *
9081  * What 'simple' means is a node which can be the operand of a quantifier like
9082  * '+', or {1,3}
9083  *
9084  * startposp - pointer a pointer to the start position.  This is updated
9085  *             to point to the byte following the highest successful
9086  *             match.
9087  * p         - the regnode to be repeatedly matched against.
9088  * reginfo   - struct holding match state, such as strend
9089  * max       - maximum number of things to match.
9090  * depth     - (for debugging) backtracking depth.
9091  */
9092 STATIC I32
9093 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9094             regmatch_info *const reginfo, I32 max _pDEPTH)
9095 {
9096     char *scan;     /* Pointer to current position in target string */
9097     I32 c;
9098     char *loceol = reginfo->strend;   /* local version */
9099     I32 hardcount = 0;  /* How many matches so far */
9100     bool utf8_target = reginfo->is_utf8_target;
9101     unsigned int to_complement = 0;  /* Invert the result? */
9102     UV utf8_flags;
9103     _char_class_number classnum;
9104
9105     PERL_ARGS_ASSERT_REGREPEAT;
9106
9107     scan = *startposp;
9108     if (max == REG_INFTY)
9109         max = I32_MAX;
9110     else if (! utf8_target && loceol - scan > max)
9111         loceol = scan + max;
9112
9113     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9114      * to the maximum of how far we should go in it (leaving it set to the real
9115      * end, if the maximum permissible would take us beyond that).  This allows
9116      * us to make the loop exit condition that we haven't gone past <loceol> to
9117      * also mean that we haven't exceeded the max permissible count, saving a
9118      * test each time through the loop.  But it assumes that the OP matches a
9119      * single byte, which is true for most of the OPs below when applied to a
9120      * non-UTF-8 target.  Those relatively few OPs that don't have this
9121      * characteristic will have to compensate.
9122      *
9123      * There is no adjustment for UTF-8 targets, as the number of bytes per
9124      * character varies.  OPs will have to test both that the count is less
9125      * than the max permissible (using <hardcount> to keep track), and that we
9126      * are still within the bounds of the string (using <loceol>.  A few OPs
9127      * match a single byte no matter what the encoding.  They can omit the max
9128      * test if, for the UTF-8 case, they do the adjustment that was skipped
9129      * above.
9130      *
9131      * Thus, the code above sets things up for the common case; and exceptional
9132      * cases need extra work; the common case is to make sure <scan> doesn't
9133      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9134      * count doesn't exceed the maximum permissible */
9135
9136     switch (OP(p)) {
9137     case REG_ANY:
9138         if (utf8_target) {
9139             while (scan < loceol && hardcount < max && *scan != '\n') {
9140                 scan += UTF8SKIP(scan);
9141                 hardcount++;
9142             }
9143         } else {
9144             scan = (char *) memchr(scan, '\n', loceol - scan);
9145             if (! scan) {
9146                 scan = loceol;
9147             }
9148         }
9149         break;
9150     case SANY:
9151         if (utf8_target) {
9152             while (scan < loceol && hardcount < max) {
9153                 scan += UTF8SKIP(scan);
9154                 hardcount++;
9155             }
9156         }
9157         else
9158             scan = loceol;
9159         break;
9160     case EXACTL:
9161         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9162         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9163             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9164         }
9165         /* FALLTHROUGH */
9166     case EXACT:
9167         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9168
9169         c = (U8)*STRING(p);
9170
9171         /* Can use a simple find if the pattern char to match on is invariant
9172          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9173          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9174          * true iff it doesn't matter if the argument is in UTF-8 or not */
9175         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9176             if (utf8_target && loceol - scan > max) {
9177                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9178                  * since here, to match at all, 1 char == 1 byte */
9179                 loceol = scan + max;
9180             }
9181             scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9182         }
9183         else if (reginfo->is_utf8_pat) {
9184             if (utf8_target) {
9185                 STRLEN scan_char_len;
9186
9187                 /* When both target and pattern are UTF-8, we have to do
9188                  * string EQ */
9189                 while (hardcount < max
9190                        && scan < loceol
9191                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9192                        && memEQ(scan, STRING(p), scan_char_len))
9193                 {
9194                     scan += scan_char_len;
9195                     hardcount++;
9196                 }
9197             }
9198             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9199
9200                 /* Target isn't utf8; convert the character in the UTF-8
9201                  * pattern to non-UTF8, and do a simple find */
9202                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9203                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9204             } /* else pattern char is above Latin1, can't possibly match the
9205                  non-UTF-8 target */
9206         }
9207         else {
9208
9209             /* Here, the string must be utf8; pattern isn't, and <c> is
9210              * different in utf8 than not, so can't compare them directly.
9211              * Outside the loop, find the two utf8 bytes that represent c, and
9212              * then look for those in sequence in the utf8 string */
9213             U8 high = UTF8_TWO_BYTE_HI(c);
9214             U8 low = UTF8_TWO_BYTE_LO(c);
9215
9216             while (hardcount < max
9217                     && scan + 1 < loceol
9218                     && UCHARAT(scan) == high
9219                     && UCHARAT(scan + 1) == low)
9220             {
9221                 scan += 2;
9222                 hardcount++;
9223             }
9224         }
9225         break;
9226
9227     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9228         assert(! reginfo->is_utf8_pat);
9229         /* FALLTHROUGH */
9230     case EXACTFAA:
9231         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9232         goto do_exactf;
9233
9234     case EXACTFL:
9235         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9236         utf8_flags = FOLDEQ_LOCALE;
9237         goto do_exactf;
9238
9239     case EXACTF:   /* This node only generated for non-utf8 patterns */
9240         assert(! reginfo->is_utf8_pat);
9241         utf8_flags = 0;
9242         goto do_exactf;
9243
9244     case EXACTFLU8:
9245         if (! utf8_target) {
9246             break;
9247         }
9248         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9249                                     | FOLDEQ_S2_FOLDS_SANE;
9250         goto do_exactf;
9251
9252     case EXACTFU_SS:
9253     case EXACTFU:
9254         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9255
9256       do_exactf: {
9257         int c1, c2;
9258         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9259
9260         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9261
9262         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9263                                         reginfo))
9264         {
9265             if (c1 == CHRTEST_VOID) {
9266                 /* Use full Unicode fold matching */
9267                 char *tmpeol = reginfo->strend;
9268                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9269                 while (hardcount < max
9270                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9271                                              STRING(p), NULL, pat_len,
9272                                              reginfo->is_utf8_pat, utf8_flags))
9273                 {
9274                     scan = tmpeol;
9275                     tmpeol = reginfo->strend;
9276                     hardcount++;
9277                 }
9278             }
9279             else if (utf8_target) {
9280                 if (c1 == c2) {
9281                     while (scan < loceol
9282                            && hardcount < max
9283                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9284                     {
9285                         scan += UTF8SKIP(scan);
9286                         hardcount++;
9287                     }
9288                 }
9289                 else {
9290                     while (scan < loceol
9291                            && hardcount < max
9292                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9293                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9294                     {
9295                         scan += UTF8SKIP(scan);
9296                         hardcount++;
9297                     }
9298                 }
9299             }
9300             else if (c1 == c2) {
9301                 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9302             }
9303             else {
9304                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9305                  * a conditional each time through the loop if the characters
9306                  * differ only in a single bit, as is the usual situation */
9307                 U8 c1_c2_bits_differing = c1 ^ c2;
9308
9309                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9310                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9311
9312                     scan = (char *) find_span_end_mask((U8 *) scan,
9313                                                        (U8 *) loceol,
9314                                                        c1 & c1_c2_mask,
9315                                                        c1_c2_mask);
9316                 }
9317                 else {
9318                     while (    scan < loceol
9319                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9320                     {
9321                         scan++;
9322                     }
9323                 }
9324             }
9325         }
9326         break;
9327     }
9328     case ANYOFL:
9329         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9330
9331         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9332             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9333         }
9334         /* FALLTHROUGH */
9335     case ANYOFD:
9336     case ANYOF:
9337         if (utf8_target) {
9338             while (hardcount < max
9339                    && scan < loceol
9340                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9341             {
9342                 scan += UTF8SKIP(scan);
9343                 hardcount++;
9344             }
9345         }
9346         else if (ANYOF_FLAGS(p)) {
9347             while (scan < loceol
9348                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9349                 scan++;
9350         }
9351         else {
9352             while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9353                 scan++;
9354         }
9355         break;
9356
9357     case ANYOFM:
9358         if (utf8_target && loceol - scan > max) {
9359
9360             /* We didn't adjust <loceol> at the beginning of this routine
9361              * because is UTF-8, but it is actually ok to do so, since here, to
9362              * match, 1 char == 1 byte. */
9363             loceol = scan + max;
9364         }
9365
9366         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9367         break;
9368
9369     case ASCII:
9370         if (utf8_target && loceol - scan > max) {
9371             loceol = scan + max;
9372         }
9373
9374         scan = find_next_non_ascii(scan, loceol, utf8_target);
9375         break;
9376
9377     case NASCII:
9378         if (utf8_target) {
9379             while (     hardcount < max
9380                    &&   scan < loceol
9381                    && ! isASCII_utf8_safe(scan, loceol))
9382             {
9383                 scan += UTF8SKIP(scan);
9384                 hardcount++;
9385             }
9386         }
9387         else {
9388             scan = find_next_ascii(scan, loceol, utf8_target);
9389         }
9390         break;
9391
9392     /* The argument (FLAGS) to all the POSIX node types is the class number */
9393
9394     case NPOSIXL:
9395         to_complement = 1;
9396         /* FALLTHROUGH */
9397
9398     case POSIXL:
9399         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9400         if (! utf8_target) {
9401             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9402                                                                    *scan)))
9403             {
9404                 scan++;
9405             }
9406         } else {
9407             while (hardcount < max && scan < loceol
9408                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9409                                                                   (U8 *) scan,
9410                                                                   (U8 *) loceol)))
9411             {
9412                 scan += UTF8SKIP(scan);
9413                 hardcount++;
9414             }
9415         }
9416         break;
9417
9418     case POSIXD:
9419         if (utf8_target) {
9420             goto utf8_posix;
9421         }
9422         /* FALLTHROUGH */
9423
9424     case POSIXA:
9425         if (utf8_target && loceol - scan > max) {
9426
9427             /* We didn't adjust <loceol> at the beginning of this routine
9428              * because is UTF-8, but it is actually ok to do so, since here, to
9429              * match, 1 char == 1 byte. */
9430             loceol = scan + max;
9431         }
9432         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9433             scan++;
9434         }
9435         break;
9436
9437     case NPOSIXD:
9438         if (utf8_target) {
9439             to_complement = 1;
9440             goto utf8_posix;
9441         }
9442         /* FALLTHROUGH */
9443
9444     case NPOSIXA:
9445         if (! utf8_target) {
9446             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9447                 scan++;
9448             }
9449         }
9450         else {
9451
9452             /* The complement of something that matches only ASCII matches all
9453              * non-ASCII, plus everything in ASCII that isn't in the class. */
9454             while (hardcount < max && scan < loceol
9455                    && (   ! isASCII_utf8_safe(scan, reginfo->strend)
9456                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9457             {
9458                 scan += UTF8SKIP(scan);
9459                 hardcount++;
9460             }
9461         }
9462         break;
9463
9464     case NPOSIXU:
9465         to_complement = 1;
9466         /* FALLTHROUGH */
9467
9468     case POSIXU:
9469         if (! utf8_target) {
9470             while (scan < loceol && to_complement
9471                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9472             {
9473                 scan++;
9474             }
9475         }
9476         else {
9477           utf8_posix:
9478             classnum = (_char_class_number) FLAGS(p);
9479             switch (classnum) {
9480                 default:
9481                     while (   hardcount < max && scan < loceol
9482                            && to_complement ^ cBOOL(_invlist_contains_cp(
9483                                               PL_XPosix_ptrs[classnum],
9484                                               utf8_to_uvchr_buf((U8 *) scan,
9485                                                                 (U8 *) loceol,
9486                                                                 NULL))))
9487                     {
9488                         scan += UTF8SKIP(scan);
9489                         hardcount++;
9490                     }
9491                     break;
9492
9493                     /* For the classes below, the knowledge of how to handle
9494                      * every code point is compiled in to Perl via a macro.
9495                      * This code is written for making the loops as tight as
9496                      * possible.  It could be refactored to save space instead.
9497                      * */
9498
9499                 case _CC_ENUM_SPACE:
9500                     while (hardcount < max
9501                            && scan < loceol
9502                            && (to_complement
9503                                ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9504                     {
9505                         scan += UTF8SKIP(scan);
9506                         hardcount++;
9507                     }
9508                     break;
9509                 case _CC_ENUM_BLANK:
9510                     while (hardcount < max
9511                            && scan < loceol
9512                            && (to_complement
9513                                 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9514                     {
9515                         scan += UTF8SKIP(scan);
9516                         hardcount++;
9517                     }
9518                     break;
9519                 case _CC_ENUM_XDIGIT:
9520                     while (hardcount < max
9521                            && scan < loceol
9522                            && (to_complement
9523                                ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9524                     {
9525                         scan += UTF8SKIP(scan);
9526                         hardcount++;
9527                     }
9528                     break;
9529                 case _CC_ENUM_VERTSPACE:
9530                     while (hardcount < max
9531                            && scan < loceol
9532                            && (to_complement
9533                                ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9534                     {
9535                         scan += UTF8SKIP(scan);
9536                         hardcount++;
9537                     }
9538                     break;
9539                 case _CC_ENUM_CNTRL:
9540                     while (hardcount < max
9541                            && scan < loceol
9542                            && (to_complement
9543                                ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9544                     {
9545                         scan += UTF8SKIP(scan);
9546                         hardcount++;
9547                     }
9548                     break;
9549             }
9550         }
9551         break;
9552
9553     case LNBREAK:
9554         if (utf8_target) {
9555             while (hardcount < max && scan < loceol &&
9556                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9557                 scan += c;
9558                 hardcount++;
9559             }
9560         } else {
9561             /* LNBREAK can match one or two latin chars, which is ok, but we
9562              * have to use hardcount in this situation, and throw away the
9563              * adjustment to <loceol> done before the switch statement */
9564             loceol = reginfo->strend;
9565             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9566                 scan+=c;
9567                 hardcount++;
9568             }
9569         }
9570         break;
9571
9572     case BOUNDL:
9573     case NBOUNDL:
9574         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9575         /* FALLTHROUGH */
9576     case BOUND:
9577     case BOUNDA:
9578     case BOUNDU:
9579     case EOS:
9580     case GPOS:
9581     case KEEPS:
9582     case NBOUND:
9583     case NBOUNDA:
9584     case NBOUNDU:
9585     case OPFAIL:
9586     case SBOL:
9587     case SEOL:
9588         /* These are all 0 width, so match right here or not at all. */
9589         break;
9590
9591     default:
9592         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9593         NOT_REACHED; /* NOTREACHED */
9594
9595     }
9596
9597     if (hardcount)
9598         c = hardcount;
9599     else
9600         c = scan - *startposp;
9601     *startposp = scan;
9602
9603     DEBUG_r({
9604         GET_RE_DEBUG_FLAGS_DECL;
9605         DEBUG_EXECUTE_r({
9606             SV * const prop = sv_newmortal();
9607             regprop(prog, prop, p, reginfo, NULL);
9608             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
9609                         depth, SvPVX_const(prop),(IV)c,(IV)max);
9610         });
9611     });
9612
9613     return(c);
9614 }
9615
9616
9617 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9618 /*
9619 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
9620 create a copy so that changes the caller makes won't change the shared one.
9621 If <altsvp> is non-null, will return NULL in it, for back-compat.
9622  */
9623 SV *
9624 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9625 {
9626     PERL_ARGS_ASSERT_REGCLASS_SWASH;
9627
9628     if (altsvp) {
9629         *altsvp = NULL;
9630     }
9631
9632     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9633 }
9634
9635 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9636
9637 /*
9638  - reginclass - determine if a character falls into a character class
9639
9640   n is the ANYOF-type regnode
9641   p is the target string
9642   p_end points to one byte beyond the end of the target string
9643   utf8_target tells whether p is in UTF-8.
9644
9645   Returns true if matched; false otherwise.
9646
9647   Note that this can be a synthetic start class, a combination of various
9648   nodes, so things you think might be mutually exclusive, such as locale,
9649   aren't.  It can match both locale and non-locale
9650
9651  */
9652
9653 STATIC bool
9654 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9655 {
9656     dVAR;
9657     const char flags = ANYOF_FLAGS(n);
9658     bool match = FALSE;
9659     UV c = *p;
9660
9661     PERL_ARGS_ASSERT_REGINCLASS;
9662
9663     /* If c is not already the code point, get it.  Note that
9664      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9665     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9666         STRLEN c_len = 0;
9667         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9668         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9669         if (c_len == (STRLEN)-1) {
9670             _force_out_malformed_utf8_message(p, p_end,
9671                                               utf8n_flags,
9672                                               1 /* 1 means die */ );
9673             NOT_REACHED; /* NOTREACHED */
9674         }
9675         if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9676             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9677         }
9678     }
9679
9680     /* If this character is potentially in the bitmap, check it */
9681     if (c < NUM_ANYOF_CODE_POINTS) {
9682         if (ANYOF_BITMAP_TEST(n, c))
9683             match = TRUE;
9684         else if ((flags
9685                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9686                   && OP(n) == ANYOFD
9687                   && ! utf8_target
9688                   && ! isASCII(c))
9689         {
9690             match = TRUE;
9691         }
9692         else if (flags & ANYOF_LOCALE_FLAGS) {
9693             if ((flags & ANYOFL_FOLD)
9694                 && c < 256
9695                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9696             {
9697                 match = TRUE;
9698             }
9699             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9700                      && c < 256
9701             ) {
9702
9703                 /* The data structure is arranged so bits 0, 2, 4, ... are set
9704                  * if the class includes the Posix character class given by
9705                  * bit/2; and 1, 3, 5, ... are set if the class includes the
9706                  * complemented Posix class given by int(bit/2).  So we loop
9707                  * through the bits, each time changing whether we complement
9708                  * the result or not.  Suppose for the sake of illustration
9709                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
9710                  * is set, it means there is a match for this ANYOF node if the
9711                  * character is in the class given by the expression (0 / 2 = 0
9712                  * = \w).  If it is in that class, isFOO_lc() will return 1,
9713                  * and since 'to_complement' is 0, the result will stay TRUE,
9714                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
9715                  * bit 1 is 1.  That means there is a match if the character
9716                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
9717                  * but will on bit 1.  On the second iteration 'to_complement'
9718                  * will be 1, so the exclusive or will reverse things, so we
9719                  * are testing for \W.  On the third iteration, 'to_complement'
9720                  * will be 0, and we would be testing for \s; the fourth
9721                  * iteration would test for \S, etc.
9722                  *
9723                  * Note that this code assumes that all the classes are closed
9724                  * under folding.  For example, if a character matches \w, then
9725                  * its fold does too; and vice versa.  This should be true for
9726                  * any well-behaved locale for all the currently defined Posix
9727                  * classes, except for :lower: and :upper:, which are handled
9728                  * by the pseudo-class :cased: which matches if either of the
9729                  * other two does.  To get rid of this assumption, an outer
9730                  * loop could be used below to iterate over both the source
9731                  * character, and its fold (if different) */
9732
9733                 int count = 0;
9734                 int to_complement = 0;
9735
9736                 while (count < ANYOF_MAX) {
9737                     if (ANYOF_POSIXL_TEST(n, count)
9738                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9739                     {
9740                         match = TRUE;
9741                         break;
9742                     }
9743                     count++;
9744                     to_complement ^= 1;
9745                 }
9746             }
9747         }
9748     }
9749
9750
9751     /* If the bitmap didn't (or couldn't) match, and something outside the
9752      * bitmap could match, try that. */
9753     if (!match) {
9754         if (c >= NUM_ANYOF_CODE_POINTS
9755             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9756         {
9757             match = TRUE;       /* Everything above the bitmap matches */
9758         }
9759             /* Here doesn't match everything above the bitmap.  If there is
9760              * some information available beyond the bitmap, we may find a
9761              * match in it.  If so, this is most likely because the code point
9762              * is outside the bitmap range.  But rarely, it could be because of
9763              * some other reason.  If so, various flags are set to indicate
9764              * this possibility.  On ANYOFD nodes, there may be matches that
9765              * happen only when the target string is UTF-8; or for other node
9766              * types, because runtime lookup is needed, regardless of the
9767              * UTF-8ness of the target string.  Finally, under /il, there may
9768              * be some matches only possible if the locale is a UTF-8 one. */
9769         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
9770                  && (   c >= NUM_ANYOF_CODE_POINTS
9771                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9772                          && (   UNLIKELY(OP(n) != ANYOFD)
9773                              || (utf8_target && ! isASCII_uni(c)
9774 #                               if NUM_ANYOF_CODE_POINTS > 256
9775                                                                  && c < 256
9776 #                               endif
9777                                 )))
9778                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9779                          && IN_UTF8_CTYPE_LOCALE)))
9780         {
9781             SV* only_utf8_locale = NULL;
9782             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9783                                                        &only_utf8_locale, NULL);
9784             if (sw) {
9785                 U8 utf8_buffer[2];
9786                 U8 * utf8_p;
9787                 if (utf8_target) {
9788                     utf8_p = (U8 *) p;
9789                 } else { /* Convert to utf8 */
9790                     utf8_p = utf8_buffer;
9791                     append_utf8_from_native_byte(*p, &utf8_p);
9792                     utf8_p = utf8_buffer;
9793                 }
9794
9795                 if (swash_fetch(sw, utf8_p, TRUE)) {
9796                     match = TRUE;
9797                 }
9798             }
9799             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9800                 match = _invlist_contains_cp(only_utf8_locale, c);
9801             }
9802         }
9803
9804         if (UNICODE_IS_SUPER(c)
9805             && (flags
9806                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9807             && OP(n) != ANYOFD
9808             && ckWARN_d(WARN_NON_UNICODE))
9809         {
9810             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9811                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9812         }
9813     }
9814
9815 #if ANYOF_INVERT != 1
9816     /* Depending on compiler optimization cBOOL takes time, so if don't have to
9817      * use it, don't */
9818 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9819 #endif
9820
9821     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9822     return (flags & ANYOF_INVERT) ^ match;
9823 }
9824
9825 STATIC U8 *
9826 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9827 {
9828     /* return the position 'off' UTF-8 characters away from 's', forward if
9829      * 'off' >= 0, backwards if negative.  But don't go outside of position
9830      * 'lim', which better be < s  if off < 0 */
9831
9832     PERL_ARGS_ASSERT_REGHOP3;
9833
9834     if (off >= 0) {
9835         while (off-- && s < lim) {
9836             /* XXX could check well-formedness here */
9837             U8 *new_s = s + UTF8SKIP(s);
9838             if (new_s > lim) /* lim may be in the middle of a long character */
9839                 return s;
9840             s = new_s;
9841         }
9842     }
9843     else {
9844         while (off++ && s > lim) {
9845             s--;
9846             if (UTF8_IS_CONTINUED(*s)) {
9847                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9848                     s--;
9849                 if (! UTF8_IS_START(*s)) {
9850                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9851                 }
9852             }
9853             /* XXX could check well-formedness here */
9854         }
9855     }
9856     return s;
9857 }
9858
9859 STATIC U8 *
9860 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9861 {
9862     PERL_ARGS_ASSERT_REGHOP4;
9863
9864     if (off >= 0) {
9865         while (off-- && s < rlim) {
9866             /* XXX could check well-formedness here */
9867             s += UTF8SKIP(s);
9868         }
9869     }
9870     else {
9871         while (off++ && s > llim) {
9872             s--;
9873             if (UTF8_IS_CONTINUED(*s)) {
9874                 while (s > llim && UTF8_IS_CONTINUATION(*s))
9875                     s--;
9876                 if (! UTF8_IS_START(*s)) {
9877                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9878                 }
9879             }
9880             /* XXX could check well-formedness here */
9881         }
9882     }
9883     return s;
9884 }
9885
9886 /* like reghop3, but returns NULL on overrun, rather than returning last
9887  * char pos */
9888
9889 STATIC U8 *
9890 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9891 {
9892     PERL_ARGS_ASSERT_REGHOPMAYBE3;
9893
9894     if (off >= 0) {
9895         while (off-- && s < lim) {
9896             /* XXX could check well-formedness here */
9897             s += UTF8SKIP(s);
9898         }
9899         if (off >= 0)
9900             return NULL;
9901     }
9902     else {
9903         while (off++ && s > lim) {
9904             s--;
9905             if (UTF8_IS_CONTINUED(*s)) {
9906                 while (s > lim && UTF8_IS_CONTINUATION(*s))
9907                     s--;
9908                 if (! UTF8_IS_START(*s)) {
9909                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9910                 }
9911             }
9912             /* XXX could check well-formedness here */
9913         }
9914         if (off <= 0)
9915             return NULL;
9916     }
9917     return s;
9918 }
9919
9920
9921 /* when executing a regex that may have (?{}), extra stuff needs setting
9922    up that will be visible to the called code, even before the current
9923    match has finished. In particular:
9924
9925    * $_ is localised to the SV currently being matched;
9926    * pos($_) is created if necessary, ready to be updated on each call-out
9927      to code;
9928    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9929      isn't set until the current pattern is successfully finished), so that
9930      $1 etc of the match-so-far can be seen;
9931    * save the old values of subbeg etc of the current regex, and  set then
9932      to the current string (again, this is normally only done at the end
9933      of execution)
9934 */
9935
9936 static void
9937 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9938 {
9939     MAGIC *mg;
9940     regexp *const rex = ReANY(reginfo->prog);
9941     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9942
9943     eval_state->rex = rex;
9944
9945     if (reginfo->sv) {
9946         /* Make $_ available to executed code. */
9947         if (reginfo->sv != DEFSV) {
9948             SAVE_DEFSV;
9949             DEFSV_set(reginfo->sv);
9950         }
9951
9952         if (!(mg = mg_find_mglob(reginfo->sv))) {
9953             /* prepare for quick setting of pos */
9954             mg = sv_magicext_mglob(reginfo->sv);
9955             mg->mg_len = -1;
9956         }
9957         eval_state->pos_magic = mg;
9958         eval_state->pos       = mg->mg_len;
9959         eval_state->pos_flags = mg->mg_flags;
9960     }
9961     else
9962         eval_state->pos_magic = NULL;
9963
9964     if (!PL_reg_curpm) {
9965         /* PL_reg_curpm is a fake PMOP that we can attach the current
9966          * regex to and point PL_curpm at, so that $1 et al are visible
9967          * within a /(?{})/. It's just allocated once per interpreter the
9968          * first time its needed */
9969         Newxz(PL_reg_curpm, 1, PMOP);
9970 #ifdef USE_ITHREADS
9971         {
9972             SV* const repointer = &PL_sv_undef;
9973             /* this regexp is also owned by the new PL_reg_curpm, which
9974                will try to free it.  */
9975             av_push(PL_regex_padav, repointer);
9976             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9977             PL_regex_pad = AvARRAY(PL_regex_padav);
9978         }
9979 #endif
9980     }
9981     SET_reg_curpm(reginfo->prog);
9982     eval_state->curpm = PL_curpm;
9983     PL_curpm_under = PL_curpm;
9984     PL_curpm = PL_reg_curpm;
9985     if (RXp_MATCH_COPIED(rex)) {
9986         /*  Here is a serious problem: we cannot rewrite subbeg,
9987             since it may be needed if this match fails.  Thus
9988             $` inside (?{}) could fail... */
9989         eval_state->subbeg     = rex->subbeg;
9990         eval_state->sublen     = rex->sublen;
9991         eval_state->suboffset  = rex->suboffset;
9992         eval_state->subcoffset = rex->subcoffset;
9993 #ifdef PERL_ANY_COW
9994         eval_state->saved_copy = rex->saved_copy;
9995 #endif
9996         RXp_MATCH_COPIED_off(rex);
9997     }
9998     else
9999         eval_state->subbeg = NULL;
10000     rex->subbeg = (char *)reginfo->strbeg;
10001     rex->suboffset = 0;
10002     rex->subcoffset = 0;
10003     rex->sublen = reginfo->strend - reginfo->strbeg;
10004 }
10005
10006
10007 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10008
10009 static void
10010 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10011 {
10012     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10013     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10014     regmatch_slab *s;
10015
10016     Safefree(aux->poscache);
10017
10018     if (eval_state) {
10019
10020         /* undo the effects of S_setup_eval_state() */
10021
10022         if (eval_state->subbeg) {
10023             regexp * const rex = eval_state->rex;
10024             rex->subbeg     = eval_state->subbeg;
10025             rex->sublen     = eval_state->sublen;
10026             rex->suboffset  = eval_state->suboffset;
10027             rex->subcoffset = eval_state->subcoffset;
10028 #ifdef PERL_ANY_COW
10029             rex->saved_copy = eval_state->saved_copy;
10030 #endif
10031             RXp_MATCH_COPIED_on(rex);
10032         }
10033         if (eval_state->pos_magic)
10034         {
10035             eval_state->pos_magic->mg_len = eval_state->pos;
10036             eval_state->pos_magic->mg_flags =
10037                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10038                | (eval_state->pos_flags & MGf_BYTES);
10039         }
10040
10041         PL_curpm = eval_state->curpm;
10042     }
10043
10044     PL_regmatch_state = aux->old_regmatch_state;
10045     PL_regmatch_slab  = aux->old_regmatch_slab;
10046
10047     /* free all slabs above current one - this must be the last action
10048      * of this function, as aux and eval_state are allocated within
10049      * slabs and may be freed here */
10050
10051     s = PL_regmatch_slab->next;
10052     if (s) {
10053         PL_regmatch_slab->next = NULL;
10054         while (s) {
10055             regmatch_slab * const osl = s;
10056             s = s->next;
10057             Safefree(osl);
10058         }
10059     }
10060 }
10061
10062
10063 STATIC void
10064 S_to_utf8_substr(pTHX_ regexp *prog)
10065 {
10066     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10067      * on the converted value */
10068
10069     int i = 1;
10070
10071     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10072
10073     do {
10074         if (prog->substrs->data[i].substr
10075             && !prog->substrs->data[i].utf8_substr) {
10076             SV* const sv = newSVsv(prog->substrs->data[i].substr);
10077             prog->substrs->data[i].utf8_substr = sv;
10078             sv_utf8_upgrade(sv);
10079             if (SvVALID(prog->substrs->data[i].substr)) {
10080                 if (SvTAIL(prog->substrs->data[i].substr)) {
10081                     /* Trim the trailing \n that fbm_compile added last
10082                        time.  */
10083                     SvCUR_set(sv, SvCUR(sv) - 1);
10084                     /* Whilst this makes the SV technically "invalid" (as its
10085                        buffer is no longer followed by "\0") when fbm_compile()
10086                        adds the "\n" back, a "\0" is restored.  */
10087                     fbm_compile(sv, FBMcf_TAIL);
10088                 } else
10089                     fbm_compile(sv, 0);
10090             }
10091             if (prog->substrs->data[i].substr == prog->check_substr)
10092                 prog->check_utf8 = sv;
10093         }
10094     } while (i--);
10095 }
10096
10097 STATIC bool
10098 S_to_byte_substr(pTHX_ regexp *prog)
10099 {
10100     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10101      * on the converted value; returns FALSE if can't be converted. */
10102
10103     int i = 1;
10104
10105     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10106
10107     do {
10108         if (prog->substrs->data[i].utf8_substr
10109             && !prog->substrs->data[i].substr) {
10110             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10111             if (! sv_utf8_downgrade(sv, TRUE)) {
10112                 return FALSE;
10113             }
10114             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10115                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10116                     /* Trim the trailing \n that fbm_compile added last
10117                         time.  */
10118                     SvCUR_set(sv, SvCUR(sv) - 1);
10119                     fbm_compile(sv, FBMcf_TAIL);
10120                 } else
10121                     fbm_compile(sv, 0);
10122             }
10123             prog->substrs->data[i].substr = sv;
10124             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10125                 prog->check_substr = sv;
10126         }
10127     } while (i--);
10128
10129     return TRUE;
10130 }
10131
10132 #ifndef PERL_IN_XSUB_RE
10133
10134 bool
10135 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10136 {
10137     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10138      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10139      * the larger string bounded by 'strbeg' and 'strend'.
10140      *
10141      * 'cp' needs to be assigned (if not a future version of the Unicode
10142      * Standard could make it something that combines with adjacent characters,
10143      * so code using it would then break), and there has to be a GCB break
10144      * before and after the character. */
10145
10146     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10147     const U8 * prev_cp_start;
10148
10149     PERL_ARGS_ASSERT__IS_GRAPHEME;
10150
10151     /* Unassigned code points are forbidden */
10152     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10153                                     _invlist_search(PL_Assigned_invlist, cp))))
10154     {
10155         return FALSE;
10156     }
10157
10158     cp_gcb_val = getGCB_VAL_CP(cp);
10159
10160     /* Find the GCB value of the previous code point in the input */
10161     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10162     if (UNLIKELY(prev_cp_start == s)) {
10163         prev_cp_gcb_val = GCB_EDGE;
10164     }
10165     else {
10166         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10167     }
10168
10169     /* And check that is a grapheme boundary */
10170     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10171                 TRUE /* is UTF-8 encoded */ ))
10172     {
10173         return FALSE;
10174     }
10175
10176     /* Similarly verify there is a break between the current character and the
10177      * following one */
10178     s += UTF8SKIP(s);
10179     if (s >= strend) {
10180         next_cp_gcb_val = GCB_EDGE;
10181     }
10182     else {
10183         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10184     }
10185
10186     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10187 }
10188
10189 /*
10190 =head1 Unicode Support
10191
10192 =for apidoc isSCRIPT_RUN
10193
10194 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10195 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10196 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10197 two degenerate cases given below, this function returns TRUE iff all code
10198 points in it come from any combination of three "scripts" given by the Unicode
10199 "Script Extensions" property: Common, Inherited, and possibly one other.
10200 Additionally all decimal digits must come from the same consecutive sequence of
10201 10.
10202
10203 For example, if all the characters in the sequence are Greek, or Common, or
10204 Inherited, this function will return TRUE, provided any decimal digits in it
10205 are the ASCII digits "0".."9".  For scripts (unlike Greek) that have their own
10206 digits defined this will accept either digits from that set or from 0..9, but
10207 not a combination of the two.  Some scripts, such as Arabic, have more than one
10208 set of digits.  All digits must come from the same set for this function to
10209 return TRUE.
10210
10211 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10212 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10213 C<SCX_INVALID> if the function returns FALSE.
10214
10215 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10216 will be C<SCX_INVALID>.
10217
10218 If the sequence contains a single code point which is unassigned to a character
10219 in the version of Unicode being used, the function will return TRUE, and the
10220 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10221 in the input sequence will result in the function treating the input as not
10222 being a script run.
10223
10224 The returned script will be C<SCX_Inherited> iff all the code points in it are
10225 from the Inherited script.
10226
10227 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10228 it are from the Inherited or Common scripts.
10229
10230 =cut
10231
10232 */
10233
10234 bool
10235 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10236 {
10237     /* Basically, it looks at each character in the sequence to see if the
10238      * above conditions are met; if not it fails.  It uses an inversion map to
10239      * find the enum corresponding to the script of each character.  But this
10240      * is complicated by the fact that a few code points can be in any of
10241      * several scripts.  The data has been constructed so that there are
10242      * additional enum values (all negative) for these situations.  The
10243      * absolute value of those is an index into another table which contains
10244      * pointers to auxiliary tables for each such situation.  Each aux array
10245      * lists all the scripts for the given situation.  There is another,
10246      * parallel, table that gives the number of entries in each aux table.
10247      * These are all defined in charclass_invlists.h */
10248
10249     /* XXX Here are the additional things UTS 39 says could be done:
10250      * Mark Chinese strings as “mixed script” if they contain both simplified
10251      * (S) and traditional (T) Chinese characters, using the Unihan data in the
10252      * Unicode Character Database [UCD].  The criterion can only be applied if
10253      * the language of the string is known to be Chinese. So, for example, the
10254      * string “写真だけの結婚式 ” is Japanese, and should not be marked as
10255      * mixed script because of a mixture of S and T characters.  Testing for
10256      * whether a character is S or T needs to be based not on whether the
10257      * character has a S or T variant , but whether the character is an S or T
10258      * variant. khw notes that the sample contains a Hiragana character, and it
10259      * is unclear if absence of any foreign script marks the script as
10260      * "Chinese"
10261      *
10262      * Forbid sequences of the same nonspacing mark
10263      *
10264      * Check to see that all the characters are in the sets of exemplar
10265      * characters for at least one language in the Unicode Common Locale Data
10266      * Repository [CLDR]. */
10267
10268
10269     /* Things that match /\d/u */
10270     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10271     UV * decimals_array = invlist_array(decimals_invlist);
10272
10273     /* What code point is the digit '0' of the script run? */
10274     UV zero_of_run = 0;
10275     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10276     SCX_enum script_of_char = SCX_INVALID;
10277
10278     /* If the script remains not fully determined from iteration to iteration,
10279      * this is the current intersection of the possiblities.  */
10280     SCX_enum * intersection = NULL;
10281     PERL_UINT_FAST8_T intersection_len = 0;
10282
10283     bool retval = TRUE;
10284
10285     /* This is supposed to be a return parameter, but currently unused */
10286     SCX_enum * ret_script = NULL;
10287
10288     assert(send >= s);
10289
10290     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10291
10292     /* All code points in 0..255 are either Common or Latin, so must be a
10293      * script run.  We can special case it */
10294     if (! utf8_target && LIKELY(send > s)) {
10295         if (ret_script == NULL) {
10296             return TRUE;
10297         }
10298
10299         /* If any character is Latin, the run is Latin */
10300         while (s < send) {
10301             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10302                 *ret_script = SCX_Latin;
10303                 return TRUE;
10304             }
10305         }
10306
10307         /* If all are Common ... */
10308         *ret_script = SCX_Common;
10309         return TRUE;
10310     }
10311
10312     /* Look at each character in the sequence */
10313     while (s < send) {
10314         UV cp;
10315
10316         /* The code allows all scripts to use the ASCII digits.  This is
10317          * because they are used in commerce even in scripts that have their
10318          * own set.  Hence any ASCII ones found are ok, unless a digit from
10319          * another set has already been encountered.  (The other digit ranges
10320          * in Common are not similarly blessed) */
10321         if (UNLIKELY(isDIGIT(*s))) {
10322             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10323                 retval = FALSE;
10324                 break;
10325             }
10326             if (zero_of_run > 0) {
10327                 if (zero_of_run != '0') {
10328                     retval = FALSE;
10329                     break;
10330                 }
10331             }
10332             else {
10333                 zero_of_run = '0';
10334             }
10335             s++;
10336             continue;
10337         }
10338
10339         /* Here, isn't an ASCII digit.  Find the code point of the character */
10340         if (! UTF8_IS_INVARIANT(*s)) {
10341             Size_t len;
10342             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10343             s += len;
10344         }
10345         else {
10346             cp = *(s++);
10347         }
10348
10349         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10350          * digit in that script.  We can skip the rest of this code for this
10351          * character. */
10352         if (UNLIKELY(   zero_of_run > 0
10353                      && cp >= zero_of_run
10354                      && cp - zero_of_run <= 9))
10355         {
10356             continue;
10357         }
10358
10359         /* Find the character's script.  The correct values are hard-coded here
10360          * for small-enough code points. */
10361         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10362                                unlikely to change */
10363             if (       cp > 255
10364                 || (   isALPHA_L1(cp)
10365                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10366             {
10367                 script_of_char = SCX_Latin;
10368             }
10369             else {
10370                 script_of_char = SCX_Common;
10371             }
10372         }
10373         else {
10374             script_of_char = _Perl_SCX_invmap[
10375                                        _invlist_search(PL_SCX_invlist, cp)];
10376         }
10377
10378         /* We arbitrarily accept a single unassigned character, but not in
10379          * combination with anything else, and not a run of them. */
10380         if (   UNLIKELY(script_of_run == SCX_Unknown)
10381             || UNLIKELY(   script_of_run != SCX_INVALID
10382                         && script_of_char == SCX_Unknown))
10383         {
10384             retval = FALSE;
10385             break;
10386         }
10387
10388         /* For the first character, or the run is inherited, the run's script
10389          * is set to the char's */
10390         if (   UNLIKELY(script_of_run == SCX_INVALID)
10391             || UNLIKELY(script_of_run == SCX_Inherited))
10392         {
10393             script_of_run = script_of_char;
10394         }
10395
10396         /* For the character's script to be Unknown, it must be the first
10397          * character in the sequence (for otherwise a test above would have
10398          * prevented us from reaching here), and we have set the run's script
10399          * to it.  Nothing further to be done for this character */
10400         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10401             continue;
10402         }
10403
10404         /* We accept 'inherited' script characters currently even at the
10405          * beginning.  (We know that no characters in Inherited are digits, or
10406          * we'd have to check for that) */
10407         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10408             continue;
10409         }
10410
10411         /* If the run so far is Common, and the new character isn't, change the
10412          * run's script to that of this character */
10413         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10414
10415             /* But Common contains several sets of digits.  Only the '0' set
10416              * can be part of another script. */
10417             if (zero_of_run > 0 && zero_of_run != '0') {
10418                 retval = FALSE;
10419                 break;
10420             }
10421
10422             script_of_run = script_of_char;
10423         }
10424
10425         /* All decimal digits must be from the same sequence of 10.  Above, we
10426          * handled any ASCII digits without descending to here.  We also
10427          * handled the case where we already knew what digit sequence is the
10428          * one to use, and the character is in that sequence.  Now that we know
10429          * the script, we can use script_zeros[] to directly find which
10430          * sequence the script uses, except in a few cases it returns 0 */
10431         if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
10432             zero_of_run = script_zeros[script_of_char];
10433         }
10434
10435         /* Now we can see if the script of the character is the same as that of
10436          * the run */
10437         if (LIKELY(script_of_char == script_of_run)) {
10438             /* By far the most common case */
10439             goto scripts_match;
10440         }
10441
10442
10443         /* Here, the script of the run isn't Common.  But characters in Common
10444          * match any script */
10445         if (script_of_char == SCX_Common) {
10446             goto scripts_match;
10447         }
10448
10449 #ifndef HAS_SCX_AUX_TABLES
10450
10451         /* Too early a Unicode version to have a code point belonging to more
10452          * than one script, so, if the scripts don't exactly match, fail */
10453         PERL_UNUSED_VAR(intersection_len);
10454         retval = FALSE;
10455         break;
10456
10457 #else
10458
10459         /* Here there is no exact match between the character's script and the
10460          * run's.  And we've handled the special cases of scripts Unknown,
10461          * Inherited, and Common.
10462          *
10463          * Negative script numbers signify that the value may be any of several
10464          * scripts, and we need to look at auxiliary information to make our
10465          * deterimination.  But if both are non-negative, we can fail now */
10466         if (LIKELY(script_of_char >= 0)) {
10467             const SCX_enum * search_in;
10468             PERL_UINT_FAST8_T search_in_len;
10469             PERL_UINT_FAST8_T i;
10470
10471             if (LIKELY(script_of_run >= 0)) {
10472                 retval = FALSE;
10473                 break;
10474             }
10475
10476             /* Use the previously constructed set of possible scripts, if any.
10477              * */
10478             if (intersection) {
10479                 search_in = intersection;
10480                 search_in_len = intersection_len;
10481             }
10482             else {
10483                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10484                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10485             }
10486
10487             for (i = 0; i < search_in_len; i++) {
10488                 if (search_in[i] == script_of_char) {
10489                     script_of_run = script_of_char;
10490                     goto scripts_match;
10491                 }
10492             }
10493
10494             retval = FALSE;
10495             break;
10496         }
10497         else if (LIKELY(script_of_run >= 0)) {
10498             /* script of character could be one of several, but run is a single
10499              * script */
10500             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10501             const PERL_UINT_FAST8_T search_in_len
10502                                      = SCX_AUX_TABLE_lengths[-script_of_char];
10503             PERL_UINT_FAST8_T i;
10504
10505             for (i = 0; i < search_in_len; i++) {
10506                 if (search_in[i] == script_of_run) {
10507                     script_of_char = script_of_run;
10508                     goto scripts_match;
10509                 }
10510             }
10511
10512             retval = FALSE;
10513             break;
10514         }
10515         else {
10516             /* Both run and char could be in one of several scripts.  If the
10517              * intersection is empty, then this character isn't in this script
10518              * run.  Otherwise, we need to calculate the intersection to use
10519              * for future iterations of the loop, unless we are already at the
10520              * final character */
10521             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10522             const PERL_UINT_FAST8_T char_len
10523                                       = SCX_AUX_TABLE_lengths[-script_of_char];
10524             const SCX_enum * search_run;
10525             PERL_UINT_FAST8_T run_len;
10526
10527             SCX_enum * new_overlap = NULL;
10528             PERL_UINT_FAST8_T i, j;
10529
10530             if (intersection) {
10531                 search_run = intersection;
10532                 run_len = intersection_len;
10533             }
10534             else {
10535                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10536                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10537             }
10538
10539             intersection_len = 0;
10540
10541             for (i = 0; i < run_len; i++) {
10542                 for (j = 0; j < char_len; j++) {
10543                     if (search_run[i] == search_char[j]) {
10544
10545                         /* Here, the script at i,j matches.  That means this
10546                          * character is in the run.  But continue on to find
10547                          * the complete intersection, for the next loop
10548                          * iteration, and for the digit check after it.
10549                          *
10550                          * On the first found common script, we malloc space
10551                          * for the intersection list for the worst case of the
10552                          * intersection, which is the minimum of the number of
10553                          * scripts remaining in each set. */
10554                         if (intersection_len == 0) {
10555                             Newx(new_overlap,
10556                                  MIN(run_len - i, char_len - j),
10557                                  SCX_enum);
10558                         }
10559                         new_overlap[intersection_len++] = search_run[i];
10560                     }
10561                 }
10562             }
10563
10564             /* Here we've looked through everything.  If they have no scripts
10565              * in common, not a run */
10566             if (intersection_len == 0) {
10567                 retval = FALSE;
10568                 break;
10569             }
10570
10571             /* If there is only a single script in common, set to that.
10572              * Otherwise, use the intersection going forward */
10573             Safefree(intersection);
10574             if (intersection_len == 1) {
10575                 script_of_run = script_of_char = new_overlap[0];
10576                 Safefree(new_overlap);
10577             }
10578             else {
10579                 intersection = new_overlap;
10580             }
10581         }
10582
10583 #endif
10584
10585   scripts_match:
10586
10587         /* Here, the script of the character is compatible with that of the
10588          * run.  Either they match exactly, or one or both can be any of
10589          * several scripts, and the intersection is not empty.  If the
10590          * character is not a decimal digit, we are done with it.  Otherwise,
10591          * it could still fail if it is from a different set of 10 than seen
10592          * already (or we may not have seen any, and we need to set the
10593          * sequence).  If we have determined a single script and that script
10594          * only has one set of digits (almost all scripts are like that), then
10595          * this isn't a problem, as any digit must come from the same sequence.
10596          * The only scripts that have multiple sequences have been constructed
10597          * to be 0 in 'script_zeros[]'.
10598          *
10599          * Here we check if it is a digit. */
10600         if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
10601             && (   (          zero_of_run == 0
10602                     || (  (   script_of_char >= 0
10603                            && script_zeros[script_of_char] == 0)
10604                         ||    intersection))))
10605         {
10606             SSize_t range_zero_index;
10607             range_zero_index = _invlist_search(decimals_invlist, cp);
10608             if (   LIKELY(range_zero_index >= 0)
10609                 && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
10610             {
10611                 UV range_zero = decimals_array[range_zero_index];
10612                 if (zero_of_run) {
10613                     if (zero_of_run != range_zero) {
10614                         retval = FALSE;
10615                         break;
10616                     }
10617                 }
10618                 else {
10619                     zero_of_run = range_zero;
10620                 }
10621             }
10622         }
10623     } /* end of looping through CLOSESR text */
10624
10625     Safefree(intersection);
10626
10627     if (ret_script != NULL) {
10628         if (retval) {
10629             *ret_script = script_of_run;
10630         }
10631         else {
10632             *ret_script = SCX_INVALID;
10633         }
10634     }
10635
10636     return retval;
10637 }
10638
10639 #endif /* ifndef PERL_IN_XSUB_RE */
10640
10641 /*
10642  * ex: set ts=8 sts=4 sw=4 et:
10643  */